Skip to content

Commit

Permalink
[SARC-387] FIX temporaire aux stats GPU-util erronées (#140)
Browse files Browse the repository at this point in the history
Co-authored-by: Bruno Carrez <[email protected]>
  • Loading branch information
notoraptor and nurbal authored Nov 20, 2024
1 parent 6cd17ec commit 8a4ca09
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 1 deletion.
7 changes: 7 additions & 0 deletions sarc/jobs/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,13 @@ def load_job_series(
job_series = job.stored_statistics.dict()
job_series = {k: _select_stat(k, v) for k, v in job_series.items()}

# Replace any `gpu_utilization` value greater than 1 with nan.
if (
job.stored_statistics.gpu_utilization
and job_series["gpu_utilization"] > 1
):
job_series["gpu_utilization"] = np.nan

# Flatten job.requested and job.allocated into job_series
job_series.update(
{f"requested.{key}": value for key, value in job.requested.dict().items()}
Expand Down
63 changes: 62 additions & 1 deletion tests/functional/jobs/test_func_load_job_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas
import pytest

from sarc.client.job import get_jobs
from sarc.client.job import JobStatistics, Statistics, get_jobs
from sarc.client.users.api import get_users
from sarc.config import MTL
from sarc.jobs.series import load_job_series
Expand Down Expand Up @@ -290,6 +290,67 @@ def test_load_job_series_with_stored_statistics(monkeypatch):
assert all(not math.isnan(value) for value in re_frame[label])


@pytest.mark.usefixtures("read_write_db", "tzlocal_is_mtl")
def test_load_job_series_with_bad_gpu_utilization(file_regression):
    """Verify that the job series reports nan for any gpu_utilization above 1.

    The test proceeds in three steps:

    1. Confirm the starting state: no job has stored statistics, and the
       ``gpu_utilization`` column of the job series contains only nan.
    2. Save statistics on every job, with a ``gpu_utilization`` median that
       grows linearly from ``2 / n`` up to ``2`` (``n`` being the job count).
    3. Reload jobs and job series, then compare both against the regression
       file: jobs keep the stored medians, while the job series is expected
       to show nan wherever the median exceeds 1.
    """
    # Step 1: starting state, before any statistics are stored.
    initial_jobs = list(get_jobs())
    initial_frame = load_job_series()
    assert initial_jobs
    assert all(not job.stored_statistics for job in initial_jobs)
    assert all(map(math.isnan, initial_frame["gpu_utilization"]))

    # Step 2: store a hand-made gpu_utilization on each job.
    # Only the median varies; every other statistic is zeroed.
    zeroed_stats = dict.fromkeys(
        ("mean", "std", "q05", "q25", "q75", "max", "unused"), 0
    )
    job_count = len(initial_jobs)
    for rank, job in enumerate(initial_jobs, start=1):
        # Medians span (0, 2]: the upper half is > 1 and should show up
        # as nan in the job series.
        utilization = Statistics(median=2 * rank / job_count, **zeroed_stats)
        job.stored_statistics = JobStatistics(gpu_utilization=utilization)
        job.save()

    # Step 3: reload everything now that statistics are saved.
    re_jobs = list(get_jobs())
    re_frame = load_job_series()

    # Markdown table built straight from the job objects (raw medians).
    cluster_names = [job.cluster_name for job in re_jobs]
    job_ids = [job.job_id for job in re_jobs]
    medians = [job.stored_statistics.gpu_utilization.median for job in re_jobs]
    jobs_table = pandas.DataFrame(
        {
            "cluster_name": cluster_names,
            "job_id": job_ids,
            "gpu_utilization": medians,
        }
    )
    jobs_markdown = jobs_table.to_markdown()

    # Markdown table built from the job series (values after filtering).
    shown_columns = ["cluster_name", "job_id", "gpu_utilization"]
    series_markdown = re_frame[shown_columns].to_markdown()

    # The jobs table should list every stored value, whereas the series
    # table should list nan for each gpu_utilization greater than 1.
    report_template = (
        "gpu_utilization:\n"
        "================\n\n"
        "Jobs:\n"
        "{jobs}\n\n"
        "Job series:\n"
        "{series}\n"
    )
    file_regression.check(
        report_template.format(jobs=jobs_markdown, series=series_markdown)
    )


@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize("params", few_parameters.values(), ids=few_parameters.keys())
def test_load_job_series_fields_list(params, file_regression):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
gpu_utilization:
================

Jobs:
| | cluster_name | job_id | gpu_utilization |
|---:|:---------------|----------:|------------------:|
| 0 | raisin | 1 | 0.0833333 |
| 1 | raisin | 2 | 0.166667 |
| 2 | raisin | 3 | 0.25 |
| 3 | raisin | 4 | 0.333333 |
| 4 | raisin | 5 | 0.416667 |
| 5 | raisin | 6 | 0.5 |
| 6 | raisin | 7 | 0.583333 |
| 7 | raisin | 8 | 0.666667 |
| 8 | raisin | 9 | 0.75 |
| 9 | raisin | 10 | 0.833333 |
| 10 | raisin | 11 | 0.916667 |
| 11 | raisin | 12 | 1 |
| 12 | raisin | 13 | 1.08333 |
| 13 | raisin | 14 | 1.16667 |
| 14 | fromage | 15 | 1.25 |
| 15 | patate | 16 | 1.33333 |
| 16 | raisin | 17 | 1.41667 |
| 17 | raisin | 18 | 1.5 |
| 18 | mila | 19 | 1.58333 |
| 19 | raisin | 20 | 1.66667 |
| 20 | raisin | 1000000 | 1.75 |
| 21 | raisin | 1000000 | 1.83333 |
| 22 | raisin | 23 | 1.91667 |
| 23 | mila | 999999999 | 2 |

Job series:
| | cluster_name | job_id | gpu_utilization |
|---:|:---------------|----------:|------------------:|
| 0 | raisin | 1 | 0.0833333 |
| 1 | raisin | 2 | 0.166667 |
| 2 | raisin | 3 | 0.25 |
| 3 | raisin | 4 | 0.333333 |
| 4 | raisin | 5 | 0.416667 |
| 5 | raisin | 6 | 0.5 |
| 6 | raisin | 7 | 0.583333 |
| 7 | raisin | 8 | 0.666667 |
| 8 | raisin | 9 | 0.75 |
| 9 | raisin | 10 | 0.833333 |
| 10 | raisin | 11 | 0.916667 |
| 11 | raisin | 12 | 1 |
| 12 | raisin | 13 | nan |
| 13 | raisin | 14 | nan |
| 14 | fromage | 15 | nan |
| 15 | patate | 16 | nan |
| 16 | raisin | 17 | nan |
| 17 | raisin | 18 | nan |
| 18 | mila | 19 | nan |
| 19 | raisin | 20 | nan |
| 20 | raisin | 1000000 | nan |
| 21 | raisin | 1000000 | nan |
| 22 | raisin | 23 | nan |
| 23 | mila | 999999999 | nan |

0 comments on commit 8a4ca09

Please sign in to comment.