From 1fb796868bbe765421988e19e3a07d39ca1181dd Mon Sep 17 00:00:00 2001
From: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Date: Thu, 5 Dec 2024 13:26:32 +0100
Subject: [PATCH] redo logging (#415)

- revamped the logging with a config modifying every logger used in lighteval
- made accelerate a default requirement
- fixed some documentation
---
 README.md | 24 +-
 docs/source/installation.mdx | 1 -
 pyproject.toml | 5 +-
 src/lighteval/__main__.py | 26 +++
 src/lighteval/data.py | 9 +-
 src/lighteval/logging/evaluation_tracker.py | 24 +-
 src/lighteval/logging/hierarchical_logger.py | 166 --------------
 src/lighteval/logging/info_loggers.py | 9 +-
 src/lighteval/main_accelerate.py | 1 +
 src/lighteval/metrics/imports/bert_scorer.py | 14 +-
 .../metrics/imports/data_stats_metric.py | 7 +-
 src/lighteval/metrics/imports/summac.py | 5 +-
 src/lighteval/metrics/llm_as_judge.py | 4 +-
 src/lighteval/metrics/metrics_corpus.py | 7 +-
 src/lighteval/metrics/metrics_sample.py | 23 +-
 src/lighteval/metrics/sample_preparator.py | 6 +-
 src/lighteval/metrics/stderr.py | 8 +-
 .../metrics/utils/linguistic_tokenizers.py | 7 +-
 src/lighteval/models/adapter_model.py | 10 +-
 src/lighteval/models/base_model.py | 35 +--
 src/lighteval/models/delta_model.py | 11 +-
 src/lighteval/models/endpoint_model.py | 32 +--
 src/lighteval/models/model_config.py | 10 +-
 src/lighteval/models/model_loader.py | 9 +-
 src/lighteval/models/nanotron_model.py | 9 +-
 src/lighteval/models/openai_model.py | 7 +-
 src/lighteval/models/vllm_model.py | 9 +-
 src/lighteval/pipeline.py | 210 +++++++++---------
 src/lighteval/tasks/default_prompts.py | 11 +-
 src/lighteval/tasks/lighteval_task.py | 10 +-
 src/lighteval/tasks/prompt_manager.py | 7 +-
 src/lighteval/tasks/registry.py | 20 +-
 src/lighteval/utils/parallelism.py | 13 +-
 33 files changed, 345 insertions(+), 404 deletions(-)
 delete mode 100644 src/lighteval/logging/hierarchical_logger.py

diff --git a/README.md b/README.md
index f554ed17..0ee4010c 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ Hub, S3, or locally.
 ## ⚡️ Installation
 
 ```bash
-pip install lighteval[accelerate]
+pip install lighteval
 ```
 
 Lighteval allows for many extras when installing, see [here](https://github.com/huggingface/lighteval/wiki/Installation) for a complete list.
@@ -71,20 +71,24 @@ huggingface-cli login
 
 Lighteval offers two main entry points for model evaluation:
 
-
-* `lighteval accelerate`: evaluate models on CPU or one or more GPUs using [🤗
-  Accelerate](https://github.com/huggingface/accelerate).
-* `lighteval nanotron`: evaluate models in distributed settings using [⚡️
-  Nanotron](https://github.com/huggingface/nanotron).
+- `lighteval accelerate` : evaluate models on CPU or one or more GPUs using [🤗
+  Accelerate](https://github.com/huggingface/accelerate)
+- `lighteval nanotron`: evaluate models in distributed settings using [⚡️
+  Nanotron](https://github.com/huggingface/nanotron)
+- `lighteval vllm`: evaluate models on one or more GPUs using [🚀
+  VLLM](https://github.com/vllm-project/vllm)
+- `lighteval endpoint`
+    - `inference-endpoint`: evaluate models on one or more GPUs using [🔗
+      Inference Endpoint](https://huggingface.co/inference-endpoints/dedicated)
+    - `tgi`: evaluate models on one or more GPUs using [🔗 Text Generation Inference](https://huggingface.co/docs/text-generation-inference/en/index)
+    - `openai`: evaluate models on one or more GPUs using [🔗 OpenAI API](https://platform.openai.com/)
 
 Here’s a quick command to evaluate using the Accelerate backend:
 
 ```shell
 lighteval accelerate \
-    --model_args "pretrained=gpt2" \
-    --tasks "leaderboard|truthfulqa:mc|0|0" \
-    --override_batch_size 1 \
-    --output_dir="./evals/"
+    "pretrained=gpt2" \
+    "leaderboard|truthfulqa:mc|0|0"
 ```
 
 ## 🙏 Acknowledgements

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 39ac2b89..542c0975 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -25,7 +25,6 @@ appropriate extras group.
 
 | extra name | description |
 |--------------|---------------------------------------------------------------------------|
-| accelerate | To use accelerate for model and data parallelism with transformers models |
 | tgi | To use Text Generation Inference API to evaluate your model |
 | nanotron | To evaluate nanotron models |
 | quantization | To evaluate quantized models |

diff --git a/pyproject.toml b/pyproject.toml
index 1a99b6a6..9a4d3a3c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,6 +55,7 @@ keywords = ["evaluation", "nlp", "llm"]
 dependencies = [
     # Base dependencies
     "transformers>=4.38.0",
+    "accelerate",
     "huggingface_hub>=0.23.0",
     "torch>=2.0,<2.5",
     "GitPython>=3.1.41", # for logging
@@ -64,7 +65,8 @@ dependencies = [
     "typer",
     "termcolor==2.3.0",
     "pytablewriter",
-    "colorama",
+    "rich",
+    "colorlog",
     # Extension of metrics
     "aenum==3.1.15",
     # Base metrics
@@ -80,7 +82,6 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-accelerate = ["accelerate"]
 tgi = ["text-generation==0.6.0"]
 optimum = ["optimum==1.12.0"]
 quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]

diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py
index c715723d..4484f781 100644
--- a/src/lighteval/__main__.py
+++ b/src/lighteval/__main__.py
@@ -19,7 +19,10 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
+import logging +from logging.config import dictConfig +import colorlog import typer import lighteval.main_accelerate @@ -32,6 +35,29 @@ app = typer.Typer() +logging_config = dict( # noqa C408 + version=1, + formatters={ + "c": { + "()": colorlog.ColoredFormatter, + "format": "[%(asctime)s] [%(log_color)s%(levelname)8s%(reset)s]: %(message)s (%(filename)s:%(lineno)s)", + "log_colors": { + "DEBUG": "cyan", + "INFO": "green", + "WARNING": "yellow", + "ERROR": "red", + "CRITICAL": "red,bg_white", + }, + }, + }, + handlers={"h": {"class": "logging.StreamHandler", "formatter": "c", "level": logging.INFO}}, + root={ + "handlers": ["h"], + "level": logging.INFO, + }, +) + +dictConfig(logging_config) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_accelerate.accelerate) app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 74dedf22..7cb105e6 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging import math from typing import Iterator, Tuple @@ -27,7 +28,6 @@ from torch.utils.data import Dataset from torch.utils.data.distributed import DistributedSampler, T_co -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.tasks.requests import ( GreedyUntilRequest, LoglikelihoodRequest, @@ -37,6 +37,9 @@ ) +logger = logging.getLogger(__name__) + + class DynamicBatchDataset(Dataset): def __init__( self, @@ -76,7 +79,7 @@ def __init__( def init_split_limits(self, num_dataset_splits): if num_dataset_splits >= self.total_size: - hlog_warn( + logger.warning( f"num_dataset_splits ({num_dataset_splits}) >= total_size ({self.total_size}), setting num_dataset_splits to 1" ) num_dataset_splits = 1 @@ -247,7 +250,7 @@ def init_split_limits(self, num_dataset_splits): _type_: _description_ """ if num_dataset_splits is not None: - hlog_warn( + logger.warning( "You cannot select the number of dataset splits for a generative evaluation at the moment. Automatically inferring." 
) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 58ae7410..01705534 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -22,6 +22,7 @@ import copy import json +import logging import os import re import time @@ -37,7 +38,6 @@ from fsspec import url_to_fs from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HFSummaryWriter, hf_hub_url -from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.logging.info_loggers import ( DetailsLogger, GeneralConfigLogger, @@ -49,6 +49,8 @@ from lighteval.utils.utils import obj_to_markdown +logger = logging.getLogger(__name__) + if is_nanotron_available(): from nanotron.config import GeneralArgs # type: ignore @@ -147,7 +149,7 @@ def __init__( def save(self) -> None: """Saves the experiment information and results to files, and to the hub if requested.""" - hlog("Saving experiment tracker") + logger.info("Saving experiment tracker") date_id = datetime.now().isoformat().replace(":", "-") # We first prepare data to save @@ -202,7 +204,7 @@ def save_results(self, date_id: str, results_dict: dict): output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name self.fs.mkdirs(output_dir_results, exist_ok=True) output_results_file = output_dir_results / f"results_{date_id}.json" - hlog(f"Saving results to {output_results_file}") + logger.info(f"Saving results to {output_results_file}") with self.fs.open(output_results_file, "w") as f: f.write(json.dumps(results_dict, cls=EnhancedJSONEncoder, indent=2, ensure_ascii=False)) @@ -210,7 +212,7 @@ def save_details(self, date_id: str, details_datasets: dict[str, Dataset]): output_dir_details = Path(self.output_dir) / "details" / self.general_config_logger.model_name output_dir_details_sub_folder = output_dir_details / date_id self.fs.mkdirs(output_dir_details_sub_folder, exist_ok=True) - hlog(f"Saving details to {output_dir_details_sub_folder}") + logger.info(f"Saving details to {output_dir_details_sub_folder}") for task_name, dataset in details_datasets.items(): output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet" with self.fs.open(str(output_file_details), "wb") as f: @@ -255,7 +257,7 @@ def push_to_hub( if not self.api.repo_exists(repo_id): self.api.create_repo(repo_id, private=not (self.public), repo_type="dataset", exist_ok=True) - hlog(f"Repository {repo_id} not found, creating it.") + logger.info(f"Repository {repo_id} not found, creating it.") # We upload it both as a json and a parquet file result_file_base_name = f"results_{date_id}" @@ -490,11 +492,11 @@ def push_to_tensorboard( # noqa: C901 self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail] ): if not is_tensorboardX_available: - hlog_warn(NO_TENSORBOARDX_WARN_MSG) + logger.warning(NO_TENSORBOARDX_WARN_MSG) return if not is_nanotron_available(): - hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping") + logger.warning("You cannot push results to tensorboard without having nanotron installed. Skipping") return prefix = self.tensorboard_metric_prefix @@ -526,14 +528,14 @@ def push_to_tensorboard( # noqa: C901 bench_suite = None if ":" in task_name: bench_suite = task_name.split(":")[0] # e.g. 
MMLU - hlog(f"bench_suite {bench_suite} in {task_name}") + logger.info(f"bench_suite {bench_suite} in {task_name}") for metric, value in values.items(): if "stderr" in metric: continue if bench_suite not in bench_averages: bench_averages[bench_suite] = {} bench_averages[bench_suite][metric] = bench_averages[bench_suite].get(metric, []) + [float(value)] - hlog(f"Pushing {task_name} {values} to tensorboard") + logger.info(f"Pushing {task_name} {values} to tensorboard") for metric, value in values.items(): if "stderr" in metric: tb_context.add_scalar(f"stderr_{prefix}/{task_name}/{metric}", value, global_step=global_step) @@ -546,7 +548,7 @@ def push_to_tensorboard( # noqa: C901 # Tasks with subtasks for name, values in bench_averages.items(): for metric, values in values.items(): - hlog(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard") + logger.info(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard") tb_context.add_scalar(f"{prefix}/{name}/{metric}", sum(values) / len(values), global_step=global_step) tb_context.add_text("eval_config", obj_to_markdown(results), global_step=global_step) @@ -571,7 +573,7 @@ def push_to_tensorboard( # noqa: C901 # Now we can push to the hub tb_context.scheduler.trigger() - hlog( + logger.info( f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_repo}/{output_dir_tb}/tensorboard" f" at global_step {global_step}" ) diff --git a/src/lighteval/logging/hierarchical_logger.py b/src/lighteval/logging/hierarchical_logger.py deleted file mode 100644 index ac8d59d8..00000000 --- a/src/lighteval/logging/hierarchical_logger.py +++ /dev/null @@ -1,166 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import sys -import time -from datetime import timedelta -from logging import Logger -from typing import Any, Callable - -from colorama import Fore, Style - - -logger = Logger(__name__, level="INFO") - - -class HierarchicalLogger: - """ - Tracks the execution flow of the code as blocks, along with how long we're spending in each block. - Should not be called on its own, use [`hlog`], [`hlog_warn`] and [`hlog_err`] to log things, and - [`htrack_block`] to start a block level log step, or [`htrack`] to start a function level log step. 
- """ - - def __init__(self) -> None: - self.start_times: list[float] = [] - - def indent(self) -> str: - """Manages the block level text indentation for nested blocks""" - return " " * len(self.start_times) - - def track_begin(self, x: Any) -> None: - """Starts a block level tracker, stores the step begin time""" - logger.warning(f"{self.indent()}{str(x)} \u007b") # \u007b is { - sys.stdout.flush() - self.start_times.append(time.time()) - - def track_end(self) -> None: - """Ends a block level tracker, prints the elapsed time for the associated step""" - duration = time.time() - self.start_times.pop() - logger.warning(f"{self.indent()}\u007d [{str(timedelta(seconds=duration))}]") # \u007d is } - sys.stdout.flush() - - def log(self, x: Any) -> None: - logger.warning(self.indent() + str(x)) - sys.stdout.flush() - - -HIERARCHICAL_LOGGER = HierarchicalLogger() -BACKUP_LOGGER = Logger(__name__, level="INFO") - - -# Exposed public methods -def hlog(x: Any) -> None: - """Info logger. - - Logs a string version of x through the singleton [`HierarchicalLogger`]. - """ - try: - HIERARCHICAL_LOGGER.log(x) - except RuntimeError: - BACKUP_LOGGER.warning(x) - - -def hlog_warn(x: Any) -> None: - """Warning logger. - - Logs a string version of x, which will appear in a yellow color, through the singleton [`HierarchicalLogger`]. - """ - try: - HIERARCHICAL_LOGGER.log(Fore.YELLOW + str(x) + Style.RESET_ALL) - except RuntimeError: - BACKUP_LOGGER.warning(Fore.YELLOW + str(x) + Style.RESET_ALL) - - -def hlog_err(x: Any) -> None: - """Error logger. - - Logs a string version of x, which will appear in a red color, through the singleton [`HierarchicalLogger`]. - """ - try: - HIERARCHICAL_LOGGER.log(Fore.RED + str(x) + Style.RESET_ALL) - except RuntimeError: - BACKUP_LOGGER.warning(Fore.RED + str(x) + Style.RESET_ALL) - - -class htrack_block: - """ - Block annotator: hierarchical logging block, which encapsulate the current step's logs and duration. - - Usage: - with htrack_block('Step'): - hlog('current logs') - - Output: - Step { - current logs - } [0s] - """ - - def __init__(self, x: Any) -> None: - self.x = x - - def __enter__(self) -> None: - HIERARCHICAL_LOGGER.track_begin(self.x) - - def __exit__(self, tpe: Any, value: Any, callback: Any) -> None: - HIERARCHICAL_LOGGER.track_end() - - -class htrack: - """ - Function annotator: prints called function parameters, then opens an hierarchical [`htrack_block`] - which encapsulate the current step's logs and duration. - - Usage: - @htrack() - def function(args): - with htrack_block('Step'): - hlog('current logs') - - Output: - function: args, { - Step { - current logs - } [0s] - } - """ - - def __call__(self, fn: Callable) -> Any: - def wrapper(*args, **kwargs): # type:ignore - # Parent name to prepend - if len(args) > 0 and hasattr(args[0], fn.__name__): - parent = type(args[0]).__name__ + "." - else: - parent = "" - - args_list = "" - if len(args) > 0 or len(kwargs) > 0: - args_list = ": " - for v in enumerate(args): - args_list += f"{str(v)}, " - for k, v in kwargs.items(): - args_list += f"{str(k)}: {str(v)}, " - - with htrack_block(parent + fn.__name__ + args_list): - return fn(*args, **kwargs) - - return wrapper diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index b96a875d..46d3ab5c 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -21,6 +21,7 @@ # SOFTWARE. 
import collections +import logging import os import time from dataclasses import asdict, dataclass, field @@ -30,7 +31,6 @@ import numpy as np import xxhash -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.metrics import MetricCategory from lighteval.metrics.stderr import get_stderr_function from lighteval.models.abstract_model import ModelInfo @@ -41,6 +41,9 @@ from lighteval.utils.utils import as_list, sanitize_numpy +logger = logging.getLogger(__name__) + + if is_nanotron_available(): from nanotron.config import Config @@ -507,7 +510,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = try: metric_result = task.aggregation()[metric_name](metric_values) except OverflowError: - hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when aggregating.") + logger.warning(f"{task_name}, {metric_name} got an OVERFLOW ERROR when aggregating.") metric_result = float("nan") except KeyError: continue @@ -529,7 +532,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = except OverflowError: # Is this need or should we just pass? self.metric_aggregated[task_name][f"{metric_name}_stderr"] = float("nan") - hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when computing stderr.") + logger.warning(f"{task_name}, {metric_name} got an OVERFLOW ERROR when computing stderr.") # We group subtasks which belong to the same parent task, like MMLU, to compute an average on them # and compute an average of all metrics diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index f6ed6b38..e7d18c80 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -114,6 +114,7 @@ def accelerate( # noqa C901 cache_dir = CACHE_DIR env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + evaluation_tracker = EvaluationTracker( output_dir=output_dir, save_details=save_details, diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py index 347dcc02..1012bc3f 100644 --- a/src/lighteval/metrics/imports/bert_scorer.py +++ b/src/lighteval/metrics/imports/bert_scorer.py @@ -22,6 +22,7 @@ # SOFTWARE. 
"""Simplified version of the BertScorer lib - we only import what we need.""" +import logging import os import time from collections import defaultdict @@ -31,7 +32,8 @@ from torch.nn.utils.rnn import pad_sequence from transformers import AutoModel, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog, hlog_warn + +logger = logging.getLogger(__name__) def padding(arr, pad_token, dtype=torch.long): @@ -218,14 +220,14 @@ def greedy_cos_idf( F = F.view(L, B) if torch.any(hyp_zero_mask): - hlog_warn( + logger.warning( "Warning: Empty candidate sentence detected; setting raw BERTscores to 0.", ) P = P.masked_fill(hyp_zero_mask, 0.0) R = R.masked_fill(hyp_zero_mask, 0.0) if torch.any(ref_zero_mask): - hlog_warn("Warning: Empty reference sentence detected; setting raw BERTScores to 0.") + logger.warning("Empty reference sentence detected; setting raw BERTScores to 0.") P = P.masked_fill(ref_zero_mask, 0.0) R = R.masked_fill(ref_zero_mask, 0.0) @@ -441,7 +443,7 @@ def score(self, cands, refs, verbose=False, batch_size=64, return_hash=False): """ if self._model is None: - hlog(f"Loading BERTScorer model `{self._model_type}`") + logger.info(f"Loading BERTScorer model `{self._model_type}`") self._tokenizer = AutoTokenizer.from_pretrained(self._model_type) self._model = AutoModel.from_pretrained(self._model_type) self._model.eval() @@ -460,7 +462,7 @@ def score(self, cands, refs, verbose=False, batch_size=64, return_hash=False): count += len(ref_group) if verbose: - hlog("calculating scores...") + logger.info("calculating scores...") start = time.perf_counter() if self.idf: @@ -496,6 +498,6 @@ def score(self, cands, refs, verbose=False, batch_size=64, return_hash=False): if verbose: time_diff = time.perf_counter() - start - hlog(f"done in {time_diff:.2f} seconds, {len(refs) / time_diff:.2f} sentences/sec") + logger.info(f"done in {time_diff:.2f} seconds, {len(refs) / time_diff:.2f} sentences/sec") return out diff --git a/src/lighteval/metrics/imports/data_stats_metric.py b/src/lighteval/metrics/imports/data_stats_metric.py index 9982e5da..119e6cad 100644 --- a/src/lighteval/metrics/imports/data_stats_metric.py +++ b/src/lighteval/metrics/imports/data_stats_metric.py @@ -24,15 +24,18 @@ # pylint: disable=C0103,W0221,W0106 # Replace summ_eval.data_stats_metric +import logging from collections import Counter from multiprocessing import Pool import spacy -from lighteval.logging.hierarchical_logger import hlog from lighteval.metrics.imports.data_stats_utils import Fragments +logger = logging.getLogger(__name__) + + _en = None @@ -78,7 +81,7 @@ def __init__(self, n_gram=3, n_workers=24, case=False, tokenize=True): try: _en = spacy.load("en_core_web_sm") except OSError: - hlog("Downloading the spacy en_core_web_sm model\n" "(don't worry, this will only happen once)") + logger.info("Downloading the spacy en_core_web_sm model\n(don't worry, this will only happen once)") from spacy.cli import download download("en_core_web_sm") diff --git a/src/lighteval/metrics/imports/summac.py b/src/lighteval/metrics/imports/summac.py index 2803ba59..e64dab86 100644 --- a/src/lighteval/metrics/imports/summac.py +++ b/src/lighteval/metrics/imports/summac.py @@ -4,6 +4,7 @@ ############################################### import json +import logging import os import time @@ -13,8 +14,8 @@ import tqdm from transformers import AutoModelForSequenceClassification, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog +logger = logging.getLogger(__name__) # GPU-related business @@ -40,7 +41,7 @@ def 
wait_free_gpu(gb_needed): def select_freer_gpu(): freer_gpu = str(get_freer_gpu()) - hlog("Will use GPU: %s" % (freer_gpu)) + logger.info("Will use GPU: %s" % (freer_gpu)) os.environ["CUDA_LAUNCH_BLOCKING"] = "1" os.environ["CUDA_VISIBLE_DEVICES"] = "" + freer_gpu return freer_gpu diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index a8f363d4..a4b5dfb1 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -28,12 +28,12 @@ from tqdm import tqdm -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.utils.imports import is_openai_available, is_vllm_available logging.getLogger("openai").setLevel(logging.ERROR) logging.getLogger("httpx").setLevel(logging.ERROR) +logger = logging.getLogger(__name__) class JudgeLM: @@ -211,6 +211,6 @@ def __call_api(self, prompt): text = response.choices[0].message.content return text except Exception as e: - hlog_warn(f"{type(e), e}") + logger.warning(f"{type(e), e}") time.sleep(self.API_RETRY_SLEEP) raise Exception("Failed to get response from the API") diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 1286ab08..03b1b2c5 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -24,13 +24,13 @@ Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus. A number of these aggregations come from the EleutherAIHarness """ +import logging import math import numpy as np import sacrebleu import sklearn.metrics -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.metrics.sample_preparator import ( GenerativeCorpusMetricInput, LogprobCorpusMetricInput, @@ -39,6 +39,9 @@ from lighteval.utils.utils import as_list +logger = logging.getLogger(__name__) + + # General aggregations def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float: """Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)). @@ -108,7 +111,7 @@ def compute(self, items: list[GenerativeCorpusMetricInput]) -> float: for i in items: pred = as_list(i.preds) if len(pred) > 1: - hlog_warn( + logger.info( f"Multiple predictions present, keeping only the first prediction (when computing sacrebleu.{self.metric.__name__})." ) preds.append(pred[0]) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 4fdc4293..2081b560 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -24,6 +24,7 @@ using simple function (min, mean, max, ...) at the corpus level. Most metrics fall under this category. 
""" +import logging import os from typing import Callable, Literal @@ -34,10 +35,8 @@ from nltk.tokenize import word_tokenize from nltk.tokenize.treebank import TreebankWordTokenizer from nltk.translate.bleu_score import sentence_bleu -from rouge_score import rouge_scorer, scoring from transformers import AutoModelForSequenceClassification, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.imports.data_stats_metric import DataStatsMetric from lighteval.metrics.imports.summac import SummaCZS @@ -53,6 +52,9 @@ from lighteval.utils.utils import as_list, safe_divide +logger = logging.getLogger(__name__) + + class ExactMatches: def __init__( self, @@ -464,7 +466,7 @@ def __init__( default tokenizer will be used. """ if aggregation_function and bootstrap: - hlog_warn("Can't use both bootstrapping and an aggregation function in Rouge. Keeping bootstrap.") + logger.warning("Can't use both bootstrapping and an aggregation function in Rouge. Keeping bootstrap.") self.aggregation_function = aggregation_function if self.aggregation_function is None: self.aggregation_function = np.mean @@ -474,11 +476,11 @@ def __init__( raise ValueError( f"Rouge was initialised with method {methods}, which is not in {','.join(self.ALLOWED_ROUGE_METHODS)}" ) - self.scorer = rouge_scorer.RougeScorer([methods], tokenizer=tokenizer) self.multiple_golds = multiple_golds self.bootstrap = bootstrap self.normalize_gold = normalize_gold self.normalize_pred = normalize_pred + self.tokenizer = tokenizer def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float | dict: """Computes the metric(s) over a list of golds and predictions for one single sample. @@ -491,6 +493,11 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float | float or dict: Aggregated score over the current sample's items. If several rouge functions have been selected, returns a dict which maps name and scores. """ + from rouge_score import rouge_scorer + + if self.scorer is None: + self.scorer = rouge_scorer.RougeScorer(self.methods, tokenizer=self.tokenizer) + # Normalize if self.normalize_gold: golds = [self.normalize_gold(g) for g in golds] @@ -527,6 +534,8 @@ def _rouge_score_multi_golds(self, golds: list[str], preds: list[str]): return {method: self.aggregation_function(scores[method]) for method in self.methods} def _rouge_score_with_bootsrap(self, golds: list[str], preds: list[str]): + from rouge_score import scoring + aggregator = scoring.BootstrapAggregator() for g, p in zip(golds, preds): aggregator.add_scores(self.scorer.score(g, p)) @@ -575,7 +584,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict: dict: Scores over the current sample's items. """ if self.bert_scorer is None: - hlog_warn("The first metric computation step might be a bit longer as we need to download the model.") + logger.warning("The first metric computation step might be a bit longer as we need to download the model.") # We only initialize on first compute self.bert_scorer = BERTScorer( model_type="microsoft/deberta-large-mnli", lang="en", rescale_with_baseline=True, num_layers=9 @@ -787,7 +796,9 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict: dict: The different scores computed """ if len(golds) > 1: - hlog_warn("Provided more than one gold to compute a string distance metric. 
Just using the first one.") + logger.warning( + "Provided more than one gold to compute a string distance metric. Just using the first one." + ) reference = golds[0] result = {m: [] for m in self.metric_types} diff --git a/src/lighteval/metrics/sample_preparator.py b/src/lighteval/metrics/sample_preparator.py index dc32d95c..4fafa509 100644 --- a/src/lighteval/metrics/sample_preparator.py +++ b/src/lighteval/metrics/sample_preparator.py @@ -20,12 +20,14 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging import re from dataclasses import asdict, dataclass import numpy as np -from lighteval.logging.hierarchical_logger import hlog_warn + +logger = logging.getLogger(__name__) @dataclass @@ -92,7 +94,7 @@ def prepare(self, gold_ixs: list[int], choices_logprob: list[float], **kwargs) - """ if self.is_single_token: if len(gold_ixs) > 1: - hlog_warn( + logger.warning( "The current sample has more than one gold available, which is unexpected. We selected only the first one for the corpus aggregation of the loglikelihood metric." ) return LogprobCorpusMetricInput(golds=gold_ixs[0], preds=np.argmax(choices_logprob)) diff --git a/src/lighteval/metrics/stderr.py b/src/lighteval/metrics/stderr.py index 388521a6..751e29e0 100644 --- a/src/lighteval/metrics/stderr.py +++ b/src/lighteval/metrics/stderr.py @@ -24,6 +24,7 @@ # We kept it because it's very fast - however, we renamed the variables # and added documentation +import logging import math import random from typing import Callable, Optional @@ -32,7 +33,8 @@ from scipy.stats import bootstrap from tqdm import tqdm -from lighteval.logging.hierarchical_logger import hlog + +logger = logging.getLogger(__name__) def _stddev(arr): @@ -78,7 +80,7 @@ def bootstrap_stderr(metric: Callable, population: list, number_experiments: int number_draws = min(1000, number_experiments) number_seeds = number_experiments // number_draws - hlog(f"Bootstrapping {metric.__name__}'s stderr with {number_seeds} seeds.") + logger.info(f"Bootstrapping {metric.__name__}'s stderr with {number_seeds} seeds.") for seed in range(number_seeds): # sample w replacement res.extend(_bootstrap_internal(metric=metric, number_draws=number_draws)((population, seed))) @@ -106,7 +108,7 @@ def bootstrap_stderr_scipy(metric: Callable, population: list, number_experiment Same as bootstrap_stderr, but uses scipy. It's kept for archive, as it overflows for big datasets """ - hlog(f"Bootstrapping {metric.__name__}'s stderr.") + logger.info(f"Bootstrapping {metric.__name__}'s stderr.") res = bootstrap( data=[population], statistic=metric, diff --git a/src/lighteval/metrics/utils/linguistic_tokenizers.py b/src/lighteval/metrics/utils/linguistic_tokenizers.py index 3bc84eab..e0dd9ef1 100644 --- a/src/lighteval/metrics/utils/linguistic_tokenizers.py +++ b/src/lighteval/metrics/utils/linguistic_tokenizers.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import logging from abc import ABC, abstractmethod from functools import lru_cache from typing import Callable, Iterator -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.utils.imports import ( NO_SPACY_TOKENIZER_ERROR_MSG, NO_STANZA_TOKENIZER_ERROR_MSG, @@ -26,6 +26,9 @@ from lighteval.utils.language import Language +logger = logging.getLogger(__name__) + + # Copy of https://github.com/huggingface/datatrove/blob/main/src/datatrove/utils/tokenization.py def strip_strings(els: list[str]) -> list[str]: return [el.strip() for el in els if len(el.strip()) > 0] @@ -270,6 +273,6 @@ def span_tokenize(self, text: str) -> list[tuple[int, int]]: def get_word_tokenizer(language: Language) -> WordTokenizer: tokenizer = TOKENIZER_FACTORY.get(language) if tokenizer is None: - hlog_warn(f"No word tokenizer found for language {language}, will split on spaces.") + logger.warning(f"No word tokenizer found for language {language}, will split on spaces.") return WhitespaceTokenizer() return tokenizer() diff --git a/src/lighteval/models/adapter_model.py b/src/lighteval/models/adapter_model.py index dbf762d7..24de80f4 100644 --- a/src/lighteval/models/adapter_model.py +++ b/src/lighteval/models/adapter_model.py @@ -20,12 +20,12 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging from contextlib import nullcontext import torch from transformers import AutoModelForCausalLM, PreTrainedTokenizer -from lighteval.logging.hierarchical_logger import hlog from lighteval.models.base_model import BaseModel from lighteval.models.model_config import AdapterModelConfig from lighteval.models.utils import _get_dtype @@ -33,6 +33,8 @@ from lighteval.utils.utils import EnvConfig +logger = logging.getLogger(__name__) + if is_peft_available(): from peft import PeftModel @@ -60,7 +62,7 @@ def _create_auto_model(self, config: AdapterModelConfig, env_config: EnvConfig) merged_path = f"{adapter_weights}-adapter-applied" if self.accelerator.is_local_main_process if self.accelerator is not None else nullcontext(): - hlog(f"Loading model from {adapter_weights} and applying adapter to {config.base_model}") + logger.info(f"Loading model from {adapter_weights} and applying adapter to {config.base_model}") base = AutoModelForCausalLM.from_pretrained( config.base_model, torch_dtype=torch.float16, low_cpu_mem_usage=True, token=env_config.token ) @@ -68,10 +70,10 @@ def _create_auto_model(self, config: AdapterModelConfig, env_config: EnvConfig) model = PeftModel.from_pretrained(base, adapter_weights) model = model.merge_and_unload() - hlog("Saving model with adapter applied") + logger.info("Saving model with adapter applied") base.save_pretrained(merged_path) - hlog(f"Loading model from {merged_path}") + logger.info(f"Loading model from {merged_path}") model = AutoModelForCausalLM.from_pretrained( merged_path, diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 993978d5..fedc56a5 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import logging import os from typing import Optional, Tuple, Union @@ -33,7 +34,6 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset, LoglikelihoodSingleTokenDataset -from lighteval.logging.hierarchical_logger import hlog, hlog_err, hlog_warn from lighteval.models.abstract_model import LightevalModel, ModelInfo from lighteval.models.model_config import BaseModelConfig from lighteval.models.model_output import ( @@ -57,6 +57,9 @@ from lighteval.utils.utils import EnvConfig, as_list +logger = logging.getLogger(__name__) + + if is_accelerate_available(): from accelerate import Accelerator from accelerate.utils import calculate_maximum_sizes, convert_bytes, get_max_memory @@ -91,10 +94,10 @@ def __init__( # We are in DP (and launch the script with `accelerate launch`) if not config.model_parallel and not isinstance(config.quantization_config, BitsAndBytesConfig): - hlog(f"Using Data Parallelism, putting model on device {self._device}") + logger.info(f"Using Data Parallelism, putting model on device {self._device}") self.model = self.model.to(self._device) if config.compile: - hlog("Compiling the model") + logger.info("Compiling the model") self.model.model.compile() self.model_name = _simplify_name(config.pretrained) @@ -202,7 +205,7 @@ def init_model_parallel(self, model_parallel: bool | None = None) -> Tuple[bool, self.num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) self.num_machines = int(os.environ.get("WORLD_SIZE", 0)) // self.num_local_processes if self.num_machines == 0: - hlog("We are not in a distributed setting. Setting model_parallel to False.") + logger.info("We are not in a distributed setting. Setting model_parallel to False.") model_parallel = False if model_parallel is None: @@ -210,7 +213,7 @@ def init_model_parallel(self, model_parallel: bool | None = None) -> Tuple[bool, if "cpu" in max_memory_all_gpus: del max_memory_all_gpus["cpu"] model_parallel = bool(self.num_local_processes < len(max_memory_all_gpus)) - hlog( + logger.info( f"Setting model parallel to {model_parallel} since " f"the number of local processes is {self.num_local_processes} " f"and the number of GPUs is {len(max_memory_all_gpus)}" @@ -225,13 +228,13 @@ def init_model_parallel(self, model_parallel: bool | None = None) -> Tuple[bool, if k % self.num_local_processes == (self.accelerator.process_index % self.num_local_processes) } device_map = "auto" - hlog( + logger.info( f"Model parallel was set to True, setting max memory per GPU to {max_mem_this_process} and device map to {device_map}" ) else: max_mem_this_process = None device_map = None - hlog( + logger.info( f"Model parallel was set to False, max memory set to {max_mem_this_process} and device map to {device_map}" ) return model_parallel, max_mem_this_process, device_map @@ -332,7 +335,9 @@ def _create_auto_tokenizer_with_name( truncation_side="left", ) except FileNotFoundError: - hlog_warn("Problem when loading the tokenizer in the cache - discarding the provided cache path value.") + logger.warning( + "Problem when loading the tokenizer in the cache - discarding the provided cache path value." 
+ ) tokenizer = AutoTokenizer.from_pretrained( model_name if tokenizer_name is None else tokenizer_name, revision=revision + (f"/{subfolder}" if subfolder is not None else ""), @@ -343,7 +348,7 @@ def _create_auto_tokenizer_with_name( ) tokenizer.pad_token = tokenizer.eos_token tokenizer.model_max_length = self.max_length - hlog("Tokenizer truncation and padding size set to the left side.") + logger.info("Tokenizer truncation and padding size set to the left side.") return tokenizer @@ -409,7 +414,7 @@ def _model_call(self, inputs: torch.Tensor) -> torch.Tensor: def _get_batch_size(self, max_input_length: int, override_bs: int = 0, starting_batch_size: int = 512) -> int: if override_bs > 0: return override_bs - hlog(f"Detecting largest batch size with max_input_length={max_input_length}") + logger.info(f"Detecting largest batch size with max_input_length={max_input_length}") @find_executable_batch_size( starting_batch_size=starting_batch_size @@ -422,7 +427,7 @@ def forward_batch(batch_size): return batch_size batch_size = forward_batch() - hlog(f"Determined largest batch size: {batch_size}") + logger.info(f"Determined largest batch size: {batch_size}") return batch_size def greedy_until_multi_turn( # noqa: C901 @@ -440,7 +445,7 @@ def greedy_until_multi_turn( # noqa: C901 if self.accelerator: dataloader = self.accelerator.prepare(dataloader) - hlog_warn("Running greedy multi turn generation, the batch size is set to 1 for this task.") + logger.warning("Running greedy multi turn generation, the batch size is set to 1 for this task.") for request_batch in tqdm( dataloader, desc="Greedy Multi Turn generation", position=1, leave=False, disable=self.disable_tqdm @@ -650,7 +655,7 @@ def greedy_until( # should have been managed by the prompt creator/few shot manager if requested by the user. context_size = tokenized["input_ids"].shape[1] if context_size > self.max_length: - hlog_warn( + logger.warning( f"The context size of your batch ({context_size}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in" + str({i.task_name for i in batch}) + ". This is likely to lead to some errors." # noqa C401 @@ -949,7 +954,7 @@ def prepare_batch_logprob( padded = [] if max_context is None: - hlog_warn("max_context is None, using max_length") + logger.warning("max_context is None, using max_length") max_context = self.max_length # Each sample is concatenated and cut to length or padded to max_length @@ -964,7 +969,7 @@ def prepare_batch_logprob( padding_length = padding_length if padding_length is not None else sequence_len if padding_length - sequence_len < 0: - hlog_err(f"Padding length {padding_length} is smaller than input length {sequence_len}") + logger.warning(f"Padding length {padding_length} is smaller than input length {sequence_len}") raise ValueError("Negative padding") padded.append(padding_length - sequence_len) diff --git a/src/lighteval/models/delta_model.py b/src/lighteval/models/delta_model.py index 69fba37a..9aa8c01d 100644 --- a/src/lighteval/models/delta_model.py +++ b/src/lighteval/models/delta_model.py @@ -20,19 +20,22 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import logging from contextlib import nullcontext import torch from tqdm import tqdm from transformers import AutoModelForCausalLM -from lighteval.logging.hierarchical_logger import hlog from lighteval.models.base_model import BaseModel from lighteval.models.model_config import DeltaModelConfig from lighteval.models.utils import _get_dtype from lighteval.utils.utils import EnvConfig +logger = logging.getLogger(__name__) + + class DeltaModel(BaseModel): def _create_auto_model( self, @@ -48,7 +51,7 @@ def _create_auto_model( merged_path = f"{delta_model}-delta-applied" if self.accelerator.is_main_process if self.accelerator is not None else nullcontext(): - hlog(f"Loading base and delta models from {config.base_model} and {delta_model}") + logger.info(f"Loading base and delta models from {config.base_model} and {delta_model}") base = AutoModelForCausalLM.from_pretrained( config.base_model, torch_dtype=torch.float16, low_cpu_mem_usage=True, token=env_config.token ) @@ -64,10 +67,10 @@ def _create_auto_model( assert name in delta.state_dict() param.data += delta.state_dict()[name] - hlog("Saving delta-applied model") + logger.info("Saving delta-applied model") base.save_pretrained(merged_path) - hlog(f"Loading delta-applied model from {delta_model}-delta-applied") + logger.info(f"Loading delta-applied model from {delta_model}-delta-applied") model = AutoModelForCausalLM.from_pretrained( merged_path, diff --git a/src/lighteval/models/endpoint_model.py b/src/lighteval/models/endpoint_model.py index bc2c7eac..bd82f058 100644 --- a/src/lighteval/models/endpoint_model.py +++ b/src/lighteval/models/endpoint_model.py @@ -21,6 +21,7 @@ # SOFTWARE. import asyncio +import logging import re import time from typing import Coroutine, List, Optional, Union @@ -45,7 +46,6 @@ from transformers import AutoTokenizer from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset -from lighteval.logging.hierarchical_logger import hlog, hlog_err, hlog_warn from lighteval.models.abstract_model import LightevalModel, ModelInfo from lighteval.models.model_config import InferenceEndpointModelConfig, InferenceModelConfig from lighteval.models.model_output import GenerativeResponse, LoglikelihoodResponse, LoglikelihoodSingleTokenResponse @@ -58,6 +58,8 @@ from lighteval.utils.utils import EnvConfig, as_list +logger = logging.getLogger(__name__) + BATCH_SIZE = 50 MAX_TIME_FOR_SPINUP = 3600 @@ -117,7 +119,7 @@ def __init__( # noqa: C901 try: if self.endpoint is None: # Endpoint does not exist yet locally if not config.should_reuse_existing: # New endpoint - hlog("Creating endpoint.") + logger.info("Creating endpoint.") self.endpoint: InferenceEndpoint = create_inference_endpoint( name=endpoint_name, namespace=config.namespace, @@ -150,7 +152,7 @@ def __init__( # noqa: C901 }, ) else: # Endpoint exists - hlog("Reusing existing endpoint.") + logger.info("Reusing existing endpoint.") self.endpoint = get_inference_endpoint( name=endpoint_name, token=env_config.token, namespace=config.namespace ) @@ -158,13 +160,13 @@ def __init__( # noqa: C901 else: # Endpoint exists locally but either failed (and most likely it must be scaled up) if must_scaleup_endpoint: - hlog("Rescaling existing endpoint.") + logger.info("Rescaling existing endpoint.") self.endpoint.update(instance_size=instance_size, instance_type=instance_type) must_scaleup_endpoint = False # or we got a connection error, in which case we do nothing and just wait at the next step # Waits for the endpoint to be deployed - we could also check for the status in 
updating', 'pending', 'initializing' - hlog("Trying to deploy your endpoint. Please wait for 10 min.") + logger.info("Trying to deploy your endpoint. Please wait for 10 min.") self.endpoint.wait(timeout=600, refresh_every=60) # We wait for 10 min except InferenceEndpointError as e: instance_type, instance_size = InferenceEndpointModel.get_larger_hardware_suggestion( @@ -172,11 +174,13 @@ def __init__( # noqa: C901 ) must_scaleup_endpoint = True - hlog( + logger.info( f"Endpoint failed to start on current hardware with error {e}. Trying to autoscale to ({instance_type}, {instance_size})." ) except InferenceEndpointTimeoutError as e: - hlog_err("Endpoint did not start within 30 minutes, there was a timeout. Please inspect the logs.") + logger.error( + "Endpoint did not start within 30 minutes, there was a timeout. Please inspect the logs." + ) raise e except HfHubHTTPError as e: # The endpoint actually already exists, we'll spin it up instead of trying to create a new one @@ -185,20 +189,20 @@ def __init__( # noqa: C901 config.should_reuse_existing = True # Requested resources are not available elif "Bad Request: Compute instance not available yet" in str(e): - hlog_err( - "The hardware combination you are requesting does not seem to be available: ({instance_type}, {instance_size}, {config.region})." + logger.error( + f"The hardware combination you are requesting does not seem to be available: ({instance_type}, {instance_size}, {config.region})." ) raise e # User account does not have access to requested resources elif "Conflict: Quota exceeded" in str(e): raise e except ConnectionError as e: - hlog_err(f"Connection failed with error {e}. Retrying") + logger.error(f"Connection failed with error {e}. Retrying") if not self.endpoint.status == "running": raise Exception("Did not manage to start endpoint within the elapsed time and on suggested hardware.") - hlog("Endpoint successfully deployed!") + logger.info("Endpoint successfully deployed!") self.endpoint_name = config.endpoint_name self.name = self.endpoint.repository self.revision = self.endpoint.revision @@ -278,12 +282,12 @@ def cleanup(self): if self.endpoint is not None: if self.reuse_existing: self.endpoint.pause() - hlog_warn( + logger.warning( "Since your endpoint was existing before, we did not delete it, but paused it instead. You might want to delete it if you're done using it." ) else: self.endpoint.delete() - hlog_warn( + logger.warning( "We deleted the spinned up endpoint after using it. You'll need to create it again if you need to reuse it." ) @@ -425,7 +429,7 @@ def greedy_until( returns_logits = batch[0].use_logits num_samples = batch[0].num_samples if num_samples > 1: - hlog_err( + logger.error( "Inference endpoints does not allow sampling evaluations - this is likely to fail or provide problematic results" ) diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index 268e2a6f..1eda1e02 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -20,13 +20,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import logging from dataclasses import dataclass from typing import Dict, Optional, Union import torch from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig, PretrainedConfig -from lighteval.logging.hierarchical_logger import hlog from lighteval.models.utils import _get_model_sha from lighteval.utils.imports import ( NO_AUTOGPTQ_ERROR_MSG, @@ -40,6 +40,8 @@ from lighteval.utils.utils import EnvConfig, boolstring_to_bool +logger = logging.getLogger(__name__) + if is_accelerate_available(): from accelerate import Accelerator @@ -120,11 +122,11 @@ def __post_init__(self): if self.multichoice_continuations_start_space is not None: if self.multichoice_continuations_start_space: - hlog( + logger.info( "You set `multichoice_continuations_start_space` to true. This will force multichoice continuations to use a starting space" ) else: - hlog( + logger.info( "You set `multichoice_continuations_start_space` to false. This will remove a leading space from multichoice continuations, if present." ) @@ -154,7 +156,7 @@ def _init_configs(self, model_name: str, env_config: EnvConfig) -> PretrainedCon # Gathering the model's automatic quantization config, if available try: model_auto_quantization_config = auto_config.quantization_config - hlog("An automatic quantization config was found in the model's config. Using it to load the model") + logger.info("An automatic quantization config was found in the model's config. Using it to load the model") except (AttributeError, KeyError): model_auto_quantization_config = None diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py index e783c86d..1a409746 100644 --- a/src/lighteval/models/model_loader.py +++ b/src/lighteval/models/model_loader.py @@ -20,9 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging from typing import Union -from lighteval.logging.hierarchical_logger import hlog from lighteval.models.adapter_model import AdapterModel from lighteval.models.base_model import BaseModel from lighteval.models.delta_model import DeltaModel @@ -52,6 +52,9 @@ from lighteval.utils.utils import EnvConfig +logger = logging.getLogger(__name__) + + def load_model( # noqa: C901 config: Union[ BaseModelConfig, @@ -104,7 +107,7 @@ def load_model_with_tgi(config: TGIModelConfig): if not is_tgi_available(): raise ImportError(NO_TGI_ERROR_MSG) - hlog(f"Load model from inference server: {config.inference_server_address}") + logger.info(f"Load model from inference server: {config.inference_server_address}") model = ModelClient( address=config.inference_server_address, auth_token=config.inference_server_auth, model_id=config.model_id ) @@ -121,7 +124,7 @@ def load_openai_model(config: OpenAIModelConfig, env_config: EnvConfig): def load_model_with_inference_endpoints(config: InferenceEndpointModelConfig, env_config: EnvConfig): - hlog("Spin up model using inference endpoint.") + logger.info("Spin up model using inference endpoint.") model = InferenceEndpointModel(config=config, env_config=env_config) return model diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index ded1624f..21b60504 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -21,6 +21,7 @@ # SOFTWARE. 
# ruff: noqa: C901 +import logging import os import time from typing import List, Optional, Tuple, Type, Union @@ -41,7 +42,6 @@ LoglikelihoodDataset, LoglikelihoodSingleTokenDataset, ) -from lighteval.logging.hierarchical_logger import hlog_err, hlog_warn from lighteval.models.base_model import LightevalModel, ModelInfo from lighteval.models.model_output import ( Batch, @@ -59,6 +59,9 @@ from lighteval.utils.utils import EnvConfig, as_list +logger = logging.getLogger(__name__) + + os.environ["TOKENIZERS_PARALLELISM"] = "false" TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding] @@ -1186,7 +1189,7 @@ def greedy_until( returns_logits = batch[0].use_logits num_samples = batch[0].num_samples if num_samples > 1: - hlog_err( + logger.error( "Nonotron models does not allow sampling evaluations - this is likely to fail or provide problematic results" ) @@ -1210,7 +1213,7 @@ def greedy_until( # should have been managed by the prompt creator/few shot manager if requested by the user. context_size = tokenized["input_ids"].shape[1] if context_size > self.max_length: - hlog_warn( + logger.warning( f"The context size of your batch ({context_size}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in" + str({i.task_name for i in batch}) + ". This is likely to lead to some errors." # noqa C401 diff --git a/src/lighteval/models/openai_model.py b/src/lighteval/models/openai_model.py index f799a45a..12fbeb95 100644 --- a/src/lighteval/models/openai_model.py +++ b/src/lighteval/models/openai_model.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging import os import time from concurrent.futures import ThreadPoolExecutor @@ -28,7 +29,6 @@ from tqdm import tqdm from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.models.abstract_model import LightevalModel from lighteval.models.endpoint_model import ModelInfo from lighteval.models.model_output import ( @@ -45,6 +45,9 @@ from lighteval.utils.imports import is_openai_available +logger = logging.getLogger(__name__) + + if is_openai_available(): import logging @@ -90,7 +93,7 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, logit_b ) return response except Exception as e: - hlog_warn(f"{type(e), e}") + logger.warning(f"{type(e), e}") time.sleep(self.API_RETRY_SLEEP) self.API_RETRY_SLEEP = self.API_RETRY_SLEEP**self.API_RETRY_MULTIPLIER raise Exception("Failed to get response from the API") diff --git a/src/lighteval/models/vllm_model.py b/src/lighteval/models/vllm_model.py index dc242c60..ecfe8fd8 100644 --- a/src/lighteval/models/vllm_model.py +++ b/src/lighteval/models/vllm_model.py @@ -22,6 +22,7 @@ import gc import itertools +import logging import os from typing import Optional @@ -29,7 +30,6 @@ from tqdm import tqdm from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.models.abstract_model import LightevalModel, ModelInfo from lighteval.models.model_config import VLLMModelConfig from lighteval.models.model_output import ( @@ -45,6 +45,9 @@ from lighteval.utils.utils import EnvConfig, as_list +logger = logging.getLogger(__name__) + + if is_vllm_available(): import ray from more_itertools import distribute @@ -225,14 +228,14 @@ def greedy_until( # left truncate the inputs to the maximum length if 
max_new_tokens is not None: if context_size + max_new_tokens > self.max_length: - hlog_warn( + logger.warning( f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." ) context_size = self.max_length - max_new_tokens inputs = [input[-context_size:] for input in inputs] else: if context_size > self.max_length: - hlog_warn( + logger.warning( f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." ) context_size = self.max_length diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index e429e519..facecd8e 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -32,7 +32,6 @@ import numpy as np from lighteval.logging.evaluation_tracker import EvaluationTracker -from lighteval.logging.hierarchical_logger import hlog, htrack_block from lighteval.metrics.utils.metric_utils import MetricCategory from lighteval.models.model_loader import BaseModel, load_model from lighteval.models.model_output import ModelResponse @@ -65,6 +64,12 @@ from lighteval.models.nanotron_model import NanotronLightevalModel +import logging + + +logger = logging.getLogger(__name__) + + class ParallelismManager(Enum): ACCELERATE = auto() NANOTRON = auto() @@ -124,8 +129,8 @@ def __init__( self.pipeline_parameters = pipeline_parameters self.launcher_type = self.pipeline_parameters.launcher_type if self.pipeline_parameters.max_samples: - hlog( - "WARNING: --max_samples WAS SET. THESE NUMBERS ARE ONLY PARTIAL AND SHOULD NOT BE USED FOR COMPARISON UNLESS YOU KNOW WHAT YOU ARE DOING." + logger.warning( + "--max_samples WAS SET. THESE NUMBERS ARE ONLY PARTIAL AND SHOULD NOT BE USED FOR COMPARISON UNLESS YOU KNOW WHAT YOU ARE DOING." 
) self.model_config = model_config @@ -141,93 +146,88 @@ def __init__( def _init_parallelism_manager(self): accelerator, parallel_context = None, None - with htrack_block("Test all gather"): - if self.launcher_type == ParallelismManager.ACCELERATE: - if not is_accelerate_available(): - raise ValueError("You are trying to launch an accelerate model, but accelerate is not installed") - accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) - test_all_gather(accelerator=accelerator) - elif self.launcher_type == ParallelismManager.NANOTRON: - if not is_nanotron_available(): - raise ValueError("You are trying to launch a nanotron model, but nanotron is not installed") - dist.initialize_torch_distributed() - parallel_context = ParallelContext( - tensor_parallel_size=self.model_config.lighteval_config.parallelism.tp, - pipeline_parallel_size=self.model_config.lighteval_config.parallelism.pp, - data_parallel_size=self.model_config.lighteval_config.parallelism.dp, - ) - test_all_gather(parallel_context=parallel_context) + if self.launcher_type == ParallelismManager.ACCELERATE: + if not is_accelerate_available(): + raise ValueError("You are trying to launch an accelerate model, but accelerate is not installed") + accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) + test_all_gather(accelerator=accelerator) + elif self.launcher_type == ParallelismManager.NANOTRON: + if not is_nanotron_available(): + raise ValueError("You are trying to launch a nanotron model, but nanotron is not installed") + dist.initialize_torch_distributed() + parallel_context = ParallelContext( + tensor_parallel_size=self.model_config.lighteval_config.parallelism.tp, + pipeline_parallel_size=self.model_config.lighteval_config.parallelism.pp, + data_parallel_size=self.model_config.lighteval_config.parallelism.dp, + ) + test_all_gather(parallel_context=parallel_context) - return accelerator, parallel_context + return accelerator, parallel_context def _init_model(self, model_config, model): - with htrack_block("Model loading"): - if model_config is not None: - if self.parallel_context: - return NanotronLightevalModel( - checkpoint_path=os.path.dirname(self.pipeline_parameters.nanotron_checkpoint_path) - if self.pipeline_parameters.nanotron_checkpoint_path - else "", - nanotron_config=self.model_config, - parallel_context=self.parallel_context, - debug_one_layer_model=False, - model_class=None, - env_config=self.pipeline_parameters.env_config, - ) - else: - return load_model(config=model_config, env_config=self.pipeline_parameters.env_config) - if isinstance(model, BaseModel): - return model - else: - return BaseModel.from_model( - model=model, - use_chat_template=self.pipeline_parameters.use_chat_template, + logger.info("--- LOADING MODEL ---") + if model_config is not None: + if self.parallel_context: + return NanotronLightevalModel( + checkpoint_path=os.path.dirname(self.pipeline_parameters.nanotron_checkpoint_path) + if self.pipeline_parameters.nanotron_checkpoint_path + else "", + nanotron_config=self.model_config, + parallel_context=self.parallel_context, + debug_one_layer_model=False, + model_class=None, env_config=self.pipeline_parameters.env_config, - accelerator=self.accelerator, ) + else: + return load_model(config=model_config, env_config=self.pipeline_parameters.env_config) + if isinstance(model, BaseModel): + return model + else: + return BaseModel.from_model( + model=model, + 
use_chat_template=self.pipeline_parameters.use_chat_template, + env_config=self.pipeline_parameters.env_config, + accelerator=self.accelerator, + ) def _init_tasks_and_requests(self, tasks: str): - with htrack_block("Tasks loading"): - with local_ranks_zero_first() if self.launcher_type == ParallelismManager.NANOTRON else nullcontext(): - registry = Registry( - cache_dir=self.pipeline_parameters.env_config.cache_dir, - custom_tasks=self.pipeline_parameters.custom_tasks_directory, - ) - task_names_list, fewshots_dict = taskinfo_selector(tasks, registry) - task_dict = registry.get_task_dict(task_names_list) - LightevalTask.load_datasets( - list(task_dict.values()), self.pipeline_parameters.dataset_loading_processes - ) + with local_ranks_zero_first() if self.launcher_type == ParallelismManager.NANOTRON else nullcontext(): + logger.info("--- LOADING TASKS ---") + registry = Registry( + cache_dir=self.pipeline_parameters.env_config.cache_dir, + custom_tasks=self.pipeline_parameters.custom_tasks_directory, + ) + task_names_list, fewshots_dict = taskinfo_selector(tasks, registry) + task_dict = registry.get_task_dict(task_names_list) + LightevalTask.load_datasets(list(task_dict.values()), self.pipeline_parameters.dataset_loading_processes) - self.evaluation_tracker.task_config_logger.log(task_dict) - - hlog("Loading documents, and requests") - requests, docs = create_requests_from_tasks( - task_dict=task_dict, - fewshot_dict=fewshots_dict, - num_fewshot_seeds=self.pipeline_parameters.num_fewshot_seeds, - lm=self.model, - max_samples=self.pipeline_parameters.max_samples, - evaluation_tracker=self.evaluation_tracker, - use_chat_template=self.pipeline_parameters.use_chat_template, - system_prompt=self.pipeline_parameters.system_prompt, - ) + self.evaluation_tracker.task_config_logger.log(task_dict) + + requests, docs = create_requests_from_tasks( + task_dict=task_dict, + fewshot_dict=fewshots_dict, + num_fewshot_seeds=self.pipeline_parameters.num_fewshot_seeds, + lm=self.model, + max_samples=self.pipeline_parameters.max_samples, + evaluation_tracker=self.evaluation_tracker, + use_chat_template=self.pipeline_parameters.use_chat_template, + system_prompt=self.pipeline_parameters.system_prompt, + ) - self.task_names_list = task_names_list - self.task_dict = task_dict - self.fewshot_dict = fewshots_dict - self.requests = requests - self.docs = docs + self.task_names_list = task_names_list + self.task_dict = task_dict + self.fewshot_dict = fewshots_dict + self.requests = requests + self.docs = docs def _init_random_seeds(self): - with htrack_block("Setting seeds and waiting for all processes"): - hlog(f"setting seed to {1234} for random and numpy") - random.seed(1234) - np.random.seed(1234) - if self.accelerator is not None: - self.accelerator.wait_for_everyone() - if self.parallel_context is not None: - dist.barrier() + logger.info("--- INIT SEEDS ---") + random.seed(1234) + np.random.seed(1234) + if self.accelerator is not None: + self.accelerator.wait_for_everyone() + if self.parallel_context is not None: + dist.barrier() def is_main_process(self): if self.accelerator: @@ -237,43 +237,38 @@ def is_main_process(self): return True def evaluate(self): - with htrack_block("Evaluation"): - self.evaluation_tracker.general_config_logger.log_args_info( - num_fewshot_seeds=self.pipeline_parameters.num_fewshot_seeds, - override_batch_size=self.pipeline_parameters.override_batch_size, - max_samples=self.pipeline_parameters.max_samples, - job_id=self.pipeline_parameters.job_id, - config=self.model_config, - ) + 
self.evaluation_tracker.general_config_logger.log_args_info( + num_fewshot_seeds=self.pipeline_parameters.num_fewshot_seeds, + override_batch_size=self.pipeline_parameters.override_batch_size, + max_samples=self.pipeline_parameters.max_samples, + job_id=self.pipeline_parameters.job_id, + config=self.model_config, + ) - hlog(f"Evaluate on {len(self.task_names_list)} tasks.") - sample_id_to_responses = self._run_model() - self._compute_metrics(sample_id_to_responses) + sample_id_to_responses = self._run_model() + self._compute_metrics(sample_id_to_responses) if self.is_main_process(): - with htrack_block("Compiling results"): - self.evaluation_tracker.general_config_logger.log_end_time() - self.evaluation_tracker.metrics_logger.aggregate(task_dict=self.task_dict, bootstrap_iters=1000) - self.evaluation_tracker.details_logger.aggregate() - - with htrack_block("Cleaning up"): # For non nanotron models - for weights in ["delta", "adapter"]: - try: - tmp_weights_dir = ( - f"{self.evaluation_tracker.general_config_logger.model_name}-{weights}-applied" - ) - shutil.rmtree(tmp_weights_dir) - hlog(f"Removed {tmp_weights_dir}") - except OSError: - pass + self.evaluation_tracker.general_config_logger.log_end_time() + self.evaluation_tracker.metrics_logger.aggregate(task_dict=self.task_dict, bootstrap_iters=1000) + self.evaluation_tracker.details_logger.aggregate() + + for weights in ["delta", "adapter"]: + try: + tmp_weights_dir = f"{self.evaluation_tracker.general_config_logger.model_name}-{weights}-applied" + shutil.rmtree(tmp_weights_dir) + logger.info(f"Removed {tmp_weights_dir}") + except OSError: + pass def _run_model(self): # Running all requests depending on the model call type (log likelihood, generative, ...) # to be able to batch them + logger.info("--- RUNNING MODEL ---") sample_id_to_responses: dict[(SampleUid, MetricCategory), list[ModelResponse]] = collections.defaultdict(list) for request_type, requests in self.requests.items(): - hlog(f"Running {request_type} requests") + logger.info(f"Running {request_type} requests") run_model = self.model.get_method_from_request_type(request_type=request_type) responses = run_model(requests, override_bs=self.pipeline_parameters.override_batch_size) @@ -301,6 +296,7 @@ def _compute_metrics(self, sample_id_to_responses): # "responses": [[response1_1, response1_2, ...], [response2_1, response2_2, ...], ...], # "docs": [doc1, doc2, ...] 
# } + logger.info("--- COMPUTING METRICS ---") task_metric_category_groups = collections.defaultdict( lambda: collections.defaultdict(lambda: collections.defaultdict(list)) ) @@ -333,6 +329,7 @@ def _compute_metrics(self, sample_id_to_responses): self.evaluation_tracker.details_logger.log(task_name, task, doc, response, output) def save_and_push_results(self): + logger.info("--- SAVING AND PUSHING RESULTS ---") if self.is_main_process(): self.evaluation_tracker.save() @@ -342,6 +339,7 @@ def _init_final_dict(self): self.final_dict = self.evaluation_tracker.generate_final_dict() def show_results(self): + logger.info("--- DISPLAYING RESULTS ---") self._init_final_dict() if self.is_main_process(): print(make_results_table(self.final_dict)) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 5b6a3312..c5395281 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -22,6 +22,7 @@ import ast import json +import logging import random import re import string @@ -29,11 +30,13 @@ import numpy as np import pycountry -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.tasks.requests import Doc from lighteval.utils.utils import as_list +logger = logging.getLogger(__name__) + + # fmt: off LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"] INTEGER_INDICES = list(map(str, list(range(1, 27)))) @@ -277,7 +280,9 @@ def bbh_logical_deduction_three_objects(line, task_name: str = None): def bbh_movie_recommendation(line, task_name: str = None): if line["target"] == "Monsters, Inc": # this line is not correctly formatted - hlog_warn("One sample removed from task bbh:movie_recommentation because its line is incorrectly formatted.") + logger.warning( + "One sample removed from task bbh:movie_recommentation because its line is incorrectly formatted." 
+ ) return [] instruction = "Recommend movies similar to the given list of movies.\n\n" choices = [f"({c})" for c in LETTER_INDICES[:6]] @@ -318,7 +323,7 @@ def bbh_reasoning_about_colored_objects(line, task_name: str = None): def bbh_ruin_names(line, task_name: str = None): if line["target"] in ["dearth, wind, & fire", "rita, sue and bob poo"]: # line not correctly formatted - hlog_warn("One sample removed from task bbh:ruin_names because its line is incorrectly formatted.") + logger.warning("One sample removed from task bbh:ruin_names because its line is incorrectly formatted.") return [] instruction = "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" choices = [f"({c})" for c in LETTER_INDICES[:6]] diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index cba69457..9d08ba12 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -22,6 +22,7 @@ import collections import inspect +import logging import random from dataclasses import asdict, dataclass, field from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple @@ -31,7 +32,6 @@ from multiprocess import Pool from pytablewriter import MarkdownTableWriter -from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.metrics import ( apply_generative_metric, apply_llm_as_judge_metric, @@ -60,6 +60,8 @@ if TYPE_CHECKING: from lighteval.logging.evaluation_tracker import EvaluationTracker +logger = logging.getLogger(__name__) + @dataclass class LightevalTaskConfig: @@ -188,7 +190,7 @@ def __init__( # noqa: C901 self.dataset_filter = cfg.hf_filter self.trust_dataset = cfg.trust_dataset self.dataset: Optional[DatasetDict] = None # Delayed download - hlog(f"{self.dataset_path} {self.dataset_config_name}") + logger.info(f"{self.dataset_path} {self.dataset_config_name}") self._fewshot_docs = None self._docs = None @@ -207,7 +209,7 @@ def __init__( # noqa: C901 ignored = [metric for metric in self.metrics if metric.category == MetricCategory.IGNORED] if len(ignored) > 0: - hlog_warn(f"[WARNING] Not implemented yet: ignoring the metric {' ,'.join(ignored)} for task {self.name}.") + logger.warning(f"Not implemented yet: ignoring the metric {' ,'.join(ignored)} for task {self.name}.") current_categories = [metric.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} @@ -262,7 +264,7 @@ def get_first_possible_fewshot_splits( if len(stored_splits) > 0: return stored_splits[:number_of_splits] - hlog_warn(f"Careful, the task {self.name} is using evaluation data to build the few shot examples.") + logger.warning(f"Careful, the task {self.name} is using evaluation data to build the few shot examples.") return None def _get_docs_from_split(self, splits: list[str], few_shots=False) -> list[Doc]: diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py index 7555b72a..982a6654 100644 --- a/src/lighteval/tasks/prompt_manager.py +++ b/src/lighteval/tasks/prompt_manager.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import logging import random from collections import defaultdict from dataclasses import dataclass @@ -27,12 +28,14 @@ from itertools import cycle from typing import TYPE_CHECKING, Optional, Tuple, Union -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.models.abstract_model import LightevalModel from lighteval.tasks.requests import Doc from lighteval.utils.utils import as_list +logger = logging.getLogger(__name__) + + if TYPE_CHECKING: from lighteval.tasks.lighteval_task import LightevalTask @@ -416,5 +419,5 @@ def get_fewshot_seeds(self, few_shot_iterations: int = None) -> list[int]: if few_shot_iterations <= 1: return [0] seeds = range(few_shot_iterations) - hlog_warn(f"Running {self.task.name} with {few_shot_iterations} few-shot iterations.") + logger.warning(f"Running {self.task.name} with {few_shot_iterations} few-shot iterations.") return seeds diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 612d981d..69532c09 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -22,23 +22,24 @@ import collections import importlib +import logging import os from functools import lru_cache, partial from itertools import groupby from pathlib import Path -from pprint import pformat from types import ModuleType from typing import Callable, Dict, List, Optional, Union from datasets.load import dataset_module_factory import lighteval.tasks.default_tasks as default_tasks -from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.tasks.extended import AVAILABLE_EXTENDED_TASKS_MODULES from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig from lighteval.utils.imports import CANNOT_USE_EXTENDED_TASKS_MSG, can_load_extended_tasks +logger = logging.getLogger(__name__) + # Helm, Bigbench, Harness are implementations following an evaluation suite setup # Original follows the original implementation as closely as possible # Leaderboard are the evaluations we fixed on the open llm leaderboard - you should get similar results @@ -104,8 +105,7 @@ def get_task_instance(self, task_name: str): """ task_class = self.task_registry.get(task_name) if task_class is None: - hlog_warn(f"{task_name} not found in provided tasks") - hlog_warn(pformat(list(self.task_registry.keys()))) + logger.error(f"{task_name} not found in provided tasks") raise ValueError(f"Cannot find tasks {task_name} in task list or in custom task registry)") return task_class() @@ -133,12 +133,12 @@ def task_registry(self): for extended_task_module in AVAILABLE_EXTENDED_TASKS_MODULES: custom_tasks_module.append(extended_task_module) else: - hlog_warn(CANNOT_USE_EXTENDED_TASKS_MSG) + logger.warning(CANNOT_USE_EXTENDED_TASKS_MSG) for module in custom_tasks_module: TASKS_TABLE.extend(module.TASKS_TABLE) # We don't log the tasks themselves as it makes the logs unreadable - hlog(f"Found {len(module.TASKS_TABLE)} custom tasks in {module.__file__}") + logger.info(f"Found {len(module.TASKS_TABLE)} custom tasks in {module.__file__}") if len(TASKS_TABLE) > 0: custom_tasks_registry = create_lazy_tasks(meta_table=TASKS_TABLE, cache_dir=self._cache_dir) @@ -147,7 +147,7 @@ def task_registry(self): # Check the overlap between default_tasks_registry and custom_tasks_registry intersection = set(default_tasks_registry.keys()).intersection(set(custom_tasks_registry.keys())) if len(intersection) > 0: - hlog_warn( + logger.warning( f"Following tasks ({intersection}) exists both in the default and custom tasks. 
Will use the default ones on conflict." ) @@ -315,7 +315,9 @@ def taskinfo_selector(tasks: str, task_registry: Registry) -> tuple[list[str], d few_shot = int(few_shot) if suite_name not in DEFAULT_SUITES: - hlog(f"Suite {suite_name} unknown. This is not normal, unless you are testing adding new evaluations.") + logger.warning( + f"Suite {suite_name} unknown. This is not normal, unless you are testing adding new evaluations." + ) # This adds support for task supersets (eg: mmlu -> all the mmlu tasks) for expanded_task in task_registry.expand_task_definition(f"{suite_name}|{task_name}"): @@ -348,7 +350,7 @@ def create_lazy_tasks( # Every task is renamed suite|task, if the suite is in DEFAULT_SUITE for config in meta_table: if not any(suite in config.suite for suite in DEFAULT_SUITES): - hlog_warn( + logger.warning( f"This evaluation is not in any known suite: {config.name} is in {config.suite}, not in {DEFAULT_SUITES}. Skipping." ) continue diff --git a/src/lighteval/utils/parallelism.py b/src/lighteval/utils/parallelism.py index 892725d9..3308240c 100644 --- a/src/lighteval/utils/parallelism.py +++ b/src/lighteval/utils/parallelism.py @@ -23,10 +23,10 @@ import functools import gc import inspect +import logging import torch -from lighteval.logging.hierarchical_logger import hlog, logger from lighteval.utils.imports import ( NO_ACCELERATE_ERROR_MSG, NO_NANOTRON_ERROR_MSG, @@ -35,6 +35,9 @@ ) +logger = logging.getLogger(__name__) + + def should_reduce_batch_size(exception: Exception) -> bool: """ Checks if `exception` relates to CUDA out-of-memory, CUDNN not supported, or CPU out-of-memory @@ -127,10 +130,10 @@ def test_all_gather(accelerator=None, parallel_context=None): if accelerator: if not is_accelerate_available(): raise ImportError(NO_ACCELERATE_ERROR_MSG) - hlog("Test gather tensor") + logger.info("Test gather tensor") test_tensor: torch.Tensor = torch.tensor([accelerator.process_index], device=accelerator.device) gathered_tensor: torch.Tensor = accelerator.gather(test_tensor) - hlog(f"gathered_tensor {gathered_tensor}, should be {list(range(accelerator.num_processes))}") + logger.info(f"gathered_tensor {gathered_tensor}, should be {list(range(accelerator.num_processes))}") accelerator.wait_for_everyone() elif parallel_context: if not is_nanotron_available(): @@ -138,7 +141,7 @@ def test_all_gather(accelerator=None, parallel_context=None): from nanotron import distributed as dist from nanotron import logging - hlog("Test gather tensor") + logger.info("Test gather tensor") # Do a first NCCL sync to warmup and try to avoid Timeout after model/data loading logging.log_rank( f"[TEST] Running NCCL sync for ranks {list(range(parallel_context.world_pg.size()))}", @@ -162,4 +165,4 @@ def test_all_gather(accelerator=None, parallel_context=None): del test_tensor_list del test_tensor else: - hlog("Not running in a parallel setup, nothing to test") + logger.info("Not running in a parallel setup, nothing to test")
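
Note on the logging approach used throughout this patch: the hierarchical `hlog`/`hlog_warn`/`hlog_err` helpers are replaced everywhere by standard-library module loggers created with `logging.getLogger(__name__)`. The sketch below shows one way such module loggers can be wired to a single colorized handler via `colorlog`, which this patch adds to the dependencies. The helper name `setup_colored_logging` and the format string are illustrative assumptions for the example only; the actual configuration added elsewhere in this patch is not shown in this section.

```python
# Minimal sketch, assuming colorlog is installed; not the configuration code added in this PR.
import logging

import colorlog


def setup_colored_logging(level: int = logging.INFO) -> None:
    """Attach a single colorized stream handler to the 'lighteval' parent logger."""
    handler = colorlog.StreamHandler()
    handler.setFormatter(
        colorlog.ColoredFormatter("%(log_color)s[%(levelname)s] %(name)s: %(message)s")
    )
    parent = logging.getLogger("lighteval")
    parent.setLevel(level)
    parent.addHandler(handler)


# Every module logger created as `logger = logging.getLogger(__name__)` inside the
# lighteval package (e.g. lighteval.pipeline) propagates up to this handler.
setup_colored_logging()
logging.getLogger("lighteval.pipeline").info("--- LOADING MODEL ---")
```

Configuring only the `lighteval` parent logger leaves third-party libraries' log levels untouched, while every `lighteval.*` module logger inherits the handler through normal propagation.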