diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index aebf6d33..7b980508 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -36,6 +36,8 @@ jobs: - name: Test env: HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }} + HF_HOME: "cache/models" + HF_DATASETS_CACHE: "cache/datasets" run: | # PYTHONPATH="${PYTHONPATH}:src" HF_DATASETS_CACHE="cache/datasets" HF_HOME="cache/models" python -m pytest --disable-pytest-warnings - name: Write cache diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx index 752c4e54..2fbff552 100644 --- a/docs/source/adding-a-custom-task.mdx +++ b/docs/source/adding-a-custom-task.mdx @@ -191,8 +191,7 @@ Once your file is created you can then run the evaluation with the following com ```bash lighteval accelerate \ - --model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \ - --tasks "community|{custom_task}|{fewshots}|{truncate_few_shot}" \ - --custom_tasks {path_to_your_custom_task_file} \ - --output_dir "./evals" + "pretrained=HuggingFaceH4/zephyr-7b-beta" \ + "community|{custom_task}|{fewshots}|{truncate_few_shot}" \ + --custom-tasks {path_to_your_custom_task_file} ``` diff --git a/docs/source/available-tasks.mdx b/docs/source/available-tasks.mdx index 9b167d21..f340fabb 100644 --- a/docs/source/available-tasks.mdx +++ b/docs/source/available-tasks.mdx @@ -3,7 +3,13 @@ You can get a list of all the available tasks by running: ```bash -lighteval tasks --list +lighteval tasks list +``` + +You can also inspect a specific task by running: + +```bash +lighteval tasks inspect ``` ## List of tasks diff --git a/docs/source/evaluate-the-model-on-a-server-or-container.mdx b/docs/source/evaluate-the-model-on-a-server-or-container.mdx index da8f1d4b..0d9a7d12 100644 --- a/docs/source/evaluate-the-model-on-a-server-or-container.mdx +++ b/docs/source/evaluate-the-model-on-a-server-or-container.mdx @@ -6,10 +6,9 @@ to the server. The command is the same as before, except you specify a path to a yaml config file (detailed below): ```bash -lighteval accelerate \ - --model_config_path="/path/to/config/file"\ - --tasks \ - --output_dir output_dir +lighteval endpoint {tgi,inference-endpoint} \ + "/path/to/config/file"\ + ``` There are two types of configuration files that can be provided for running on @@ -65,3 +64,19 @@ model: inference_server_auth: null model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory ``` + +### OpenAI API + +Lighteval also supports evaluating models on the OpenAI API. To do so you need to set your OpenAI API key in the environment variable. + +```bash +export OPENAI_API_KEY={your_key} +``` + +And then run the following command: + +```bash +lighteval endpoint openai \ + {model-name} \ + +``` diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 9c055f5e..fa1895b7 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -5,7 +5,7 @@ backends—whether it's [transformers](https://github.com/huggingface/transformers), [tgi](https://github.com/huggingface/text-generation-inference), [vllm](https://github.com/vllm-project/vllm), or -[nanotron](https://github.com/huggingface/nanotron)—with +[nanotron](https://github.com/huggingface/nanotron)-with ease. Dive deep into your model’s performance by saving and exploring detailed, sample-by-sample results to debug and see how your models stack-up. 
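The intro above points at the saved, sample-by-sample details; as a hedged illustration of how those can be explored, the sketch below reads one details file back with the `datasets` library. The path is hypothetical — real files follow the `{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet` pattern described in the logging docs further down in this diff.

```python
# Minimal sketch (hypothetical path): read a lighteval details parquet file and print a few rows.
from datasets import load_dataset

details_file = (
    "results/details/gpt2/2024-06-01T00-00-00/"
    "details_leaderboard_truthfulqa_2024-06-01T00-00-00.parquet"  # hypothetical file name
)
details = load_dataset("parquet", data_files=details_file, split="train")

# Each row holds one per-sample record (prompt, model prediction, gold, metrics —
# exact columns depend on the task), which is what "sample-by-sample results" refers to above.
for row in details.select(range(min(3, len(details)))):
    print(row)
```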
diff --git a/docs/source/package_reference/model_config.mdx b/docs/source/package_reference/model_config.mdx index c70258bb..e2ecceb4 100644 --- a/docs/source/package_reference/model_config.mdx +++ b/docs/source/package_reference/model_config.mdx @@ -8,5 +8,3 @@ [[autodoc]] models.model_config.InferenceModelConfig [[autodoc]] models.model_config.TGIModelConfig [[autodoc]] models.model_config.VLLMModelConfig - -[[autodoc]] models.model_config.create_model_config diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx index 5f66547e..b2190245 100644 --- a/docs/source/quicktour.mdx +++ b/docs/source/quicktour.mdx @@ -1,11 +1,24 @@ # Quicktour -We provide two main entry points to evaluate models: + +> [!TIP] +> We recommend using the `--help` flag to get more information about the +> available options for each command. +> `lighteval --help` + +Lighteval can be used with a few different commands. - `lighteval accelerate` : evaluate models on CPU or one or more GPUs using [🤗 Accelerate](https://github.com/huggingface/accelerate) - `lighteval nanotron`: evaluate models in distributed settings using [⚡️ Nanotron](https://github.com/huggingface/nanotron) +- `lighteval vllm`: evaluate models on one or more GPUs using [🚀 + VLLM](https://github.com/vllm-project/vllm) +- `lighteval endpoint` + - `inference-endpoint`: evaluate models on one or more GPUs using [🔗 + Inference Endpoint](https://huggingface.co/inference-endpoints/dedicated) + - `tgi`: evaluate models on one or more GPUs using [🔗 Text Generation Inference](https://huggingface.co/docs/text-generation-inference/en/index) + - `openai`: evaluate models on one or more GPUs using [🔗 OpenAI API](https://platform.openai.com/) ## Accelerate @@ -15,10 +28,8 @@ To evaluate `GPT-2` on the Truthful QA benchmark, run: ```bash lighteval accelerate \ - --model_args "pretrained=gpt2" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --override_batch_size 1 \ - --output_dir="./evals/" + "pretrained=gpt2" \ + "leaderboard|truthfulqa:mc|0|0" ``` Here, `--tasks` refers to either a comma-separated list of supported tasks from @@ -51,10 +62,8 @@ You can then evaluate a model using data parallelism on 8 GPUs like follows: ```bash accelerate launch --multi_gpu --num_processes=8 -m \ lighteval accelerate \ - --model_args "pretrained=gpt2" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --override_batch_size 1 \ - --output_dir="./evals/" + "pretrained=gpt2" \ + "leaderboard|truthfulqa:mc|0|0" ``` Here, `--override_batch_size` defines the batch size per device, so the effective @@ -66,10 +75,8 @@ To evaluate a model using pipeline parallelism on 2 or more GPUs, run: ```bash lighteval accelerate \ - --model_args "pretrained=gpt2,model_parallel=True" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --override_batch_size 1 \ - --output_dir="./evals/" + "pretrained=gpt2,model_parallel=True" \ + "leaderboard|truthfulqa:mc|0|0" ``` This will automatically use accelerate to distribute the model across the GPUs. @@ -81,7 +88,7 @@ GPUs. ### Model Arguments -The `--model_args` argument takes a string representing a list of model +The `model-args` argument takes a string representing a list of model argument. The arguments allowed vary depending on the backend you use (vllm or accelerate). @@ -150,8 +157,8 @@ To evaluate a model trained with nanotron on a single gpu. 
```bash torchrun --standalone --nnodes=1 --nproc-per-node=1 \ src/lighteval/__main__.py nanotron \ - --checkpoint_config_path ../nanotron/checkpoints/10/config.yaml \ - --lighteval_config_path examples/nanotron/lighteval_config_override_template.yaml + --checkpoint-config-path ../nanotron/checkpoints/10/config.yaml \ + --lighteval-config-path examples/nanotron/lighteval_config_override_template.yaml ``` The `nproc-per-node` argument should match the data, tensor and pipeline diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx index b50cdee6..8c347cee 100644 --- a/docs/source/saving-and-reading-results.mdx +++ b/docs/source/saving-and-reading-results.mdx @@ -3,30 +3,32 @@ ## Saving results locally Lighteval will automatically save results and evaluation details in the -directory set with the `--output_dir` argument. The results will be saved in +directory set with the `--output-dir` option. The results will be saved in `{output_dir}/results/{model_name}/results_{timestamp}.json`. [Here is an example of a result file](#example-of-a-result-file). The output path can be any [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html) compliant path (local, s3, hf hub, gdrive, ftp, etc). -To save the details of the evaluation, you can use the `--save_details` -argument. The details will be saved in a parquet file +To save the details of the evaluation, you can use the `--save-details` +option. The details will be saved in a parquet file `{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet`. ## Pushing results to the HuggingFace hub You can push the results and evaluation details to the HuggingFace hub. To do -so, you need to set the `--push_to_hub` as well as the `--results_org` -argument. The results will be saved in a dataset with the name at +so, you need to set the `--push-to-hub` as well as the `--results-org` +option. The results will be saved in a dataset with the name at `{results_org}/{model_org}/{model_name}`. To push the details, you need to set -the `--save_details` argument. +the `--save-details` option. The dataset created will be private by default, you can make it public by -setting the `--public_run` argument. +setting the `--public-run` option. ## Pushing results to Tensorboard -You can push the results to Tensorboard by setting `--push_to_tensorboard`. +You can push the results to Tensorboard by setting `--push-to-tensorboard`. +This will create a Tensorboard dashboard in a HF org set with the `--results-org` +option. ## How to load and investigate details diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx index 153ff659..787848c3 100644 --- a/docs/source/use-vllm-as-backend.mdx +++ b/docs/source/use-vllm-as-backend.mdx @@ -4,10 +4,9 @@ Lighteval allows you to use `vllm` as backend allowing great speedups. To use, simply change the `model_args` to reflect the arguments you want to pass to vllm. ```bash -lighteval accelerate \ - --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --output_dir="./evals/" +lighteval vllm \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \ + "leaderboard|truthfulqa:mc|0|0" ``` `vllm` is able to distribute the model across multiple GPUs using data @@ -17,19 +16,17 @@ You can choose the parallelism method by setting in the the `model_args`. 
For example if you have 4 GPUs you can split it across using `tensor_parallelism`: ```bash -export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval accelerate \ - --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --output_dir="./evals/" +export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval vllm \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4" \ + "leaderboard|truthfulqa:mc|0|0" ``` Or, if your model fits on a single GPU, you can use `data_parallelism` to speed up the evaluation: ```bash -lighteval accelerate \ - --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --output_dir="./evals/" +lighteval vllm \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4" \ + "leaderboard|truthfulqa:mc|0|0" ``` Available arguments for `vllm` can be found in the `VLLMModelConfig`: @@ -50,4 +47,3 @@ Available arguments for `vllm` can be found in the `VLLMModelConfig`: > [!WARNING] > In the case of OOM issues, you might need to reduce the context size of the > model as well as reduce the `gpu_memory_utilisation` parameter. - diff --git a/examples/model_configs/base_model.yaml b/examples/model_configs/base_model.yaml index 925d3e50..d6563e61 100644 --- a/examples/model_configs/base_model.yaml +++ b/examples/model_configs/base_model.yaml @@ -1,5 +1,4 @@ model: - type: "base" # can be base, tgi, or endpoint base_params: model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... dtype: "bfloat16" diff --git a/examples/model_configs/endpoint_model.yaml b/examples/model_configs/endpoint_model.yaml index 4bf2f060..c3f5222b 100644 --- a/examples/model_configs/endpoint_model.yaml +++ b/examples/model_configs/endpoint_model.yaml @@ -1,5 +1,4 @@ model: - type: "endpoint" # can be base, tgi, or endpoint base_params: endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters model: "meta-llama/Llama-2-7b-hf" diff --git a/examples/model_configs/peft_model.yaml b/examples/model_configs/peft_model.yaml index d94ff610..81205818 100644 --- a/examples/model_configs/peft_model.yaml +++ b/examples/model_configs/peft_model.yaml @@ -1,5 +1,4 @@ model: - type: "base" base_params: model_args: "pretrained=predibase/customer_support,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... For a PEFT model, the pretrained model should be the one trained with PEFT and the base model below will contain the original model on which the adapters will be applied. dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. diff --git a/examples/model_configs/quantized_model.yaml b/examples/model_configs/quantized_model.yaml index dfac1c95..3bc6b2c3 100644 --- a/examples/model_configs/quantized_model.yaml +++ b/examples/model_configs/quantized_model.yaml @@ -1,5 +1,4 @@ model: - type: "base" base_params: model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. 
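The `dtype: "4bit"` / `"8bit"` comments in the YAML examples above map onto a `BitsAndBytesConfig` in the new `accelerate` entry point further down in this diff. Here is a minimal standalone sketch of that mapping, where the `config` dict stands in for the parsed `model` section of one of these YAML files:

```python
# Sketch: how the YAML dtype field is turned into a quantization config,
# mirroring the logic in src/lighteval/main_accelerate.py shown later in this diff.
import torch
from transformers import BitsAndBytesConfig

config = {"base_params": {"dtype": "4bit"}}  # stand-in for yaml.safe_load(f)["model"]

dtype = config["base_params"]["dtype"]
if dtype == "4bit":
    # 4-bit weights with float16 compute
    quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
elif dtype == "8bit":
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
else:
    quantization_config = None  # any other dtype is used as-is, without quantization

print(quantization_config)
```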
diff --git a/examples/model_configs/tgi_model.yaml b/examples/model_configs/tgi_model.yaml index 82ac50a7..8db5654d 100644 --- a/examples/model_configs/tgi_model.yaml +++ b/examples/model_configs/tgi_model.yaml @@ -1,5 +1,4 @@ model: - type: "tgi" # can be base, tgi, or endpoint instance: inference_server_address: "" inference_server_auth: null diff --git a/examples/nanotron/lighteval_config_override_template.yaml b/examples/nanotron/lighteval_config_override_template.yaml index 03b65596..50886ced 100644 --- a/examples/nanotron/lighteval_config_override_template.yaml +++ b/examples/nanotron/lighteval_config_override_template.yaml @@ -4,9 +4,7 @@ generation: null logging: output_dir: "outputs" save_details: false - push_results_to_hub: false - push_details_to_hub: false - push_results_to_tensorboard: false + push_to_hub: false public_run: false results_org: null tensorboard_metric_prefix: "eval" diff --git a/pyproject.toml b/pyproject.toml index 7d17c8d1..1a99b6a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ dependencies = [ "datasets>=2.14.0", "numpy<2", # pinned to avoid incompatibilities # Prettiness + "typer", "termcolor==2.3.0", "pytablewriter", "colorama", @@ -114,4 +115,4 @@ Issues = "https://github.com/huggingface/lighteval/issues" # Changelog = "https://github.com/huggingface/lighteval/blob/master/CHANGELOG.md" [project.scripts] -lighteval = "lighteval.__main__:cli_evaluate" +lighteval = "lighteval.__main__:app" diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py index b0164b2d..c715723d 100644 --- a/src/lighteval/__main__.py +++ b/src/lighteval/__main__.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - # MIT License # Copyright (c) 2024 Taratra D. RAHARISON and The HuggingFace Team @@ -22,81 +20,36 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import typer -import argparse -import os -from dataclasses import asdict -from pprint import pformat - -from lighteval.parsers import parser_accelerate, parser_baseline, parser_nanotron, parser_utils_tasks -from lighteval.tasks.registry import Registry, taskinfo_selector - - -CACHE_DIR = os.getenv("HF_HOME") - - -def cli_evaluate(): # noqa: C901 - parser = argparse.ArgumentParser(description="CLI tool for lighteval, a lightweight framework for LLM evaluation") - subparsers = parser.add_subparsers(help="help for subcommand", dest="subcommand") - - # Subparser for the "accelerate" command - parser_a = subparsers.add_parser("accelerate", help="use accelerate and transformers as backend for evaluation.") - parser_accelerate(parser_a) - - # Subparser for the "nanotron" command - parser_b = subparsers.add_parser("nanotron", help="use nanotron as backend for evaluation.") - parser_nanotron(parser_b) - - parser_c = subparsers.add_parser("baseline", help="compute baseline for a task") - parser_baseline(parser_c) - - # Subparser for task utils functions - parser_d = subparsers.add_parser("tasks", help="display information about available tasks and samples.") - parser_utils_tasks(parser_d) - - args = parser.parse_args() - - if args.subcommand == "accelerate": - from lighteval.main_accelerate import main as main_accelerate - - main_accelerate(args) - - elif args.subcommand == "nanotron": - from lighteval.main_nanotron import main as main_nanotron - - main_nanotron(args.checkpoint_config_path, args.lighteval_config_path, args.cache_dir) +import lighteval.main_accelerate +import lighteval.main_baseline +import lighteval.main_endpoint +import lighteval.main_nanotron +import lighteval.main_tasks +import lighteval.main_vllm - elif args.subcommand == "baseline": - from lighteval.main_baseline import main as main_baseline - main_baseline(args) +app = typer.Typer() - elif args.subcommand == "tasks": - registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks) - if args.list: - registry.print_all_tasks() - if args.inspect: - print(f"Loading the tasks dataset to cache folder: {args.cache_dir}") - print( - "All examples will be displayed without few shot, as few shot sample construction requires loading a model and using its tokenizer. " - ) - # Loading task - task_names_list, _ = taskinfo_selector(args.inspect, task_registry=registry) - task_dict = registry.get_task_dict(task_names_list) - for name, task in task_dict.items(): - print("-" * 10, name, "-" * 10) - if args.show_config: - print("-" * 10, "CONFIG") - task.cfg.print() - for ix, sample in enumerate(task.eval_docs()[: int(args.num_samples)]): - if ix == 0: - print("-" * 10, "SAMPLES") - print(f"-- sample {ix} --") - print(pformat(asdict(sample), indent=1)) - else: - print("You did not provide any argument. 
Exiting") +app.command(rich_help_panel="Evaluation Backends")(lighteval.main_accelerate.accelerate) +app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline) +app.command(rich_help_panel="Evaluation Backends")(lighteval.main_nanotron.nanotron) +app.command(rich_help_panel="Evaluation Backends")(lighteval.main_vllm.vllm) +app.add_typer( + lighteval.main_endpoint.app, + name="endpoint", + rich_help_panel="Evaluation Backends", + help="Evaluate models using some endpoint (tgi, inference endpoint, openai) as backend.", +) +app.add_typer( + lighteval.main_tasks.app, + name="tasks", + rich_help_panel="Utils", + help="List or inspect tasks.", +) if __name__ == "__main__": - cli_evaluate() + app() diff --git a/src/lighteval/logging/hierarchical_logger.py b/src/lighteval/logging/hierarchical_logger.py index 1c4c3a11..ac8d59d8 100644 --- a/src/lighteval/logging/hierarchical_logger.py +++ b/src/lighteval/logging/hierarchical_logger.py @@ -26,24 +26,10 @@ from logging import Logger from typing import Any, Callable -from lighteval.utils.imports import is_accelerate_available, is_nanotron_available - - -if is_nanotron_available(): - from nanotron.logging import get_logger - - logger = get_logger(__name__, log_level="INFO") -elif is_accelerate_available(): - from accelerate import Accelerator, InitProcessGroupKwargs - from accelerate.logging import get_logger +from colorama import Fore, Style - # We must init the accelerator before using the logger - accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) - logger = get_logger(__name__, log_level="INFO") -else: - logger = Logger(__name__, level="INFO") -from colorama import Fore, Style +logger = Logger(__name__, level="INFO") class HierarchicalLogger: diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index 67396be8..f6ed6b38 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -20,63 +20,173 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging import os -from datetime import timedelta +from typing import Optional -from lighteval.logging.evaluation_tracker import EvaluationTracker -from lighteval.logging.hierarchical_logger import hlog_warn, htrack -from lighteval.models.model_config import create_model_config -from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters -from lighteval.utils.imports import is_accelerate_available, is_tgi_available +from typer import Argument, Option +from typing_extensions import Annotated -if not is_accelerate_available() and not is_tgi_available(): - hlog_warn("Using either accelerate or text-generation to run this script is advised.") +logger = logging.getLogger(__name__) TOKEN = os.getenv("HF_TOKEN") - -if is_accelerate_available(): +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") + +HELP_PANNEL_NAME_1 = "Common Paramaters" +HELP_PANNEL_NAME_2 = "Logging Parameters" +HELP_PANNEL_NAME_3 = "Debug Paramaters" +HELP_PANNEL_NAME_4 = "Modeling Paramaters" + + +def accelerate( # noqa C901 + # === general === + model_args: Annotated[ + str, + Argument( + help="Model arguments in the form key1=value1,key2=value2,... 
or path to yaml config file (see examples/model_configs/base_model.yaml)" + ), + ], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + use_chat_template: Annotated[ + bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = False, + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + cache_dir: Annotated[ + Optional[str], Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, + override_batch_size: Annotated[ + int, Option(help="Override batch size for evaluation.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = -1, + job_id: Annotated[ + int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = 0, +): + """ + Evaluate models using accelerate and transformers as backend. 
+ """ + from datetime import timedelta + + import torch + import yaml from accelerate import Accelerator, InitProcessGroupKwargs - accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) -else: - accelerator = None + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_config import AdapterModelConfig, BaseModelConfig, BitsAndBytesConfig, DeltaModelConfig + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) + cache_dir = CACHE_DIR -@htrack() -def main(args): - env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir) + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) evaluation_tracker = EvaluationTracker( - output_dir=args.output_dir, - save_details=args.save_details, - push_to_hub=args.push_to_hub, - push_to_tensorboard=args.push_to_tensorboard, - public=args.public_run, - hub_results_org=args.results_org, + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, ) pipeline_params = PipelineParameters( launcher_type=ParallelismManager.ACCELERATE, env_config=env_config, - job_id=args.job_id, - dataset_loading_processes=args.dataset_loading_processes, - custom_tasks_directory=args.custom_tasks, - override_batch_size=args.override_batch_size, - num_fewshot_seeds=args.num_fewshot_seeds, - max_samples=args.max_samples, - use_chat_template=args.use_chat_template, - system_prompt=args.system_prompt, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=override_batch_size, + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=use_chat_template, + system_prompt=system_prompt, ) - model_config = create_model_config( - use_chat_template=args.use_chat_template, - override_batch_size=args.override_batch_size, - model_args=args.model_args, - model_config_path=args.model_config_path, - accelerator=accelerator, - ) + # TODO (nathan): better handling of model_args + if model_args.endswith(".yaml"): + with open(model_args, "r") as f: + config = yaml.safe_load(f)["model"] + + # Creating optional quantization configuration + if config["base_params"]["dtype"] == "4bit": + quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) + elif config["base_params"]["dtype"] == "8bit": + quantization_config = BitsAndBytesConfig(load_in_8bit=True) + else: + quantization_config = None + + # We extract the model args + args_dict = {k.split("=")[0]: k.split("=")[1] for k in config["base_params"]["model_args"].split(",")} + + # We store the relevant other args + args_dict["base_model"] = config["merged_weights"]["base_model"] + args_dict["compile"] = bool(config["base_params"]["compile"]) + args_dict["dtype"] = config["base_params"]["dtype"] + args_dict["accelerator"] = accelerator + args_dict["quantization_config"] = quantization_config + args_dict["batch_size"] = override_batch_size + args_dict["multichoice_continuations_start_space"] = config["generation"][ + "multichoice_continuations_start_space" + ] + args_dict["use_chat_template"] = use_chat_template + + # Keeping only non null params + args_dict = {k: v for k, v in args_dict.items() if v is not None} + + if config["merged_weights"]["delta_weights"]: + if 
config["merged_weights"]["base_model"] is None: + raise ValueError("You need to specify a base model when using delta weights") + model_config = DeltaModelConfig(**args_dict) + elif config["merged_weights"]["adapter_weights"]: + if config["merged_weights"]["base_model"] is None: + raise ValueError("You need to specify a base model when using adapter weights") + model_config = AdapterModelConfig(**args_dict) + elif config["merged_weights"]["base_model"] not in ["", None]: + raise ValueError("You can't specify a base model if you are not using delta/adapter weights") + else: + model_config = BaseModelConfig(**args_dict) + else: + model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} + model_args_dict["accelerator"] = accelerator + model_args_dict["use_chat_template"] = use_chat_template + model_args_dict["compile"] = bool(model_args_dict["compile"]) if "compile" in model_args_dict else False + model_config = BaseModelConfig(**model_args_dict) pipeline = Pipeline( - tasks=args.tasks, + tasks=tasks, pipeline_parameters=pipeline_params, evaluation_tracker=evaluation_tracker, model_config=model_config, diff --git a/src/lighteval/main_baseline.py b/src/lighteval/main_baseline.py index f824d94f..dd478667 100644 --- a/src/lighteval/main_baseline.py +++ b/src/lighteval/main_baseline.py @@ -20,33 +20,66 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from lighteval.logging.evaluation_tracker import EvaluationTracker -from lighteval.metrics.utils.metric_utils import MetricCategory -from lighteval.models.abstract_model import ModelInfo -from lighteval.tasks.lighteval_task import LightevalTask -from lighteval.tasks.registry import Registry, taskinfo_selector -from lighteval.utils.utils import as_list - -def main(args): +import os +from typing import Optional + +from typer import Argument, Option +from typing_extensions import Annotated + + +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") + +HELP_PANNEL_NAME_1 = "Common Paramaters" +HELP_PANNEL_NAME_2 = "Logging Parameters" +HELP_PANNEL_NAME_3 = "Debug Paramaters" +HELP_PANNEL_NAME_4 = "Modeling Paramaters" + + +def baseline( + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = CACHE_DIR, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, +): """ Compute baselines for given tasks. It has been tested with generative and accuracy tasks, but may not work correctly for other task types. The baseline is computed as follows: + - For multiple-choice tasks: It assumes random guessing, so the score is n_correct/number_of_choices. - For other metrics: It assigns a score of 0, which may not be appropriate for all task types. Note: This baseline computation may not be suitable for all task types and should be used with caution. 
""" - task_registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks) - task_names_list, fewshots_dict = taskinfo_selector(args.tasks, task_registry) + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.metrics.utils.metric_utils import MetricCategory + from lighteval.models.abstract_model import ModelInfo + from lighteval.tasks.lighteval_task import LightevalTask + from lighteval.tasks.registry import Registry, taskinfo_selector + from lighteval.utils.utils import as_list + + task_registry = Registry(cache_dir=cache_dir, custom_tasks=custom_tasks) + task_names_list, fewshots_dict = taskinfo_selector(tasks, task_registry) task_dict = task_registry.get_task_dict(task_names_list) evaluation_tracker = EvaluationTracker( - output_dir=args.output_dir, + output_dir=output_dir, save_details=False, push_to_hub=False, push_to_tensorboard=False, @@ -63,11 +96,11 @@ def main(args): ) evaluation_tracker.task_config_logger.log(task_dict) - LightevalTask.load_datasets(list(task_dict.values()), args.dataset_loading_processes) + LightevalTask.load_datasets(list(task_dict.values()), dataset_loading_processes) for task_name, task in task_dict.items(): task_docs = list(task.eval_docs()) - n_samples = min(args.max_samples, len(task_docs)) if args.max_samples else len(task_docs) + n_samples = min(max_samples, len(task_docs)) if max_samples else len(task_docs) p_correct_score = [ len(as_list(task_doc.gold_index)) / len(task_doc.choices) for task_doc in task_docs[:n_samples] diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py new file mode 100644 index 00000000..877be2df --- /dev/null +++ b/src/lighteval/main_endpoint.py @@ -0,0 +1,397 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+import os +from typing import Optional + +import typer +from typer import Argument, Option +from typing_extensions import Annotated + + +app = typer.Typer() + + +TOKEN = os.getenv("HF_TOKEN") +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") + +HELP_PANNEL_NAME_1 = "Common Paramaters" +HELP_PANNEL_NAME_2 = "Logging Parameters" +HELP_PANNEL_NAME_3 = "Debug Paramaters" +HELP_PANNEL_NAME_4 = "Modeling Paramaters" + + +@app.command(rich_help_panel="Evaluation Backends") +def openai( + # === general === + model_name: Annotated[ + str, Argument(help="The model name to evaluate (has to be available through the openai API.") + ], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = CACHE_DIR, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, + job_id: Annotated[ + int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = 0, +): + """ + Evaluate OPENAI models. 
+ """ + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_config import OpenAIModelConfig + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, + ) + + parallelism_manager = ParallelismManager.OPENAI + model_config = OpenAIModelConfig(model=model_name) + + pipeline_params = PipelineParameters( + launcher_type=parallelism_manager, + env_config=env_config, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=-1, # Cannot override batch size when using OpenAI + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=False, # Cannot use chat template when using OpenAI + system_prompt=system_prompt, + ) + pipeline = Pipeline( + tasks=tasks, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config, + ) + + pipeline.evaluate() + + pipeline.show_results() + + results = pipeline.get_results() + + pipeline.save_and_push_results() + + return results + + +@app.command(rich_help_panel="Evaluation Backends") +def inference_endpoint( + # === general === + model_config_path: Annotated[ + str, Argument(help="Path to model config yaml file. (examples/model_configs/endpoint_model.yaml)") + ], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + use_chat_template: Annotated[ + bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = False, + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = CACHE_DIR, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to 
evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, + override_batch_size: Annotated[ + int, Option(help="Override batch size for evaluation.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = -1, + job_id: Annotated[ + int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = 0, +): + """ + Evaluate models using inference-endpoints as backend. + """ + import yaml + + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_config import ( + InferenceEndpointModelConfig, + InferenceModelConfig, + ) + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, + ) + + # TODO (nathan): better handling of model_args + + parallelism_manager = ParallelismManager.TGI + + with open(model_config_path, "r") as f: + config = yaml.safe_load(f)["model"] + + reuse_existing_endpoint = config["base_params"].get("reuse_existing", None) + + complete_config_endpoint = all( + val not in [None, ""] + for key, val in config.get("instance", {}).items() + if key not in InferenceEndpointModelConfig.nullable_keys() + ) + + if reuse_existing_endpoint or complete_config_endpoint: + model_config = InferenceEndpointModelConfig( + name=config["base_params"]["endpoint_name"].replace(".", "-").lower(), + repository=config["base_params"]["model"], + model_dtype=config["base_params"]["dtype"], + revision=config["base_params"]["revision"] or "main", + should_reuse_existing=reuse_existing_endpoint, + accelerator=config["instance"]["accelerator"], + region=config["instance"]["region"], + vendor=config["instance"]["vendor"], + instance_size=config["instance"]["instance_size"], + instance_type=config["instance"]["instance_type"], + namespace=config["instance"]["namespace"], + image_url=config["instance"].get("image_url", None), + env_vars=config["instance"].get("env_vars", None), + ) + else: + model_config = InferenceModelConfig(model=config["base_params"]["endpoint_name"]) + + pipeline_params = PipelineParameters( + launcher_type=parallelism_manager, + env_config=env_config, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=override_batch_size, + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=use_chat_template, + system_prompt=system_prompt, + ) + pipeline = Pipeline( + tasks=tasks, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config, + ) + + pipeline.evaluate() + + pipeline.show_results() + + results = pipeline.get_results() + + pipeline.save_and_push_results() + + return results + + +@app.command(rich_help_panel="Evaluation Backends") +def tgi( + # === general === + model_config_path: Annotated[ + str, Argument(help="Path to model config yaml file. 
(examples/model_configs/tgi_model.yaml)") + ], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + use_chat_template: Annotated[ + bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = False, + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = CACHE_DIR, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, + override_batch_size: Annotated[ + int, Option(help="Override batch size for evaluation.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = -1, + job_id: Annotated[ + int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = 0, +): + """ + Evaluate models using TGI as backend. 
+ """ + import yaml + + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_config import TGIModelConfig + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, + ) + + # TODO (nathan): better handling of model_args + parallelism_manager = ParallelismManager.TGI + with open(model_config_path, "r") as f: + config = yaml.safe_load(f)["model"] + + model_config = TGIModelConfig( + inference_server_address=config["instance"]["inference_server_address"], + inference_server_auth=config["instance"]["inference_server_auth"], + model_id=config["instance"]["model_id"], + ) + + pipeline_params = PipelineParameters( + launcher_type=parallelism_manager, + env_config=env_config, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=override_batch_size, + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=use_chat_template, + system_prompt=system_prompt, + ) + pipeline = Pipeline( + tasks=tasks, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config, + ) + + pipeline.evaluate() + + pipeline.show_results() + + results = pipeline.get_results() + + pipeline.save_and_push_results() + + return results diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index a00ef884..66826122 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -22,33 +22,46 @@ # flake8: noqa: C901 import os -from typing import Optional -from lighteval.config.lighteval_config import FullNanotronConfig, LightEvalConfig -from lighteval.logging.evaluation_tracker import EvaluationTracker -from lighteval.logging.hierarchical_logger import htrack, htrack_block -from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters -from lighteval.utils.imports import NO_NANOTRON_ERROR_MSG, is_nanotron_available -from lighteval.utils.utils import EnvConfig +from typer import Option +from typing_extensions import Annotated -if not is_nanotron_available(): - raise ImportError(NO_NANOTRON_ERROR_MSG) +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") -from nanotron.config import Config, get_config_from_file +HELP_PANNEL_NAME_1 = "Common Paramaters" +HELP_PANNEL_NAME_2 = "Logging Parameters" +HELP_PANNEL_NAME_3 = "Debug Paramaters" +HELP_PANNEL_NAME_4 = "Modeling Paramaters" SEED = 1234 -@htrack() -def main( - checkpoint_config_path: str, - lighteval_config_path: Optional[str] = None, - cache_dir: Optional[str] = os.getenv("HF_HOME", "/scratch"), +def nanotron( + checkpoint_config_path: Annotated[ + str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.") + ], + lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")], + cache_dir: Annotated[str, Option(help="Cache directory for datasets and models.")] = CACHE_DIR, ): + """ + Evaluate models using nanotron as backend. 
+ """ + from nanotron.config import Config, get_config_from_file + + from lighteval.config.lighteval_config import FullNanotronConfig, LightEvalConfig + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.logging.hierarchical_logger import htrack_block + from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters + from lighteval.utils.imports import NO_NANOTRON_ERROR_MSG, is_nanotron_available + from lighteval.utils.utils import EnvConfig + env_config = EnvConfig(token=os.getenv("HF_TOKEN"), cache_dir=cache_dir) + if not is_nanotron_available(): + raise ImportError(NO_NANOTRON_ERROR_MSG) + with htrack_block("Load nanotron config"): # Create nanotron config if not checkpoint_config_path.endswith(".yaml"): diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py new file mode 100644 index 00000000..66834798 --- /dev/null +++ b/src/lighteval/main_tasks.py @@ -0,0 +1,77 @@ +# MIT License + +# Copyright (c) 2024 Taratra D. RAHARISON and The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+import os +from typing import Optional + +import typer +from typer import Argument, Option +from typing_extensions import Annotated + + +app = typer.Typer() +CACHE_DIR = os.getenv("HF_HOME") + + +@app.command() +def inspect( + tasks: Annotated[str, Argument(help="Id of tasks or path to a text file with a list of tasks")], + custom_tasks: Annotated[Optional[str], Option(help="Path to a file with custom tasks")] = None, + num_samples: Annotated[int, Option(help="Number of samples to display")] = 10, + show_config: Annotated[bool, Option(help="Will display the full task config")] = False, + cache_dir: Annotated[Optional[str], Option(help="Cache directory used to store datasets and models")] = CACHE_DIR, +): + """ + Inspect a tasks + """ + from dataclasses import asdict + from pprint import pformat + + from rich import print + + from lighteval.tasks.registry import Registry, taskinfo_selector + + registry = Registry(cache_dir=cache_dir, custom_tasks=custom_tasks) + + # Loading task + task_names_list, _ = taskinfo_selector(tasks, task_registry=registry) + task_dict = registry.get_task_dict(task_names_list) + for name, task in task_dict.items(): + print("-" * 10, name, "-" * 10) + if show_config: + print("-" * 10, "CONFIG") + task.cfg.print() + for ix, sample in enumerate(task.eval_docs()[: int(num_samples)]): + if ix == 0: + print("-" * 10, "SAMPLES") + print(f"-- sample {ix} --") + print(pformat(asdict(sample), indent=2)) + + +@app.command() +def list(): + """ + List all tasks + """ + from lighteval.tasks.registry import Registry + + registry = Registry(cache_dir=CACHE_DIR) + registry.print_all_tasks() diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py new file mode 100644 index 00000000..4bd1681d --- /dev/null +++ b/src/lighteval/main_vllm.py @@ -0,0 +1,139 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+import os +from typing import Optional + +from typer import Argument, Option +from typing_extensions import Annotated + + +TOKEN = os.getenv("HF_TOKEN") +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") + +HELP_PANNEL_NAME_1 = "Common Paramaters" +HELP_PANNEL_NAME_2 = "Logging Parameters" +HELP_PANNEL_NAME_3 = "Debug Paramaters" +HELP_PANNEL_NAME_4 = "Modeling Paramaters" + + +def vllm( + # === general === + model_args: Annotated[str, Argument(help="Model arguments in the form key1=value1,key2=value2,...")], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + use_chat_template: Annotated[ + bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = False, + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = CACHE_DIR, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, + job_id: Annotated[ + int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = 0, +): + """ + Evaluate models using vllm as backend. 
+ """ + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_config import VLLMModelConfig + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + + TOKEN = os.getenv("HF_TOKEN") + + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, + ) + + pipeline_params = PipelineParameters( + launcher_type=ParallelismManager.VLLM, + env_config=env_config, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=-1, # Cannot override batch size when using VLLM + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=use_chat_template, + system_prompt=system_prompt, + ) + + model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} + model_config = VLLMModelConfig(**model_args_dict) + + pipeline = Pipeline( + tasks=tasks, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config, + ) + + pipeline.evaluate() + + pipeline.show_results() + + results = pipeline.get_results() + + pipeline.save_and_push_results() + + return results diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index a315d892..2afaead1 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -24,7 +24,6 @@ from typing import Dict, Optional, Union import torch -import yaml from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig, PretrainedConfig from lighteval.logging.hierarchical_logger import hlog @@ -118,6 +117,17 @@ class BaseModelConfig: def __post_init__(self): # Making sure this parameter is a boolean self.multichoice_continuations_start_space = boolstring_to_bool(self.multichoice_continuations_start_space) + + if self.multichoice_continuations_start_space is not None: + if self.multichoice_continuations_start_space: + hlog( + "You set `multichoice_continuations_start_space` to true. This will force multichoice continuations to use a starting space" + ) + else: + hlog( + "You set `multichoice_continuations_start_space` to false. This will remove a leading space from multichoice continuations, if present." + ) + self.model_parallel = boolstring_to_bool(self.model_parallel) self.compile = boolstring_to_bool(self.compile) @@ -297,155 +307,3 @@ def nullable_keys() -> list[str]: that are not required and can remain None. """ return ["namespace", "env_vars", "image_url"] - - -def create_model_config( # noqa: C901 - use_chat_template: bool, - override_batch_size: int, - accelerator: Union["Accelerator", None], - model_args: Union[str, dict] = None, - model_config_path: str = None, -) -> Union[ - BaseModelConfig, - AdapterModelConfig, - DeltaModelConfig, - TGIModelConfig, - InferenceEndpointModelConfig, - DummyModelConfig, - VLLMModelConfig, - OpenAIModelConfig, -]: - """ - Create a model configuration based on the provided arguments. - - Args: - accelerator(Union[Accelerator, None]): accelerator to use for model training. - use_chat_template (bool): whether to use the chat template or not. 
Set to True for chat or ift models - override_batch_size (int): frozen batch size to use - model_args (Optional[Union[str, dict]]): Parameters to create the model, passed as a string (like the CLI kwargs or dict). - This option only allows to create a dummy model using `dummy` or a base model (using accelerate or no accelerator), in - which case corresponding full model args available are the arguments of the [[BaseModelConfig]]. - Minimal configuration is `pretrained=`. - model_config_path (Optional[str]): Path to the parameters to create the model, passed as a config file. This allows to create - all possible model configurations (base, adapter, peft, inference endpoints, tgi...) - - Returns: - Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig, DummyModelConfig]: model configuration. - - Raises: - ValueError: If both an inference server address and model arguments are provided. - ValueError: If multichoice continuations both should start with a space and should not start with a space. - ValueError: If a base model is not specified when using delta weights or adapter weights. - ValueError: If a base model is specified when not using delta weights or adapter weights. - """ - if model_args is None and model_config_path is None: - raise ValueError("You can't create a model without either a list of model_args or a model_config_path.") - - if model_args: - if isinstance(model_args, str): - model_args = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} - - if model_args.pop("dummy", False): - return DummyModelConfig(**model_args) - - if model_args.pop("vllm", False): - return VLLMModelConfig(**model_args) - - if model_args.pop("openai", False): - return OpenAIModelConfig(**model_args) - - model_args["accelerator"] = accelerator - model_args["use_chat_template"] = use_chat_template - model_args["compile"] = bool(model_args["compile"]) if "compile" in model_args else False - - return BaseModelConfig(**model_args) - - with open(model_config_path, "r") as f: - config = yaml.safe_load(f)["model"] - - if config["type"] == "tgi": - return TGIModelConfig( - inference_server_address=config["instance"]["inference_server_address"], - inference_server_auth=config["instance"]["inference_server_auth"], - model_id=config["instance"]["model_id"], - ) - - if config["type"] == "endpoint": - reuse_existing_endpoint = config["base_params"].get("reuse_existing", None) - complete_config_endpoint = all( - val not in [None, ""] - for key, val in config.get("instance", {}).items() - if key not in InferenceEndpointModelConfig.nullable_keys() - ) - if reuse_existing_endpoint or complete_config_endpoint: - return InferenceEndpointModelConfig( - name=config["base_params"]["endpoint_name"].replace(".", "-").lower(), - repository=config["base_params"]["model"], - model_dtype=config["base_params"]["dtype"], - revision=config["base_params"]["revision"] or "main", - should_reuse_existing=reuse_existing_endpoint, - accelerator=config["instance"]["accelerator"], - region=config["instance"]["region"], - vendor=config["instance"]["vendor"], - instance_size=config["instance"]["instance_size"], - instance_type=config["instance"]["instance_type"], - namespace=config["instance"]["namespace"], - image_url=config["instance"].get("image_url", None), - env_vars=config["instance"].get("env_vars", None), - ) - return InferenceModelConfig(model=config["base_params"]["endpoint_name"]) - - if config["type"] == "base": - # Creating the multichoice space 
parameters - # We need to take into account possible conversion issues from our different input formats - multichoice_continuations_start_space = boolstring_to_bool( - config["generation"]["multichoice_continuations_start_space"] - ) - - if multichoice_continuations_start_space is not None: - if multichoice_continuations_start_space: - hlog( - "You set `multichoice_continuations_start_space` to true. This will force multichoice continuations to use a starting space" - ) - else: - hlog( - "You set `multichoice_continuations_start_space` to false. This will remove a leading space from multichoice continuations, if present." - ) - - # Creating optional quantization configuration - if config["base_params"]["dtype"] == "4bit": - quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) - elif config["base_params"]["dtype"] == "8bit": - quantization_config = BitsAndBytesConfig(load_in_8bit=True) - else: - quantization_config = None - - # We extract the model args - args_dict = {k.split("=")[0]: k.split("=")[1] for k in config["base_params"]["model_args"].split(",")} - - # We store the relevant other args - args_dict["base_model"] = config["merged_weights"]["base_model"] - args_dict["compile"] = bool(config["base_params"]["compile"]) - args_dict["dtype"] = config["base_params"]["dtype"] - args_dict["accelerator"] = accelerator - args_dict["quantization_config"] = quantization_config - args_dict["batch_size"] = override_batch_size - args_dict["multichoice_continuations_start_space"] = multichoice_continuations_start_space - args_dict["use_chat_template"] = use_chat_template - - # Keeping only non null params - args_dict = {k: v for k, v in args_dict.items() if v is not None} - - if config["merged_weights"]["delta_weights"]: - if config["merged_weights"]["base_model"] is None: - raise ValueError("You need to specify a base model when using delta weights") - return DeltaModelConfig(**args_dict) - if config["merged_weights"]["adapter_weights"]: - if config["merged_weights"]["base_model"] is None: - raise ValueError("You need to specify a base model when using adapter weights") - return AdapterModelConfig(**args_dict) - if config["merged_weights"]["base_model"] not in ["", None]: - raise ValueError("You can't specify a base model if you are not using delta/adapter weights") - return BaseModelConfig(**args_dict) - - raise ValueError(f"Unknown model type in your model config file: {config['type']}") diff --git a/src/lighteval/parsers.py b/src/lighteval/parsers.py deleted file mode 100644 index 3988fdbb..00000000 --- a/src/lighteval/parsers.py +++ /dev/null @@ -1,188 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import argparse -import os - - -TOKEN = os.getenv("HF_TOKEN") -CACHE_DIR = os.getenv("HF_HOME") - - -def parser_accelerate(parser=None): - if parser is None: - parser = argparse.ArgumentParser( - description="CLI tool for lighteval, a lightweight framework for LLM evaluation" - ) - - group = parser.add_mutually_exclusive_group(required=True) - task_type_group = parser.add_mutually_exclusive_group(required=True) - - # Model type: either use a config file or simply the model name - task_type_group.add_argument( - "--model_config_path", - type=str, - help="Path to the model config file, e.g. 'examples/model_configs/base_model.yaml'", - ) - task_type_group.add_argument( - "--model_args", - type=str, - help="Model arguments to pass to the model class, e.g. 'pretrained=gpt2,dtype=float16'", - ) - - # Debug - parser.add_argument("--max_samples", type=int, default=None, help="Maximum number of samples to evaluate on") - parser.add_argument("--override_batch_size", type=int, default=-1) - parser.add_argument("--job_id", type=str, help="Optional Job ID for future reference", default="") - - # Saving - parser.add_argument( - "--output_dir", - required=True, - type=str, - help="Directory to save the results, fsspec compliant (e.g. s3://bucket/path)", - ) - parser.add_argument("--save_details", action="store_true", help="Save the details of the run in the output_dir") - parser.add_argument("--push_to_hub", default=False, action="store_true", help="Set to push the details to the hub") - parser.add_argument("--push_to_tensorboard", default=False, action="store_true") - parser.add_argument( - "--public_run", default=False, action="store_true", help="Push results and details to a public repo" - ) - parser.add_argument( - "--results_org", - type=str, - help="Hub organisation where you want to store the results. Your current token must have write access to it", - default=None, - ) - # Common parameters - parser.add_argument( - "--use_chat_template", - default=False, - action="store_true", - help="Use the chat template (from the model's tokenizer) for the prompt", - ) - parser.add_argument( - "--system_prompt", type=str, default=None, help="System prompt to use, e.g. 'You are a helpful assistant.'" - ) - parser.add_argument( - "--dataset_loading_processes", type=int, default=1, help="Number of processes to use for loading the datasets" - ) - parser.add_argument( - "--custom_tasks", - type=str, - default=None, - help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)", - ) - group.add_argument( - "--tasks", - type=str, - default=None, - help="Id of a task, e.g. 
'original|mmlu:abstract_algebra|5' or path to a texte file with a list of tasks", - ) - parser.add_argument( - "--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models" - ) - parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of trials the few shots") - return parser - - -def parser_baseline(parser=None): - if parser is None: - parser = argparse.ArgumentParser( - description="CLI tool for lighteval, a lightweight framework for LLM evaluation" - ) - - parser.add_argument( - "--custom_tasks", - type=str, - default=None, - help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)", - ) - - parser.add_argument( - "--tasks", - type=str, - required=True, - help="Task to compute the baseline for", - ) - parser.add_argument("--max_samples", type=int, default=None, help="Maximum number of samples to evaluate on") - parser.add_argument( - "--dataset_loading_processes", type=int, default=1, help="Number of processes to use for loading the datasets" - ) - - parser.add_argument( - "--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models" - ) - # Ooutput related - parser.add_argument( - "--output_dir", - required=True, - type=str, - help="Directory to save the results, fsspec compliant (e.g. s3://bucket/path)", - ) - - return parser - - -def parser_nanotron(parser=None): - if parser is None: - parser = argparse.ArgumentParser( - description="CLI tool for lighteval, a lightweight framework for LLM evaluation" - ) - - parser.add_argument( - "--checkpoint_config_path", - type=str, - required=True, - help="Path to the nanotron checkpoint YAML or python config file, potentially on S3", - ) - parser.add_argument( - "--lighteval_config_path", - type=str, - help="Path to a YAML or python lighteval config to be used for the evaluation. Lighteval key in nanotron config is ignored!", - required=True, - ) - parser.add_argument( - "--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models" - ) - - -def parser_utils_tasks(parser=None): - if parser is None: - parser = argparse.ArgumentParser( - description="CLI tool for lighteval, a lightweight framework for LLM evaluation" - ) - - group = parser.add_mutually_exclusive_group(required=True) - - group.add_argument("--list", action="store_true", help="List available tasks") - group.add_argument( - "--inspect", - type=str, - default=None, - help="Id of tasks or path to a text file with a list of tasks (e.g. 
'original|mmlu:abstract_algebra|5') for which you want to manually inspect samples.", - ) - parser.add_argument("--custom_tasks", type=str, default=None, help="Path to a file with custom tasks") - parser.add_argument("--num_samples", type=int, default=10, help="Number of samples to display") - parser.add_argument("--show_config", default=False, action="store_true", help="Will display the full task config") - parser.add_argument( - "--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models" - ) diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index da4fb045..e429e519 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -42,10 +42,14 @@ from lighteval.utils.imports import ( NO_ACCELERATE_ERROR_MSG, NO_NANOTRON_ERROR_MSG, + NO_OPENAI_ERROR_MSG, NO_TGI_ERROR_MSG, + NO_VLLM_ERROR_MSG, is_accelerate_available, is_nanotron_available, + is_openai_available, is_tgi_available, + is_vllm_available, ) from lighteval.utils.parallelism import test_all_gather from lighteval.utils.utils import EnvConfig, make_results_table @@ -65,6 +69,8 @@ class ParallelismManager(Enum): ACCELERATE = auto() NANOTRON = auto() TGI = auto() + OPENAI = auto() + VLLM = auto() NONE = auto() @@ -85,16 +91,22 @@ class PipelineParameters: use_chat_template: bool = False system_prompt: str | None = None - def __post_init__(self): + def __post_init__(self): # noqa C901 if self.launcher_type == ParallelismManager.ACCELERATE: if not is_accelerate_available(): raise ImportError(NO_ACCELERATE_ERROR_MSG) + elif self.launcher_type == ParallelismManager.VLLM: + if not is_vllm_available(): + raise ImportError(NO_VLLM_ERROR_MSG) elif self.launcher_type == ParallelismManager.TGI: if not is_tgi_available(): raise ImportError(NO_TGI_ERROR_MSG) elif self.launcher_type == ParallelismManager.NANOTRON: if not is_nanotron_available(): raise ImportError(NO_NANOTRON_ERROR_MSG) + elif self.launcher_type == ParallelismManager.OPENAI: + if not is_openai_available(): + raise ImportError(NO_OPENAI_ERROR_MSG) class Pipeline: diff --git a/tests/test_main.py b/tests/test_main.py index 32211d00..d0c85b33 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -28,8 +28,7 @@ import pytest from pytest import approx -from lighteval.main_accelerate import main # noqa: E402 -from lighteval.parsers import parser_accelerate +from lighteval.main_accelerate import accelerate # noqa: E402 from tests.reference_scores.reference_task_scores import RESULTS_FULL, RESULTS_LITE # noqa: E402 from tests.reference_scores.reference_tasks import ALL_SUBSETS @@ -37,10 +36,6 @@ # Set env var for deterministic run of models os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" -# Set cache for github actions -os.environ["HF_DATASETS_CACHE"] = "cache/datasets/" -os.environ["HF_HOME"] = "cache/models/" - # To add new models or tasks, change here # ! 
The correct results must be present in reference_task_scores MODELS = ["gpt2"] @@ -53,39 +48,29 @@ @lru_cache(maxsize=len(MODELS)) def run_model_predictions_full(model: str, tasks: tuple): """Runs the full main as a black box, using the input model and tasks, on all samples without parallelism""" - lighteval_args = ["--model_args", f"pretrained={model}", "--tasks", ",".join(tasks)] - lighteval_args += [ - "--override_batch_size", - "1", - "--output_dir", - "", - "--dataset_loading_processes", - "1", - "--save_details", - ] - parser = parser_accelerate() - args = parser.parse_args(lighteval_args) - results = main(args) + results = accelerate( + model_args=f"pretrained={model}", + tasks=",".join(tasks), + override_batch_size=1, + output_dir="", + dataset_loading_processes=1, + save_details=True, + ) return results @lru_cache(maxsize=len(MODELS)) def run_model_predictions_lite(model: str, tasks: tuple): """Runs the full main as a black box, using the input model and tasks, on 10 samples without parallelism""" - lighteval_args = ["--model_args", f"pretrained={model}", "--tasks", ",".join(tasks)] - lighteval_args += [ - "--override_batch_size", - "1", - "--output_dir", - "", - "--dataset_loading_processes", - "1", - "--save_details", - ] - lighteval_args += ["--max_samples", "10"] - parser = parser_accelerate() - args = parser.parse_args(lighteval_args) - results = main(args) + results = accelerate( + model_args=f"pretrained={model}", + tasks=",".join(tasks), + override_batch_size=1, + output_dir="", + dataset_loading_processes=1, + save_details=True, + max_samples=10, + ) return results
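Note: the typer-based commands introduced in this diff are plain Python functions, so they can be driven programmatically in the same way the updated tests now call `accelerate(...)` directly. Below is a minimal, hypothetical sketch of the analogous call for the new `vllm` command. It assumes the function is importable as `lighteval.main_vllm.vllm` (the module path is not shown in this diff), that the vllm backend is installed, and that the model and task strings are placeholders only.

```python
# Hypothetical smoke test mirroring run_model_predictions_lite, but driving the
# new typer-based `vllm` command instead of `accelerate`.
# Assumption: the function defined above lives in `lighteval.main_vllm`.
from lighteval.main_vllm import vllm

results = vllm(
    model_args="pretrained=gpt2",           # split on "," and "=" into VLLMModelConfig kwargs
    tasks="leaderboard|truthfulqa:mc|0|0",  # suite|task|num_fewshot|truncate_few_shots
    output_dir="./evals",
    max_samples=10,                         # keep the run small
    save_details=True,
)
```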