
Commit

ParagEkbote committed Jan 11, 2025
2 parents 4f7ee64 + 6560c75 commit dda8266
Showing 28 changed files with 939 additions and 138 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
@@ -25,7 +25,7 @@ jobs:
cache: 'pip'
- name: Install lighteval in editable mode
run: |
-pip install -e .[dev,extended_tasks,multilingual]
+pip install -e .[dev,extended_tasks,multilingual,litellm]
- name: Get cached files
uses: actions/cache@v4
id: get-cache
18 changes: 9 additions & 9 deletions README.md
@@ -44,12 +44,12 @@ Hub, S3, or locally.

## 🔑 Key Features

-- **Speed**: [Use vllm as backend for fast evals](https://github.com/huggingface/lighteval/wiki/Use-VLLM-as-backend).
-- **Completeness**: [Use the accelerate backend to launch any models hosted on Hugging Face](https://github.com/huggingface/lighteval/wiki/Quicktour#accelerate).
-- **Seamless Storage**: [Save results in S3 or Hugging Face Datasets](https://github.com/huggingface/lighteval/wiki/Saving-and-reading-results).
-- **Python API**: [Simple integration with the Python API](https://github.com/huggingface/lighteval/wiki/Using-the-Python-API).
-- **Custom Tasks**: [Easily add custom tasks](https://github.com/huggingface/lighteval/wiki/Adding-a-Custom-Task).
-- **Versatility**: Tons of [metrics](https://github.com/huggingface/lighteval/wiki/Metric-List) and [tasks](https://github.com/huggingface/lighteval/wiki/Available-Tasks) ready to go.
+- **Speed**: [Use vllm as backend for fast evals](https://huggingface.co/docs/lighteval/use-vllm-as-backend).
+- **Completeness**: [Use the accelerate backend to launch any models hosted on Hugging Face](https://huggingface.co/docs/lighteval/quicktour#accelerate).
+- **Seamless Storage**: [Save results in S3 or Hugging Face Datasets](https://huggingface.co/docs/lighteval/saving-and-reading-results).
+- **Python API**: [Simple integration with the Python API](https://huggingface.co/docs/lighteval/using-the-python-api).
+- **Custom Tasks**: [Easily add custom tasks](https://huggingface.co/docs/lighteval/adding-a-custom-task).
+- **Versatility**: Tons of [metrics](https://huggingface.co/docs/lighteval/metric-list) and [tasks](https://huggingface.co/docs/lighteval/available-tasks) ready to go.


## ⚡️ Installation
@@ -58,7 +58,7 @@ Hub, S3, or locally.
pip install lighteval
```

-Lighteval allows for many extras when installing, see [here](https://github.com/huggingface/lighteval/wiki/Installation) for a complete list.
+Lighteval allows for many extras when installing, see [here](https://huggingface.co/docs/lighteval/installation) for a complete list.

If you want to push results to the Hugging Face Hub, add your access token as
an environment variable:
@@ -106,8 +106,8 @@ Harness and HELM teams for their pioneering work on LLM evaluations.
## 🌟 Contributions Welcome 💙💚💛💜🧡

Got ideas? Found a bug? Want to add a
-[task](https://github.com/huggingface/lighteval/wiki/Adding-a-Custom-Task) or
-[metric](https://github.com/huggingface/lighteval/wiki/Adding-a-New-Metric)?
+[task](https://huggingface.co/docs/lighteval/adding-a-custom-task) or
+[metric](https://huggingface.co/docs/lighteval/adding-a-new-metric)?
Contributions are warmly welcomed!

If you're adding a new feature, please open an issue first.
6 changes: 3 additions & 3 deletions docs/source/package_reference/models.mdx
@@ -6,9 +6,9 @@


## Accelerate and Transformers Models
-### BaseModel
-[[autodoc]] models.transformers.base_model.BaseModelConfig
-[[autodoc]] models.transformers.base_model.BaseModel
+### TransformersModel
+[[autodoc]] models.transformers.transformers_model.TransformersModelConfig
+[[autodoc]] models.transformers.transformers_model.TransformersModel

### AdapterModel
[[autodoc]] models.transformers.adapter_model.AdapterModelConfig
@@ -1,6 +1,6 @@
model:
base_params:
model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True. To see the full list of parameters, please click here: https://huggingface.co/docs/lighteval/main/en/quicktour#model-arguments
model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
dtype: "bfloat16"
compile: true
merged_weights: # Ignore this section if you are not using PEFT models
1 change: 1 addition & 0 deletions pyproject.toml
@@ -82,6 +82,7 @@ dependencies = [
]

[project.optional-dependencies]
litellm = ["litellm", "diskcache"]
tgi = ["text-generation==0.6.0"]
optimum = ["optimum==1.12.0"]
quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]
5 changes: 3 additions & 2 deletions src/lighteval/__main__.py
@@ -20,7 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import logging
-from logging.config import dictConfig
+import logging.config

import colorlog
import typer
@@ -57,7 +57,8 @@
},
)

-dictConfig(logging_config)
+logging.config.dictConfig(logging_config)
+logging.captureWarnings(capture=True)

app.command(rich_help_panel="Evaluation Backends")(lighteval.main_accelerate.accelerate)
app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline)
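As a side note, a small standalone sketch (not from the commit) of what the added `logging.captureWarnings(capture=True)` call buys: `warnings.warn(...)` messages are routed through the `py.warnings` logger, so they pick up the handlers and formatting configured via `dictConfig` instead of going straight to stderr:

```python
# Standalone illustration: with captureWarnings enabled, warnings flow through
# the logging system and use whatever handlers dictConfig has set up.
import logging.config
import warnings

logging.config.dictConfig({
    "version": 1,
    "handlers": {"console": {"class": "logging.StreamHandler"}},
    "root": {"level": "INFO", "handlers": ["console"]},
})
logging.captureWarnings(True)  # warnings are now emitted via the "py.warnings" logger
warnings.warn("this message is handled by the logging config above")
```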
11 changes: 9 additions & 2 deletions src/lighteval/data.py
@@ -25,8 +25,15 @@
from typing import Iterator, Tuple

import torch
+from packaging import version
from torch.utils.data import Dataset
-from torch.utils.data.distributed import DistributedSampler, T_co


+if version.parse(torch.__version__) >= version.parse("2.5.0"):
+    from torch.utils.data.distributed import DistributedSampler, _T_co
+else:
+    from torch.utils.data.distributed import DistributedSampler
+    from torch.utils.data.distributed import T_co as _T_co

from lighteval.tasks.requests import (
GreedyUntilRequest,
@@ -318,7 +325,7 @@ class GenDistributedSampler(DistributedSampler):
as our samples are sorted by length.
"""

-def __iter__(self) -> Iterator[T_co]:
+def __iter__(self) -> Iterator[_T_co]:
if self.shuffle:
# deterministically shuffle based on epoch and seed
g = torch.Generator()
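As an aside (not from the commit), the reason the guard above uses `packaging.version` rather than comparing `torch.__version__` as a plain string:

```python
# packaging.version compares versions semantically; plain string comparison
# would mis-order e.g. "2.10.0" against "2.5.0".
from packaging import version

print(version.parse("2.10.0") >= version.parse("2.5.0"))  # True
print("2.10.0" >= "2.5.0")                                # False -- lexicographic comparison is wrong here
```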
11 changes: 7 additions & 4 deletions src/lighteval/main_accelerate.py
@@ -44,7 +44,7 @@ def accelerate( # noqa C901
model_args: Annotated[
str,
Argument(
help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/base_model.yaml)"
help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)"
),
],
tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
@@ -107,9 +107,10 @@ def accelerate( # noqa C901
from accelerate import Accelerator, InitProcessGroupKwargs

from lighteval.logging.evaluation_tracker import EvaluationTracker
+from lighteval.models.model_input import GenerationParameters
from lighteval.models.transformers.adapter_model import AdapterModelConfig
-from lighteval.models.transformers.base_model import BaseModelConfig, BitsAndBytesConfig
from lighteval.models.transformers.delta_model import DeltaModelConfig
+from lighteval.models.transformers.transformers_model import BitsAndBytesConfig, TransformersModelConfig
from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters

accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
@@ -154,6 +155,8 @@ def accelerate( # noqa C901
# We extract the model args
args_dict = {k.split("=")[0]: k.split("=")[1] for k in config["base_params"]["model_args"].split(",")}

args_dict["generation_parameters"] = GenerationParameters.from_dict(config)

# We store the relevant other args
args_dict["base_model"] = config["merged_weights"]["base_model"]
args_dict["compile"] = bool(config["base_params"]["compile"])
@@ -180,13 +183,13 @@ def accelerate( # noqa C901
elif config["merged_weights"]["base_model"] not in ["", None]:
raise ValueError("You can't specify a base model if you are not using delta/adapter weights")
else:
-model_config = BaseModelConfig(**args_dict)
+model_config = TransformersModelConfig(**args_dict)
else:
model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
model_args_dict["accelerator"] = accelerator
model_args_dict["use_chat_template"] = use_chat_template
model_args_dict["compile"] = bool(model_args_dict["compile"]) if "compile" in model_args_dict else False
-model_config = BaseModelConfig(**model_args_dict)
+model_config = TransformersModelConfig(**model_args_dict)

pipeline = Pipeline(
tasks=tasks,
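Putting the `accelerate` changes together: when a yaml config is passed, sampling settings are now attached to the model config via `GenerationParameters.from_dict` before `TransformersModelConfig` is built. A rough sketch of that flow follows; only the two calls are taken from the diff, while the shape of the `generation` block and its field names are assumptions, not the verified schema:

```python
# Rough sketch (not a verified schema) of the new yaml-config path in `accelerate`.
# GenerationParameters.from_dict(config) and TransformersModelConfig(**args_dict)
# mirror the diff; the "generation" key and its fields are assumptions.
from lighteval.models.model_input import GenerationParameters
from lighteval.models.transformers.transformers_model import TransformersModelConfig

config = {
    "base_params": {"model_args": "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main"},
    "generation": {"temperature": 0.0},  # assumed field names
}

# Same parsing as the diff: split the "key=value" pairs out of base_params.model_args
args_dict = {k.split("=")[0]: k.split("=")[1] for k in config["base_params"]["model_args"].split(",")}
args_dict["generation_parameters"] = GenerationParameters.from_dict(config)
model_config = TransformersModelConfig(**args_dict)
```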
124 changes: 119 additions & 5 deletions src/lighteval/main_endpoint.py
@@ -42,8 +42,11 @@
@app.command(rich_help_panel="Evaluation Backends")
def openai(
# === general ===
-model_name: Annotated[
-    str, Argument(help="The model name to evaluate (has to be available through the openai API.")
+model_args: Annotated[
+    str,
+    Argument(
+        help="Model name as a string (has to be available through the openai API) or path to yaml config file (see examples/model_configs/transformers_model.yaml)"
+    ),
],
tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
# === Common parameters ===
@@ -96,6 +99,11 @@ def openai(
from lighteval.models.endpoints.openai_model import OpenAIModelConfig
from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters

if model_args.endswith(".yaml"):
model_config = OpenAIModelConfig.from_path(model_args)
else:
model_config = OpenAIModelConfig(model=model_args)

env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir)
evaluation_tracker = EvaluationTracker(
output_dir=output_dir,
@@ -107,7 +115,6 @@ def openai(
)

parallelism_manager = ParallelismManager.OPENAI
-model_config = OpenAIModelConfig(model=model_name)

pipeline_params = PipelineParameters(
launcher_type=parallelism_manager,
@@ -205,7 +212,6 @@ def inference_endpoint(
"""
Evaluate models using inference-endpoints as backend.
"""

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.endpoints.endpoint_model import InferenceEndpointModelConfig, ServerlessEndpointModelConfig
from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters
@@ -319,7 +325,6 @@ def tgi(
"""
Evaluate models using TGI as backend.
"""

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.endpoints.tgi_model import TGIModelConfig
from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters
@@ -367,3 +372,112 @@ def tgi(
pipeline.save_and_push_results()

return results


@app.command(rich_help_panel="Evaluation Backends")
def litellm(
# === general ===
model_name: Annotated[
str, Argument(help="The model name to evaluate (has to be available through the litellm API.")
],
tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
# === Common parameters ===
use_chat_template: Annotated[
bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANEL_NAME_4)
] = False,
system_prompt: Annotated[
Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANEL_NAME_4)
] = None,
dataset_loading_processes: Annotated[
int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANEL_NAME_1)
] = 1,
custom_tasks: Annotated[
Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANEL_NAME_1)
] = None,
cache_dir: Annotated[
str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANEL_NAME_1)
] = CACHE_DIR,
num_fewshot_seeds: Annotated[
int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1)
] = 1,
# === saving ===
output_dir: Annotated[
str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2)
] = "results",
push_to_hub: Annotated[
bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANEL_NAME_2)
] = False,
push_to_tensorboard: Annotated[
bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANEL_NAME_2)
] = False,
public_run: Annotated[
bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANEL_NAME_2)
] = False,
results_org: Annotated[
Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANEL_NAME_2)
] = None,
save_details: Annotated[
bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
] = False,
# === debug ===
max_samples: Annotated[
Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
] = None,
override_batch_size: Annotated[
int, Option(help="Override batch size for evaluation.", rich_help_panel=HELP_PANEL_NAME_3)
] = -1,
job_id: Annotated[
int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANEL_NAME_3)
] = 0,
):
"""
Evaluate models using LiteLLM as backend.
"""

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.litellm_model import LiteLLMModelConfig
from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters

env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir)
evaluation_tracker = EvaluationTracker(
output_dir=output_dir,
save_details=save_details,
push_to_hub=push_to_hub,
push_to_tensorboard=push_to_tensorboard,
public=public_run,
hub_results_org=results_org,
)

# TODO (nathan): better handling of model_args
parallelism_manager = ParallelismManager.NONE

model_config = LiteLLMModelConfig(model=model_name)

pipeline_params = PipelineParameters(
launcher_type=parallelism_manager,
env_config=env_config,
job_id=job_id,
dataset_loading_processes=dataset_loading_processes,
custom_tasks_directory=custom_tasks,
override_batch_size=override_batch_size,
num_fewshot_seeds=num_fewshot_seeds,
max_samples=max_samples,
use_chat_template=use_chat_template,
system_prompt=system_prompt,
)
pipeline = Pipeline(
tasks=tasks,
pipeline_parameters=pipeline_params,
evaluation_tracker=evaluation_tracker,
model_config=model_config,
)

pipeline.evaluate()

pipeline.show_results()

results = pipeline.get_results()

pipeline.save_and_push_results()

return results
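For reference, the new backend can presumably also be driven programmatically with the same pieces the command wires together. A hedged sketch follows; the model name and task string are placeholders, and keyword arguments omitted here for brevity may be required in practice:

```python
# Hedged sketch: programmatic use of the new LiteLLM backend, mirroring the
# command above. Model name and task spec are placeholders, not values from
# the commit; additional arguments may be required in a real run.
from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.litellm_model import LiteLLMModelConfig
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters

evaluation_tracker = EvaluationTracker(output_dir="results", save_details=False)
pipeline_params = PipelineParameters(launcher_type=ParallelismManager.NONE, max_samples=10)
model_config = LiteLLMModelConfig(model="gpt-4o-mini")  # any model reachable through LiteLLM

pipeline = Pipeline(
    tasks="leaderboard|truthfulqa:mc|0|0",  # placeholder task string
    pipeline_parameters=pipeline_params,
    evaluation_tracker=evaluation_tracker,
    model_config=model_config,
)
pipeline.evaluate()
pipeline.show_results()
```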
21 changes: 18 additions & 3 deletions src/lighteval/main_vllm.py
@@ -37,7 +37,12 @@

def vllm(
# === general ===
-model_args: Annotated[str, Argument(help="Model arguments in the form key1=value1,key2=value2,...")],
+model_args: Annotated[
+    str,
+    Argument(
+        help="Model arguments in the form key1=value1,key2=value2,... or path to yaml config file (see examples/model_configs/transformers_model.yaml)"
+    ),
+],
tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")],
# === Common parameters ===
use_chat_template: Annotated[
@@ -88,7 +93,10 @@ def vllm(
"""
Evaluate models using vllm as backend.
"""
+import yaml

from lighteval.logging.evaluation_tracker import EvaluationTracker
+from lighteval.models.model_input import GenerationParameters
from lighteval.models.vllm.vllm_model import VLLMModelConfig
from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters

@@ -118,8 +126,15 @@ def vllm(
system_prompt=system_prompt,
)

-model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
-model_config = VLLMModelConfig(**model_args_dict)
+if model_args.endswith(".yaml"):
+    with open(model_args, "r") as f:
+        config = yaml.safe_load(f)["model"]
+    generation_parameters = GenerationParameters.from_dict(config)
+    model_config = VLLMModelConfig(config, generation_parameters=generation_parameters)

+else:
+    model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")}
+    model_config = VLLMModelConfig(**model_args_dict)

pipeline = Pipeline(
tasks=tasks,
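To make the new yaml branch concrete, a hedged sketch of a config it could consume follows; the `yaml.safe_load(...)["model"]` access, `GenerationParameters.from_dict`, and the `VLLMModelConfig(config, generation_parameters=...)` call mirror the diff, while the keys inside `model` are assumptions rather than the verified schema:

```python
# Hedged sketch of the yaml-path branch added above (keys inside "model" are
# assumptions, not a verified schema).
import yaml

from lighteval.models.model_input import GenerationParameters
from lighteval.models.vllm.vllm_model import VLLMModelConfig

raw = """
model:
  base_params:
    model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=bfloat16"
  generation:          # assumed key name
    temperature: 0.0
"""
config = yaml.safe_load(raw)["model"]
generation_parameters = GenerationParameters.from_dict(config)
model_config = VLLMModelConfig(config, generation_parameters=generation_parameters)
```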