From a95156e8145b52573dbf877be219ee548f1858fd Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Tue, 7 Jan 2025 04:22:34 +0000 Subject: [PATCH 1/8] Implemented the possibility to load predictions from details files and continue evaluating from there. --- src/lighteval/logging/evaluation_tracker.py | 31 +++++++++- src/lighteval/main_accelerate.py | 4 ++ src/lighteval/main_endpoint.py | 12 ++++ src/lighteval/main_vllm.py | 4 ++ src/lighteval/pipeline.py | 66 +++++++++++++++++++-- 5 files changed, 110 insertions(+), 7 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 6cad9189f..ec036657c 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -209,9 +209,36 @@ def save_results(self, date_id: str, results_dict: dict): with self.fs.open(output_results_file, "w") as f: f.write(json.dumps(results_dict, cls=EnhancedJSONEncoder, indent=2, ensure_ascii=False)) - def save_details(self, date_id: str, details_datasets: dict[str, Dataset]): + def _get_details_sub_folder(self, date_id: str): output_dir_details = Path(self.output_dir) / "details" / self.general_config_logger.model_name - output_dir_details_sub_folder = output_dir_details / date_id + if date_id == "latest": + # Get all folders in output_dir_details + if not self.fs.exists(output_dir_details): + raise FileNotFoundError(f"Details directory {output_dir_details} does not exist") + + # List all folders and filter out files + folders = [f['name'] for f in self.fs.listdir(output_dir_details) if f['type'] == 'directory'] + + if not folders: + raise FileNotFoundError(f"No timestamp folders found in {output_dir_details}") + + # Parse timestamps and get latest + date_id = max(folders) + return output_dir_details / date_id + + def load_details_datasets(self, date_id: str) -> dict[str, Dataset]: + output_dir_details_sub_folder = self._get_details_sub_folder(date_id) + date_id = output_dir_details_sub_folder.name # Overwrite date_id in case of latest + details_datasets = {} + for file in self.fs.glob(str(output_dir_details_sub_folder / f"details_*_{date_id}.parquet")): + task_name = Path(file).stem.replace(f"details_", "").replace(f"_{date_id}", "") + dataset = load_dataset("parquet", data_files=file, split="train") + details_datasets[task_name] = dataset + return details_datasets + + + def save_details(self, date_id: str, details_datasets: dict[str, Dataset]): + output_dir_details_sub_folder = self._get_details_sub_folder(date_id) self.fs.mkdirs(output_dir_details_sub_folder, exist_ok=True) logger.info(f"Saving details to {output_dir_details_sub_folder}") for task_name, dataset in details_datasets.items(): diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index fe7f98d6f..d8d69f30f 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -67,6 +67,9 @@ def accelerate( # noqa C901 num_fewshot_seeds: Annotated[ int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) ] = 1, + load_responses_from_details_date_id: Annotated[ + Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) + ] = None, # === saving === output_dir: Annotated[ str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) @@ -137,6 +140,7 @@ def accelerate( # noqa C901 max_samples=max_samples, use_chat_template=use_chat_template, system_prompt=system_prompt, + 
load_responses_from_details_date_id=load_responses_from_details_date_id, ) # TODO (nathan): better handling of model_args diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py index 2c51fe15f..858cdcde3 100644 --- a/src/lighteval/main_endpoint.py +++ b/src/lighteval/main_endpoint.py @@ -179,6 +179,9 @@ def inference_endpoint( num_fewshot_seeds: Annotated[ int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) ] = 1, + load_responses_from_details_date_id: Annotated[ + Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) + ] = None, # === saving === output_dir: Annotated[ str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) @@ -247,6 +250,7 @@ def inference_endpoint( max_samples=max_samples, use_chat_template=use_chat_template, system_prompt=system_prompt, + load_responses_from_details_date_id=load_responses_from_details_date_id, ) pipeline = Pipeline( tasks=tasks, @@ -292,6 +296,9 @@ def tgi( num_fewshot_seeds: Annotated[ int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) ] = 1, + load_responses_from_details_date_id: Annotated[ + Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) + ] = None, # === saving === output_dir: Annotated[ str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) @@ -355,6 +362,7 @@ def tgi( max_samples=max_samples, use_chat_template=use_chat_template, system_prompt=system_prompt, + load_responses_from_details_date_id=load_responses_from_details_date_id, ) pipeline = Pipeline( tasks=tasks, @@ -400,6 +408,9 @@ def litellm( num_fewshot_seeds: Annotated[ int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) ] = 1, + load_responses_from_details_date_id: Annotated[ + Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) + ] = None, # === saving === output_dir: Annotated[ str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) @@ -464,6 +475,7 @@ def litellm( max_samples=max_samples, use_chat_template=use_chat_template, system_prompt=system_prompt, + load_responses_from_details_date_id=load_responses_from_details_date_id, ) pipeline = Pipeline( tasks=tasks, diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py index 89311b5ae..d063c3fa8 100644 --- a/src/lighteval/main_vllm.py +++ b/src/lighteval/main_vllm.py @@ -63,6 +63,9 @@ def vllm( num_fewshot_seeds: Annotated[ int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANEL_NAME_1) ] = 1, + load_responses_from_details_date_id: Annotated[ + Optional[str], Option(help="Load responses from details directory.", rich_help_panel=HELP_PANEL_NAME_1) + ] = None, # === saving === output_dir: Annotated[ str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANEL_NAME_2) @@ -124,6 +127,7 @@ def vllm( max_samples=max_samples, use_chat_template=use_chat_template, system_prompt=system_prompt, + load_responses_from_details_date_id=load_responses_from_details_date_id, ) if model_args.endswith(".yaml"): diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 6a40d2801..f432fe14a 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE 
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import ast import collections import os import random @@ -34,10 +35,10 @@ from lighteval.logging.evaluation_tracker import EvaluationTracker from lighteval.metrics.utils.metric_utils import MetricCategory from lighteval.models.model_loader import TransformersModel, load_model -from lighteval.models.model_output import ModelResponse +from lighteval.models.model_output import GenerativeMultiturnResponse, GenerativeResponse, LoglikelihoodResponse, LoglikelihoodSingleTokenResponse, ModelResponse from lighteval.tasks.lighteval_task import LightevalTask, create_requests_from_tasks from lighteval.tasks.registry import Registry, taskinfo_selector -from lighteval.tasks.requests import SampleUid +from lighteval.tasks.requests import RequestType, SampleUid from lighteval.utils.imports import ( NO_ACCELERATE_ERROR_MSG, NO_NANOTRON_ERROR_MSG, @@ -95,6 +96,7 @@ class PipelineParameters: max_samples: int | None = None use_chat_template: bool = False system_prompt: str | None = None + load_responses_from_details_date_id: str | None = None def __post_init__(self): # noqa C901 if self.launcher_type == ParallelismManager.ACCELERATE: @@ -245,7 +247,11 @@ def evaluate(self): config=self.model_config, ) - sample_id_to_responses = self._run_model() + if self.pipeline_parameters.load_responses_from_details_date_id: + sample_id_to_responses = self._load_responses_from_details() + else: + sample_id_to_responses = self._run_model() + self._compute_metrics(sample_id_to_responses) if self.is_main_process(): @@ -261,6 +267,53 @@ def evaluate(self): except OSError: pass + + def _load_responses_from_details(self): + logger.info("--- LOADING RESPONSES FROM DETAILS ---") + sample_id_to_responses: dict[(SampleUid, MetricCategory), list[ModelResponse]] = collections.defaultdict(list) + + request_types = list(self.requests.keys()) + if len(request_types) > 1: + raise ValueError("Loading responses from details when there are multiple request types is currently not supported") + request_type = request_types[0] + if request_type == RequestType.LOGLIKELIHOOD: + model_response_type = LoglikelihoodResponse + elif request_type == RequestType.LOGLIKELIHOOD_SINGLE_TOKEN: + model_response_type = LoglikelihoodSingleTokenResponse + elif request_type == RequestType.LOGLIKELIHOOD_ROLLING: + model_response_type = LoglikelihoodResponse + elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: + model_response_type = GenerativeMultiturnResponse + elif request_type == RequestType.GREEDY_UNTIL: + model_response_type = GenerativeResponse + else: + raise ValueError(f"Loading responses from details for request type {request_type} is currently not supported") + + details_datasets = self.evaluation_tracker.load_details_datasets(self.pipeline_parameters.load_responses_from_details_date_id) + for task_name, dataset in details_datasets.items(): + task: LightevalTask = self._get_task(task_name) + num_samples = len(dataset["predictions"]) + max_samples = self.pipeline_parameters.max_samples if self.pipeline_parameters.max_samples else num_samples + if num_samples > max_samples: + logger.warning(f"Skipping {num_samples - max_samples} samples for {task_name} when loading responses from details because max_samples is set to {max_samples}") + num_samples = self.pipeline_parameters.max_samples + for metric_category, has_metric_category in task.has_metric_category.items(): + if not has_metric_category: + continue + for idx in range(num_samples): + kwargs = { + "result": 
ast.literal_eval(dataset["predictions"][idx]), + "input_tokens": ast.literal_eval(dataset["input_tokens"][idx]), + "generated_tokens": ast.literal_eval(dataset["cont_tokens"][idx]), + "truncated_tokens_count": ast.literal_eval(dataset["truncated"][idx])[0], + "padded_tokens_count": ast.literal_eval(dataset["padded"][idx])[0] + } + if model_response_type == GenerativeResponse: + kwargs["logits"] = ast.literal_eval(dataset["pred_logits"][idx]) + response = model_response_type(**kwargs) + sample_id_to_responses[(SampleUid(task_name, f"{idx}_{0}"), metric_category)] = [response] + return sample_id_to_responses + def _run_model(self): # Running all requests depending on the model call type (log likelihood, generative, ...) # to be able to batch them @@ -283,6 +336,10 @@ def _run_model(self): return sample_id_to_responses + def _get_task(self, task_name: str): + short_task_name = task_name.rsplit("|", 1)[0] + return self.task_dict[short_task_name] + def _compute_metrics(self, sample_id_to_responses): # To compute the metrics we first group the samples and task and then by metrics. # This way we can batch the metrics computation for each task and metric category @@ -307,8 +364,7 @@ def _compute_metrics(self, sample_id_to_responses): task_metric_category_groups[sample_id.task_name][metric_category]["docs"].append(self.docs[sample_id]) for task_name, samples_per_metric in task_metric_category_groups.items(): - short_task_name = task_name.rsplit("|", 1)[0] - task: LightevalTask = self.task_dict[short_task_name] + task: LightevalTask = self._get_task(task_name) for metric_category, samples in samples_per_metric.items(): sample_ids = samples["ids"] From 66244528c7015bbc33e0f457320b029f1bcd362a Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 6 Jan 2025 22:11:04 -0800 Subject: [PATCH 2/8] Run model as fallback when no details can be loaded. --- src/lighteval/pipeline.py | 63 ++++++++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 20 deletions(-) diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index f432fe14a..69ea41c74 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -35,7 +35,13 @@ from lighteval.logging.evaluation_tracker import EvaluationTracker from lighteval.metrics.utils.metric_utils import MetricCategory from lighteval.models.model_loader import TransformersModel, load_model -from lighteval.models.model_output import GenerativeMultiturnResponse, GenerativeResponse, LoglikelihoodResponse, LoglikelihoodSingleTokenResponse, ModelResponse +from lighteval.models.model_output import ( + GenerativeMultiturnResponse, + GenerativeResponse, + LoglikelihoodResponse, + LoglikelihoodSingleTokenResponse, + ModelResponse, +) from lighteval.tasks.lighteval_task import LightevalTask, create_requests_from_tasks from lighteval.tasks.registry import Registry, taskinfo_selector from lighteval.tasks.requests import RequestType, SampleUid @@ -248,7 +254,13 @@ def evaluate(self): ) if self.pipeline_parameters.load_responses_from_details_date_id: - sample_id_to_responses = self._load_responses_from_details() + try: + sample_id_to_responses = self._load_responses_from_details() + except FileNotFoundError as e: + logger.warning( + f"No responses found for {self.pipeline_parameters.load_responses_from_details_date_id} in details directory: {e}. Running model instead." 
+ ) + sample_id_to_responses = self._run_model() else: sample_id_to_responses = self._run_model() @@ -267,35 +279,28 @@ def evaluate(self): except OSError: pass - def _load_responses_from_details(self): logger.info("--- LOADING RESPONSES FROM DETAILS ---") sample_id_to_responses: dict[(SampleUid, MetricCategory), list[ModelResponse]] = collections.defaultdict(list) request_types = list(self.requests.keys()) if len(request_types) > 1: - raise ValueError("Loading responses from details when there are multiple request types is currently not supported") - request_type = request_types[0] - if request_type == RequestType.LOGLIKELIHOOD: - model_response_type = LoglikelihoodResponse - elif request_type == RequestType.LOGLIKELIHOOD_SINGLE_TOKEN: - model_response_type = LoglikelihoodSingleTokenResponse - elif request_type == RequestType.LOGLIKELIHOOD_ROLLING: - model_response_type = LoglikelihoodResponse - elif request_type == RequestType.GREEDY_UNTIL_MULTI_TURN: - model_response_type = GenerativeMultiturnResponse - elif request_type == RequestType.GREEDY_UNTIL: - model_response_type = GenerativeResponse - else: - raise ValueError(f"Loading responses from details for request type {request_type} is currently not supported") + raise ValueError( + "Loading responses from details when there are multiple request types is currently not supported" + ) + model_response_type = self._get_model_response_type(request_types[0]) - details_datasets = self.evaluation_tracker.load_details_datasets(self.pipeline_parameters.load_responses_from_details_date_id) + details_datasets = self.evaluation_tracker.load_details_datasets( + self.pipeline_parameters.load_responses_from_details_date_id + ) for task_name, dataset in details_datasets.items(): task: LightevalTask = self._get_task(task_name) num_samples = len(dataset["predictions"]) max_samples = self.pipeline_parameters.max_samples if self.pipeline_parameters.max_samples else num_samples if num_samples > max_samples: - logger.warning(f"Skipping {num_samples - max_samples} samples for {task_name} when loading responses from details because max_samples is set to {max_samples}") + logger.warning( + f"Skipping {num_samples - max_samples} samples for {task_name} when loading responses from details because max_samples is set to {max_samples}" + ) num_samples = self.pipeline_parameters.max_samples for metric_category, has_metric_category in task.has_metric_category.items(): if not has_metric_category: @@ -306,7 +311,7 @@ def _load_responses_from_details(self): "input_tokens": ast.literal_eval(dataset["input_tokens"][idx]), "generated_tokens": ast.literal_eval(dataset["cont_tokens"][idx]), "truncated_tokens_count": ast.literal_eval(dataset["truncated"][idx])[0], - "padded_tokens_count": ast.literal_eval(dataset["padded"][idx])[0] + "padded_tokens_count": ast.literal_eval(dataset["padded"][idx])[0], } if model_response_type == GenerativeResponse: kwargs["logits"] = ast.literal_eval(dataset["pred_logits"][idx]) @@ -314,6 +319,24 @@ def _load_responses_from_details(self): sample_id_to_responses[(SampleUid(task_name, f"{idx}_{0}"), metric_category)] = [response] return sample_id_to_responses + def _get_model_response_type(self, request_type): + if request_type == RequestType.LOGLIKELIHOOD: + model_response_type = LoglikelihoodResponse + elif request_type == RequestType.LOGLIKELIHOOD_SINGLE_TOKEN: + model_response_type = LoglikelihoodSingleTokenResponse + elif request_type == RequestType.LOGLIKELIHOOD_ROLLING: + model_response_type = LoglikelihoodResponse + elif request_type == 
RequestType.GREEDY_UNTIL_MULTI_TURN: + model_response_type = GenerativeMultiturnResponse + elif request_type == RequestType.GREEDY_UNTIL: + model_response_type = GenerativeResponse + else: + raise ValueError( + f"Loading responses from details for request type {request_type} is currently not supported" + ) + + return model_response_type + def _run_model(self): # Running all requests depending on the model call type (log likelihood, generative, ...) # to be able to batch them From 742a672490aa3d96604adcc848a4eff809c59762 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 10 Jan 2025 20:27:44 -0800 Subject: [PATCH 3/8] Improved loading speed and added more useful error messages. --- src/lighteval/logging/evaluation_tracker.py | 25 +++++++++++------ src/lighteval/pipeline.py | 30 +++++++++++++++------ 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index ec036657c..37c6ab480 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -215,28 +215,37 @@ def _get_details_sub_folder(self, date_id: str): # Get all folders in output_dir_details if not self.fs.exists(output_dir_details): raise FileNotFoundError(f"Details directory {output_dir_details} does not exist") - + # List all folders and filter out files - folders = [f['name'] for f in self.fs.listdir(output_dir_details) if f['type'] == 'directory'] - + folders = [f["name"] for f in self.fs.listdir(output_dir_details) if f["type"] == "directory"] + if not folders: raise FileNotFoundError(f"No timestamp folders found in {output_dir_details}") - + # Parse timestamps and get latest date_id = max(folders) return output_dir_details / date_id - def load_details_datasets(self, date_id: str) -> dict[str, Dataset]: + def load_details_datasets(self, date_id: str, task_names: list[str]) -> dict[str, Dataset]: output_dir_details_sub_folder = self._get_details_sub_folder(date_id) - date_id = output_dir_details_sub_folder.name # Overwrite date_id in case of latest + logger.info(f"Loading details from {output_dir_details_sub_folder}") + date_id = output_dir_details_sub_folder.name # Overwrite date_id in case of latest details_datasets = {} for file in self.fs.glob(str(output_dir_details_sub_folder / f"details_*_{date_id}.parquet")): - task_name = Path(file).stem.replace(f"details_", "").replace(f"_{date_id}", "") + task_name = Path(file).stem.replace("details_", "").replace(f"_{date_id}", "") + if "|".join(task_name.split("|")[:-1]) not in task_names: + logger.info(f"Skipping {task_name} because it is not in the task_names list") + continue dataset = load_dataset("parquet", data_files=file, split="train") details_datasets[task_name] = dataset + + for task_name in task_names: + if not any(task_name.startswith(task_name) for task_name in details_datasets.keys()): + raise ValueError( + f"Task {task_name} not found in details datasets. Check the tasks to be evaluated or the date_id used to load the details ({self.pipeline_parameters.load_responses_from_details_date_id})." 
+ ) return details_datasets - def save_details(self, date_id: str, details_datasets: dict[str, Dataset]): output_dir_details_sub_folder = self._get_details_sub_folder(date_id) self.fs.mkdirs(output_dir_details_sub_folder, exist_ok=True) diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 69ea41c74..f023cd35d 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -31,6 +31,7 @@ from enum import Enum, auto import numpy as np +from tqdm import tqdm from lighteval.logging.evaluation_tracker import EvaluationTracker from lighteval.metrics.utils.metric_utils import MetricCategory @@ -291,9 +292,10 @@ def _load_responses_from_details(self): model_response_type = self._get_model_response_type(request_types[0]) details_datasets = self.evaluation_tracker.load_details_datasets( - self.pipeline_parameters.load_responses_from_details_date_id + self.pipeline_parameters.load_responses_from_details_date_id, self.task_names_list ) - for task_name, dataset in details_datasets.items(): + + for task_name, dataset in tqdm(details_datasets.items(), desc="Loading responses from details for tasks"): task: LightevalTask = self._get_task(task_name) num_samples = len(dataset["predictions"]) max_samples = self.pipeline_parameters.max_samples if self.pipeline_parameters.max_samples else num_samples @@ -305,16 +307,28 @@ def _load_responses_from_details(self): for metric_category, has_metric_category in task.has_metric_category.items(): if not has_metric_category: continue + + # Pre-evaluate all the literal strings once + predictions = [ast.literal_eval(p) for p in dataset["predictions"][:num_samples]] + input_tokens = [ast.literal_eval(t) for t in dataset["input_tokens"][:num_samples]] + cont_tokens = [ast.literal_eval(t) for t in dataset["cont_tokens"][:num_samples]] + truncated = [ast.literal_eval(t)[0] for t in dataset["truncated"][:num_samples]] + padded = [ast.literal_eval(p)[0] for p in dataset["padded"][:num_samples]] + + if model_response_type == GenerativeResponse: + logits = [ast.literal_eval(p) for p in dataset["pred_logits"][:num_samples]] + for idx in range(num_samples): kwargs = { - "result": ast.literal_eval(dataset["predictions"][idx]), - "input_tokens": ast.literal_eval(dataset["input_tokens"][idx]), - "generated_tokens": ast.literal_eval(dataset["cont_tokens"][idx]), - "truncated_tokens_count": ast.literal_eval(dataset["truncated"][idx])[0], - "padded_tokens_count": ast.literal_eval(dataset["padded"][idx])[0], + "result": predictions[idx], + "input_tokens": input_tokens[idx], + "generated_tokens": cont_tokens[idx], + "truncated_tokens_count": truncated[idx], + "padded_tokens_count": padded[idx], } if model_response_type == GenerativeResponse: - kwargs["logits"] = ast.literal_eval(dataset["pred_logits"][idx]) + kwargs["logits"] = logits[idx] + response = model_response_type(**kwargs) sample_id_to_responses[(SampleUid(task_name, f"{idx}_{0}"), metric_category)] = [response] return sample_id_to_responses From eaedd04c5aae4722f8692e64489ebb90db3b0850 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Fri, 10 Jan 2025 20:30:55 -0800 Subject: [PATCH 4/8] Fixed typo. 
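
The error message now reports the local date_id instead of self.pipeline_parameters, which does not exist on the tracker. For reference, details files are written as details_{task_name}_{date_id}.parquet, so load_details_datasets recovers each task name by stripping the prefix and the timestamp suffix, then drops the trailing few-shot field before matching against the requested tasks. A small standalone illustration of that mapping (the file name and timestamp below are made up):

    from pathlib import Path

    # Illustration only: how a details file name maps back to its task name.
    date_id = "2025-01-07T04-22-34"  # hypothetical timestamp folder name
    file = f"details_leaderboard|arc:challenge|25_{date_id}.parquet"  # hypothetical file name

    task_name = Path(file).stem.replace("details_", "").replace(f"_{date_id}", "")
    print(task_name)  # leaderboard|arc:challenge|25

    # The trailing "|"-separated field (the few-shot setting) is dropped before
    # checking membership in the requested task_names list.
    print("|".join(task_name.split("|")[:-1]))  # leaderboard|arc:challenge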
--- src/lighteval/logging/evaluation_tracker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 37c6ab480..0fe638c30 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -242,7 +242,7 @@ def load_details_datasets(self, date_id: str, task_names: list[str]) -> dict[str for task_name in task_names: if not any(task_name.startswith(task_name) for task_name in details_datasets.keys()): raise ValueError( - f"Task {task_name} not found in details datasets. Check the tasks to be evaluated or the date_id used to load the details ({self.pipeline_parameters.load_responses_from_details_date_id})." + f"Task {task_name} not found in details datasets. Check the tasks to be evaluated or the date_id used to load the details ({date_id})." ) return details_datasets From ca8331a97879b18d4f49e5030abf9ae2731a3695 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Sat, 11 Jan 2025 03:36:46 -0800 Subject: [PATCH 5/8] Fixed gnarly bug with details loading to prevent loading too many examples. --- src/lighteval/pipeline.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index f023cd35d..3b4764530 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -297,27 +297,27 @@ def _load_responses_from_details(self): for task_name, dataset in tqdm(details_datasets.items(), desc="Loading responses from details for tasks"): task: LightevalTask = self._get_task(task_name) - num_samples = len(dataset["predictions"]) + num_samples = len(set(dataset["specifics"])) max_samples = self.pipeline_parameters.max_samples if self.pipeline_parameters.max_samples else num_samples if num_samples > max_samples: logger.warning( f"Skipping {num_samples - max_samples} samples for {task_name} when loading responses from details because max_samples is set to {max_samples}" ) num_samples = self.pipeline_parameters.max_samples + + predictions = [ast.literal_eval(p) for p in dataset["predictions"][:num_samples]] + input_tokens = [ast.literal_eval(t) for t in dataset["input_tokens"][:num_samples]] + cont_tokens = [ast.literal_eval(t) for t in dataset["cont_tokens"][:num_samples]] + truncated = [ast.literal_eval(t)[0] for t in dataset["truncated"][:num_samples]] + padded = [ast.literal_eval(p)[0] for p in dataset["padded"][:num_samples]] + + if model_response_type == GenerativeResponse: + logits = [ast.literal_eval(p) for p in dataset["pred_logits"][:num_samples]] + for metric_category, has_metric_category in task.has_metric_category.items(): if not has_metric_category: continue - # Pre-evaluate all the literal strings once - predictions = [ast.literal_eval(p) for p in dataset["predictions"][:num_samples]] - input_tokens = [ast.literal_eval(t) for t in dataset["input_tokens"][:num_samples]] - cont_tokens = [ast.literal_eval(t) for t in dataset["cont_tokens"][:num_samples]] - truncated = [ast.literal_eval(t)[0] for t in dataset["truncated"][:num_samples]] - padded = [ast.literal_eval(p)[0] for p in dataset["padded"][:num_samples]] - - if model_response_type == GenerativeResponse: - logits = [ast.literal_eval(p) for p in dataset["pred_logits"][:num_samples]] - for idx in range(num_samples): kwargs = { "result": predictions[idx], From 3a22d93dc299c17d02fa3930148ef960fe3180e9 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 13 Jan 2025 11:01:47 -0800 Subject: [PATCH 
6/8] Unpacking predictions to fix issue with weirdly saved predictions. --- src/lighteval/pipeline.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index 3b4764530..a114a5d4b 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -280,6 +280,14 @@ def evaluate(self): except OSError: pass + def _unpack(self, x): + if isinstance(x, str): + return x + elif isinstance(x, (list, tuple)): + return self._unpack(x[0]) + else: + raise ValueError(f"Unknown type {type(x)} of prediction {x}") + def _load_responses_from_details(self): logger.info("--- LOADING RESPONSES FROM DETAILS ---") sample_id_to_responses: dict[(SampleUid, MetricCategory), list[ModelResponse]] = collections.defaultdict(list) @@ -305,7 +313,7 @@ def _load_responses_from_details(self): ) num_samples = self.pipeline_parameters.max_samples - predictions = [ast.literal_eval(p) for p in dataset["predictions"][:num_samples]] + predictions = [self._unpack(ast.literal_eval(p)) for p in dataset["predictions"][:num_samples]] input_tokens = [ast.literal_eval(t) for t in dataset["input_tokens"][:num_samples]] cont_tokens = [ast.literal_eval(t) for t in dataset["cont_tokens"][:num_samples]] truncated = [ast.literal_eval(t)[0] for t in dataset["truncated"][:num_samples]] From dae2d2bc31a7c0ef1a1087b5267fe8a444d99617 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 13 Jan 2025 11:30:58 -0800 Subject: [PATCH 7/8] Made bulk loading easier by also allowing first timestamp more generally. --- src/lighteval/logging/evaluation_tracker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 0fe638c30..7126d03b8 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -211,7 +211,7 @@ def save_results(self, date_id: str, results_dict: dict): def _get_details_sub_folder(self, date_id: str): output_dir_details = Path(self.output_dir) / "details" / self.general_config_logger.model_name - if date_id == "latest": + if date_id in ["first", "last"]: # Get all folders in output_dir_details if not self.fs.exists(output_dir_details): raise FileNotFoundError(f"Details directory {output_dir_details} does not exist") @@ -222,8 +222,8 @@ def _get_details_sub_folder(self, date_id: str): if not folders: raise FileNotFoundError(f"No timestamp folders found in {output_dir_details}") - # Parse timestamps and get latest - date_id = max(folders) + # Parse timestamps and get first or last + date_id = max(folders) if date_id == "last" else min(folders) return output_dir_details / date_id def load_details_datasets(self, date_id: str, task_names: list[str]) -> dict[str, Dataset]: From 299b90ce931b840a9fcee59d73fbd681162336b7 Mon Sep 17 00:00:00 2001 From: Joel Niklaus Date: Mon, 13 Jan 2025 17:10:32 -0800 Subject: [PATCH 8/8] Made loading details more robust against tensors being saved in the details files. 
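
When token id columns were serialized from GPU tensors, the stored strings look like "[tensor([21, 942], device='cuda:0'), ...]", which ast.literal_eval cannot parse. The _parse_tensor_string helper added below first replaces every tensor(...) wrapper with just its bracketed payload. A self-contained sketch of the same idea for the common flat (1-D) case; nested tensors would need a real bracket matcher, as the comments in the helper note, and the example input here is made up:

    import ast
    import re

    # Illustrative preprocessing: keep only the first [...] block inside each
    # tensor(...) call, then parse the cleaned string with literal_eval.
    def strip_tensor_wrappers(text: str) -> str:
        def repl(match: re.Match) -> str:
            bracket = re.search(r"\[.*?\]", match.group(1), re.DOTALL)
            return bracket.group(0) if bracket else "[]"

        return re.sub(r"tensor\s*\(\s*(.*?)\s*\)", repl, text, flags=re.DOTALL)

    raw = "[tensor([21, 942, 13], device='cuda:0'), tensor([318, 257], device='cuda:0')]"
    print(ast.literal_eval(strip_tensor_wrappers(raw)))  # [[21, 942, 13], [318, 257]]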
--- src/lighteval/pipeline.py | 78 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 2 deletions(-) diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index a114a5d4b..0e6282ef5 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -24,6 +24,7 @@ import collections import os import random +import re import shutil from contextlib import nullcontext from dataclasses import dataclass, field @@ -288,6 +289,79 @@ def _unpack(self, x): else: raise ValueError(f"Unknown type {type(x)} of prediction {x}") + def _parse_tensor_string(self, tensor_string): + """ + Convert a string containing PyTorch-like `tensor([...], device='cuda:0', ...)` + into a Python list (or nested lists) of numbers. + + Example: + "[tensor([1, 2, 3], device='cuda:0'), tensor([[4,5],[6,7]], dtype=torch.int64)]" + -> [[1, 2, 3], [[4, 5], [6, 7]]] + """ + + # Regex explanation: + # - tensor\(\s*: Matches "tensor(" (possibly with spaces after), literally. + # - (.*?): Captures everything lazily into group(1), until the first subsequent part matches. + # We rely on the next pattern to anchor the end of this capture. + # - \): The literal closing parenthesis, but we anchor the match by ignoring + # further arguments (device=..., dtype=..., etc.) inside. + # + # The tricky part: a tensor might look like + # tensor([ ... ], device='cuda:0', dtype=torch.int64) + # so the bracket portion is `[ ... ]`, but it can have newlines, etc. + # + # We'll handle that by first capturing the entire content up to the final parenthesis, + # then parse out the bracket portion. This can be done in a function-based re.sub. + + pattern = re.compile( + r"tensor\s*\(\s*(.*?)\s*\)", # capture everything inside tensor(...) + flags=re.DOTALL, + ) + + def tensor_replacer(match): + inside = match.group(1).strip() + # `inside` might look like: [1, 2, 3], device='cuda:0' + # or: + # [ + # 1, 2, 3, + # 4, 5, ... + # ], device='cuda:0', dtype=torch.int64 + # + # 1) Extract the bracketed array portion: the first [ ... ] block + # which might be multi-line. We'll use another regex for that. + + # We look for the bracketed portion from the first '[' to its matching ']'. + # Because the inside can be multi-line, we use DOTALL. But we still need + # to ensure we don't accidentally go beyond the matching bracket. + # + # A robust approach to properly match brackets can be done with a small parser, + # but for typical well-formed strings, a lazy match of the form + # r"\[.*?\]" DOTALL often suffices, assuming no nested brackets inside. + + bracket_pattern = re.compile(r"\[.*?\]", re.DOTALL) + bracket_match = bracket_pattern.search(inside) + if not bracket_match: + # If we fail to find a bracket, just return something safe. + # This means the string didn't match the expected format. + return "[]" + + # The bracketed portion (e.g. "[1, 2, 3\n, 4]"). + bracketed_content = bracket_match.group(0) + + # Return just the bracketed content, + # effectively replacing "tensor(...)" with "[...]". + return bracketed_content + + # Step 1: Replace every `tensor(...)` occurrence with just the bracketed list. + processed = pattern.sub(tensor_replacer, tensor_string) + + # Step 2: Now we can safely parse the result with literal_eval. + # If there's still something weird, it may throw ValueError. + try: + return ast.literal_eval(processed) + except Exception as e: + raise ValueError(f"Failed to parse after preprocessing. 
" f"Processed string:\n{processed}\n\nError: {e}") + def _load_responses_from_details(self): logger.info("--- LOADING RESPONSES FROM DETAILS ---") sample_id_to_responses: dict[(SampleUid, MetricCategory), list[ModelResponse]] = collections.defaultdict(list) @@ -314,8 +388,8 @@ def _load_responses_from_details(self): num_samples = self.pipeline_parameters.max_samples predictions = [self._unpack(ast.literal_eval(p)) for p in dataset["predictions"][:num_samples]] - input_tokens = [ast.literal_eval(t) for t in dataset["input_tokens"][:num_samples]] - cont_tokens = [ast.literal_eval(t) for t in dataset["cont_tokens"][:num_samples]] + input_tokens = [self._parse_tensor_string(t) for t in dataset["input_tokens"][:num_samples]] + cont_tokens = [self._parse_tensor_string(t) for t in dataset["cont_tokens"][:num_samples]] truncated = [ast.literal_eval(t)[0] for t in dataset["truncated"][:num_samples]] padded = [ast.literal_eval(p)[0] for p in dataset["padded"][:num_samples]]