From 97056b395854e2993f292e7d1d0d03f4e5df98e6 Mon Sep 17 00:00:00 2001 From: ilanaliouchouche Date: Fri, 17 May 2024 20:58:32 +0200 Subject: [PATCH 1/2] textclassification evaluator now supports averaged metrics (such as macro micro ...) --- src/evaluate/evaluator/base.py | 79 ++++++++++++++----- src/evaluate/evaluator/text_classification.py | 42 +++++----- 2 files changed, 84 insertions(+), 37 deletions(-) diff --git a/src/evaluate/evaluator/base.py b/src/evaluate/evaluator/base.py index 09de31f19..ec4bc0392 100644 --- a/src/evaluate/evaluator/base.py +++ b/src/evaluate/evaluator/base.py @@ -14,7 +14,7 @@ from abc import ABC, abstractmethod from numbers import Number -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union, Tuple # Lint as: python3 from datasets import Dataset, load_dataset @@ -477,20 +477,23 @@ def prepare_pipeline( ) return pipe - def prepare_metric(self, metric: Union[str, EvaluationModule]): + def prepare_metric( + self, + metric: Union[str, EvaluationModule, + List[str], List[EvaluationModule]], + metrics_kwargs: Optional[Dict[str, Union[Dict, List]]] = None + ) -> List[Tuple[EvaluationModule, Dict[str, Any]]]: """ Prepare metric. - Args: - metric (`str` or [`EvaluationModule`], defaults to `None`): - Specifies the metric we use in evaluator. If it is of type `str`, we treat it as the metric name, and + metric (`str` or `EvaluationModule` or `List[str]` + or `List[EvaluationModule]`): + Specifies the metric(s) we use in evaluator. + If it is of type `str`, we treat it as the metric name, and load it. Otherwise we assume it represents a pre-loaded metric. - Returns: The loaded metric. - Example: - ```py >>> from evaluate import evaluator >>> evaluator("text-classification").prepare_metric("accuracy") @@ -500,13 +503,31 @@ def prepare_metric(self, metric: Union[str, EvaluationModule]): if metric is None: if self.default_metric_name is None: raise ValueError( - "`Evaluator` doesn't specify a default metric. Please specify a valid `metric` argument." + "`Evaluator` doesn't specify a default metric. " + "Please specify a valid `metric` argument." 
) metric = load(self.default_metric_name) - elif isinstance(metric, str): - metric = load(metric) - - return metric + elif isinstance(metric, str) or isinstance(metric, EvaluationModule): + em = load(metric) if isinstance(metric, str) else metric + if metrics_kwargs and metric in metrics_kwargs: + if isinstance(metrics_kwargs[metric], dict): + return [(em, metrics_kwargs[metric])] + elif isinstance(metrics_kwargs[metric], list): + return [(em, m_) for m_ in metrics_kwargs[metric]] + return [(em, {})] + else: + metric_ = [] + for m in metric: + em = load(m) if isinstance(m, str) else m + if metrics_kwargs and m in metrics_kwargs: + if isinstance(metrics_kwargs[m], dict): + metric_.append((em, metrics_kwargs[m])) + elif isinstance(metrics_kwargs[m], list): + metric_.extend([(em, m_) + for m_ in metrics_kwargs[m]]) + else: + metric_.append((m, {})) + return metric_ def call_pipeline(self, pipe, *args, **kwargs): start_time = perf_counter() @@ -516,16 +537,38 @@ def call_pipeline(self, pipe, *args, **kwargs): def compute_metric( self, - metric: EvaluationModule, + metric: Union[List[Tuple[EvaluationModule, Dict[str, Any]]], + EvaluationModule], metric_inputs: Dict, strategy: Literal["simple", "bootstrap"] = "simple", confidence_level: float = 0.95, n_resamples: int = 9999, random_state: Optional[int] = None, - ): + metrics_kwargs: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: """Compute and return metrics.""" - result = metric.compute(**metric_inputs, **self.METRIC_KWARGS) - + if isinstance(metric, list): + if strategy == "bootstrap": + raise ValueError("Bootstrap strategy is not supported " + "with multiple metrics.") + result = {} + for m, kwarg in metric: + result_m = self.compute_metric(m, + metric_inputs, + strategy, + confidence_level, + n_resamples, + random_state, + kwarg) + _values_str = "_".join([str(v) for v in kwarg.values()]) + result.update({f"{m.name}_{_values_str}": + list(result_m.values())}) + return result + + result = metric.compute( + **metric_inputs, + **metrics_kwargs + ) if strategy == "bootstrap": metric_keys = result.keys() bootstrap_dict = self._compute_confidence_interval( @@ -538,7 +581,5 @@ def compute_metric( ) for key in metric_keys: bootstrap_dict[key]["score"] = result[key] - return bootstrap_dict - return result diff --git a/src/evaluate/evaluator/text_classification.py b/src/evaluate/evaluator/text_classification.py index 200eb01d7..a1513aee8 100644 --- a/src/evaluate/evaluator/text_classification.py +++ b/src/evaluate/evaluator/text_classification.py @@ -13,7 +13,7 @@ # limitations under the License. 
from numbers import Number -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union, List from datasets import Dataset, load_dataset from typing_extensions import Literal @@ -91,14 +91,18 @@ def predictions_processor(self, predictions, label_mapping): def compute( self, model_or_pipeline: Union[ - str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821 + str, "Pipeline", Callable, "PreTrainedModel", # noqa: F821 + "TFPreTrainedModel" ] = None, data: Union[str, Dataset] = None, subset: Optional[str] = None, split: Optional[str] = None, - metric: Union[str, EvaluationModule] = None, - tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821 - feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None, # noqa: F821 + metric: Union[str, EvaluationModule, + List[str], List[EvaluationModule]] = None, + tokenizer: Optional[Union[str, # noqa: F821 + "PreTrainedTokenizer"]] = None, + feature_extractor: Optional[Union[str, # noqa: F821 + "FeatureExtractionMixin"]] = None, strategy: Literal["simple", "bootstrap"] = "simple", confidence_level: float = 0.95, n_resamples: int = 9999, @@ -108,28 +112,33 @@ def compute( second_input_column: Optional[str] = None, label_column: str = "label", label_mapping: Optional[Dict[str, Number]] = None, + metrics_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[Dict[str, float], Any]: """ input_column (`str`, *optional*, defaults to `"text"`): - The name of the column containing the text feature in the dataset specified by `data`. + The name of the column containing the text feature + in the dataset specified by `data`. second_input_column (`str`, *optional*, defaults to `None`): - The name of the second column containing the text features. This may be useful for classification tasks + The name of the second column containing the text features. + This may be useful for classification tasks as MNLI, where two columns are used. label_column (`str`, defaults to `"label"`): - The name of the column containing the labels in the dataset specified by `data`. + The name of the column containing the labels in the dataset + specified by `data`. label_mapping (`Dict[str, Number]`, *optional*, defaults to `None`): - We want to map class labels defined by the model in the pipeline to values consistent with those + We want to map class labels defined by the model + in the pipeline to values consistent with those defined in the `label_column` of the `data` dataset. + metrics_kwargs (`Dict[str, Any]`, *optional*, defaults to `None`): + Additional keyword to pass to the metric(s). 
""" - result = {} - self.check_for_mismatch_in_device_setup(device, model_or_pipeline) - # Prepare inputs data = self.load_data(data=data, subset=subset, split=split) metric_inputs, pipe_inputs = self.prepare_data( - data=data, input_column=input_column, second_input_column=second_input_column, label_column=label_column + data=data, input_column=input_column, + second_input_column=second_input_column, label_column=label_column ) pipe = self.prepare_pipeline( model_or_pipeline=model_or_pipeline, @@ -137,13 +146,11 @@ def compute( feature_extractor=feature_extractor, device=device, ) - metric = self.prepare_metric(metric) - + metric = self.prepare_metric(metric, metrics_kwargs) # Compute predictions predictions, perf_results = self.call_pipeline(pipe, pipe_inputs) predictions = self.predictions_processor(predictions, label_mapping) metric_inputs.update(predictions) - # Compute metrics from references and predictions metric_results = self.compute_metric( metric=metric, @@ -151,9 +158,8 @@ def compute( strategy=strategy, confidence_level=confidence_level, n_resamples=n_resamples, - random_state=random_state, + random_state=random_state ) - result.update(metric_results) result.update(perf_results) From 809cca154a466e37c0cb4f2833ad3d92bb418f5c Mon Sep 17 00:00:00 2001 From: Ilyes Djerfaf <87201310+idjerfaf@users.noreply.github.com> Date: Wed, 5 Jun 2024 20:46:44 +0200 Subject: [PATCH 2/2] Refactoring and Review Co-authored-by: Ilan Aliouchouche --- src/evaluate/evaluator/base.py | 24 +++++++++++-------- src/evaluate/evaluator/text_classification.py | 9 ++++--- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/evaluate/evaluator/base.py b/src/evaluate/evaluator/base.py index ec4bc0392..9c2cafe80 100644 --- a/src/evaluate/evaluator/base.py +++ b/src/evaluate/evaluator/base.py @@ -479,8 +479,11 @@ def prepare_pipeline( def prepare_metric( self, - metric: Union[str, EvaluationModule, - List[str], List[EvaluationModule]], + metric: Union[str, + EvaluationModule, + List[str], + List[EvaluationModule] + ], metrics_kwargs: Optional[Dict[str, Union[Dict, List]]] = None ) -> List[Tuple[EvaluationModule, Dict[str, Any]]]: """ @@ -492,13 +495,14 @@ def prepare_metric( If it is of type `str`, we treat it as the metric name, and load it. Otherwise we assume it represents a pre-loaded metric. Returns: - The loaded metric. + The list of loaded metrics with their respective kwargs. Example: ```py >>> from evaluate import evaluator >>> evaluator("text-classification").prepare_metric("accuracy") ``` """ + # Prepare metric. if metric is None: if self.default_metric_name is None: @@ -507,7 +511,7 @@ def prepare_metric( "Please specify a valid `metric` argument." 
) metric = load(self.default_metric_name) - elif isinstance(metric, str) or isinstance(metric, EvaluationModule): + elif not isinstance(metric, list): em = load(metric) if isinstance(metric, str) else metric if metrics_kwargs and metric in metrics_kwargs: if isinstance(metrics_kwargs[metric], dict): @@ -516,18 +520,18 @@ def prepare_metric( return [(em, m_) for m_ in metrics_kwargs[metric]] return [(em, {})] else: - metric_ = [] + metric_list = [] for m in metric: em = load(m) if isinstance(m, str) else m if metrics_kwargs and m in metrics_kwargs: if isinstance(metrics_kwargs[m], dict): - metric_.append((em, metrics_kwargs[m])) + metric_list.append((em, metrics_kwargs[m])) elif isinstance(metrics_kwargs[m], list): - metric_.extend([(em, m_) - for m_ in metrics_kwargs[m]]) + metric_list.extend([(em, m_) + for m_ in metrics_kwargs[m]]) else: - metric_.append((m, {})) - return metric_ + metric_list.append((m, {})) + return metric_list def call_pipeline(self, pipe, *args, **kwargs): start_time = perf_counter() diff --git a/src/evaluate/evaluator/text_classification.py b/src/evaluate/evaluator/text_classification.py index a1513aee8..4db412dfe 100644 --- a/src/evaluate/evaluator/text_classification.py +++ b/src/evaluate/evaluator/text_classification.py @@ -13,7 +13,7 @@ # limitations under the License. from numbers import Number -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union, List +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union from datasets import Dataset, load_dataset from typing_extensions import Literal @@ -97,8 +97,11 @@ def compute( data: Union[str, Dataset] = None, subset: Optional[str] = None, split: Optional[str] = None, - metric: Union[str, EvaluationModule, - List[str], List[EvaluationModule]] = None, + metric: Union[str, + EvaluationModule, + List[str], + List[EvaluationModule] + ] = None, tokenizer: Optional[Union[str, # noqa: F821 "PreTrainedTokenizer"]] = None, feature_extractor: Optional[Union[str, # noqa: F821
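
A minimal end-to-end sketch of the API this series adds, assuming both patches are applied on top of `evaluate`. The IMDb slice, the `lvwerra/distilbert-imdb` checkpoint, and the label mapping below are illustrative choices, not part of the patch:

```python
from datasets import load_dataset
from evaluate import evaluator

task_evaluator = evaluator("text-classification")

# Small, shuffled slice of IMDb so the example runs quickly.
data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(200))

results = task_evaluator.compute(
    model_or_pipeline="lvwerra/distilbert-imdb",
    data=data,
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
    # Several metrics in one call; `metrics_kwargs` maps a metric name to a
    # kwargs dict, or to a list of dicts to compute the same metric under
    # several settings (macro and micro F1 here).
    metric=["accuracy", "f1"],
    metrics_kwargs={
        "accuracy": {},
        "f1": [{"average": "macro"}, {"average": "micro"}],
    },
)

# With the patched `compute_metric`, each result key is built as
# "<metric name>_<joined kwarg values>" (e.g. "f1_macro", "f1_micro";
# "accuracy" with empty kwargs comes out as "accuracy_"), and each value
# is the list of that metric's computed values.
print(results)
```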
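
At a lower level, `prepare_metric` now returns a list of `(EvaluationModule, kwargs)` pairs instead of a single module, one pair per kwargs dict when a list of kwargs is supplied. A sketch of that return contract as implemented in the first patch (the refactor in the second patch keeps the same shape; the metric choice is illustrative):

```python
from evaluate import evaluator

task_evaluator = evaluator("text-classification")

# One metric name with two kwargs dicts yields two (module, kwargs) pairs,
# which `compute_metric` then evaluates one by one.
prepared = task_evaluator.prepare_metric(
    "f1",
    metrics_kwargs={"f1": [{"average": "macro"}, {"average": "micro"}]},
)

for module, kwargs in prepared:
    print(module.name, kwargs)
# f1 {'average': 'macro'}
# f1 {'average': 'micro'}
```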