Enhancing TextClassificationEvaluator to Support Averaged Metrics #596

Open
wants to merge 2 commits into main
85 changes: 65 additions & 20 deletions src/evaluate/evaluator/base.py
@@ -14,7 +14,7 @@

from abc import ABC, abstractmethod
from numbers import Number
from typing import Any, Callable, Dict, List, Optional, Union
from typing import Any, Callable, Dict, List, Optional, Union, Tuple

# Lint as: python3
from datasets import Dataset, load_dataset
@@ -477,36 +477,61 @@ def prepare_pipeline(
)
return pipe

def prepare_metric(self, metric: Union[str, EvaluationModule]):
def prepare_metric(
self,
metric: Union[str,
EvaluationModule,
List[str],
List[EvaluationModule]
],
metrics_kwargs: Optional[Dict[str, Union[Dict, List]]] = None
) -> List[Tuple[EvaluationModule, Dict[str, Any]]]:
"""
Prepare metric.

Args:
metric (`str` or [`EvaluationModule`], defaults to `None`):
Specifies the metric we use in evaluator. If it is of type `str`, we treat it as the metric name, and
metric (`str` or `EvaluationModule` or `List[str]`
or `List[EvaluationModule]`):
Specifies the metric(s) we use in evaluator.
If it is of type `str`, we treat it as the metric name, and
load it. Otherwise we assume it represents a pre-loaded metric.

Returns:
The loaded metric.

The list of loaded metrics with their respective kwargs.
Example:

```py
>>> from evaluate import evaluator
>>> evaluator("text-classification").prepare_metric("accuracy")
```
"""

# Prepare metric.
if metric is None:
if self.default_metric_name is None:
raise ValueError(
"`Evaluator` doesn't specify a default metric. Please specify a valid `metric` argument."
"`Evaluator` doesn't specify a default metric. "
"Please specify a valid `metric` argument."
)
metric = load(self.default_metric_name)
elif isinstance(metric, str):
metric = load(metric)

return metric
elif not isinstance(metric, list):
em = load(metric) if isinstance(metric, str) else metric
if metrics_kwargs and metric in metrics_kwargs:
if isinstance(metrics_kwargs[metric], dict):
return [(em, metrics_kwargs[metric])]
elif isinstance(metrics_kwargs[metric], list):
return [(em, m_) for m_ in metrics_kwargs[metric]]
return [(em, {})]
else:
metric_list = []
for m in metric:
em = load(m) if isinstance(m, str) else m
if metrics_kwargs and m in metrics_kwargs:
if isinstance(metrics_kwargs[m], dict):
metric_list.append((em, metrics_kwargs[m]))
elif isinstance(metrics_kwargs[m], list):
metric_list.extend([(em, m_)
for m_ in metrics_kwargs[m]])
else:
metric_list.append((em, {}))
return metric_list

def call_pipeline(self, pipe, *args, **kwargs):
start_time = perf_counter()
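For clarity, here is a minimal sketch of how the reworked `prepare_metric` is expected to behave with the new list and `metrics_kwargs` inputs. The metric names and kwargs are illustrative, and the snippet assumes this patch is applied on top of `evaluate`:

```py
from evaluate import evaluator

task_evaluator = evaluator("text-classification")

# Single metric, no kwargs: a one-element list of (module, kwargs) pairs.
pairs = task_evaluator.prepare_metric("accuracy")
# -> [(<accuracy module>, {})]

# Several metrics, with a list of kwargs dicts for "f1": each kwargs dict
# expands into its own (module, kwargs) pair.
pairs = task_evaluator.prepare_metric(
    ["accuracy", "f1"],
    metrics_kwargs={"f1": [{"average": "macro"}, {"average": "micro"}]},
)
# -> [(<accuracy module>, {}),
#     (<f1 module>, {"average": "macro"}),
#     (<f1 module>, {"average": "micro"})]
```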
@@ -516,16 +541,38 @@ def call_pipeline(self, pipe, *args, **kwargs):

def compute_metric(
self,
metric: EvaluationModule,
metric: Union[List[Tuple[EvaluationModule, Dict[str, Any]]],
EvaluationModule],
metric_inputs: Dict,
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
random_state: Optional[int] = None,
):
metrics_kwargs: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
"""Compute and return metrics."""
result = metric.compute(**metric_inputs, **self.METRIC_KWARGS)

if isinstance(metric, list):
if strategy == "bootstrap":
raise ValueError("Bootstrap strategy is not supported "
"with multiple metrics.")
result = {}
for m, kwarg in metric:
result_m = self.compute_metric(m,
metric_inputs,
strategy,
confidence_level,
n_resamples,
random_state,
kwarg)
_values_str = "_".join([str(v) for v in kwarg.values()])
result.update({f"{m.name}_{_values_str}":
list(result_m.values())})
return result

result = metric.compute(
**metric_inputs,
**metrics_kwargs
)
if strategy == "bootstrap":
metric_keys = result.keys()
bootstrap_dict = self._compute_confidence_interval(
@@ -538,7 +585,5 @@ def compute_metric(
)
for key in metric_keys:
bootstrap_dict[key]["score"] = result[key]

return bootstrap_dict

return result
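To make the key naming of the multi-metric branch concrete, here is a standalone sketch of the aggregation loop above; `fake_compute` is a stand-in for `EvaluationModule.compute`, and the scores are made up:

```py
# Mirrors the list branch of compute_metric: one result dict whose keys are
# "<metric name>_<joined kwarg values>" and whose values are the flattened
# metric outputs.
def fake_compute(name, **kwargs):
    return {name: 0.5}  # placeholder score

pairs = [("f1", {"average": "macro"}), ("f1", {"average": "micro"}), ("accuracy", {})]

result = {}
for name, kwargs in pairs:
    sub_result = fake_compute(name, **kwargs)
    values_str = "_".join(str(v) for v in kwargs.values())
    result[f"{name}_{values_str}"] = list(sub_result.values())

print(result)
# {'f1_macro': [0.5], 'f1_micro': [0.5], 'accuracy_': [0.5]}
```

Note that an empty kwargs dict yields a trailing underscore in the key (`accuracy_`), which mirrors the `_values_str` construction in the diff.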
45 changes: 27 additions & 18 deletions src/evaluate/evaluator/text_classification.py
@@ -13,7 +13,7 @@
# limitations under the License.

from numbers import Number
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union

from datasets import Dataset, load_dataset
from typing_extensions import Literal
@@ -91,14 +91,21 @@ def predictions_processor(self, predictions, label_mapping):
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
str, "Pipeline", Callable, "PreTrainedModel", # noqa: F821
"TFPreTrainedModel"
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None, # noqa: F821
metric: Union[str,
EvaluationModule,
List[str],
List[EvaluationModule]
] = None,
tokenizer: Optional[Union[str, # noqa: F821
"PreTrainedTokenizer"]] = None,
feature_extractor: Optional[Union[str, # noqa: F821
"FeatureExtractionMixin"]] = None,
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
@@ -108,52 +115,54 @@ def compute(
second_input_column: Optional[str] = None,
label_column: str = "label",
label_mapping: Optional[Dict[str, Number]] = None,
metrics_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[Dict[str, float], Any]:
"""
input_column (`str`, *optional*, defaults to `"text"`):
The name of the column containing the text feature in the dataset specified by `data`.
The name of the column containing the text feature
in the dataset specified by `data`.
second_input_column (`str`, *optional*, defaults to `None`):
The name of the second column containing the text features. This may be useful for classification tasks
The name of the second column containing the text features.
This may be useful for classification tasks
such as MNLI, where two columns are used.
label_column (`str`, defaults to `"label"`):
The name of the column containing the labels in the dataset specified by `data`.
The name of the column containing the labels in the dataset
specified by `data`.
label_mapping (`Dict[str, Number]`, *optional*, defaults to `None`):
We want to map class labels defined by the model in the pipeline to values consistent with those
We want to map class labels defined by the model
in the pipeline to values consistent with those
defined in the `label_column` of the `data` dataset.
metrics_kwargs (`Dict[str, Any]`, *optional*, defaults to `None`):
Additional keyword arguments to pass to the metric(s).
"""

result = {}

self.check_for_mismatch_in_device_setup(device, model_or_pipeline)

# Prepare inputs
data = self.load_data(data=data, subset=subset, split=split)
metric_inputs, pipe_inputs = self.prepare_data(
data=data, input_column=input_column, second_input_column=second_input_column, label_column=label_column
data=data, input_column=input_column,
second_input_column=second_input_column, label_column=label_column
)
pipe = self.prepare_pipeline(
model_or_pipeline=model_or_pipeline,
tokenizer=tokenizer,
feature_extractor=feature_extractor,
device=device,
)
metric = self.prepare_metric(metric)

metric = self.prepare_metric(metric, metrics_kwargs)
# Compute predictions
predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)
predictions = self.predictions_processor(predictions, label_mapping)
metric_inputs.update(predictions)

# Compute metrics from references and predictions
metric_results = self.compute_metric(
metric=metric,
metric_inputs=metric_inputs,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
random_state=random_state,
random_state=random_state
)

result.update(metric_results)
result.update(perf_results)

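Putting the pieces together, an end-to-end call could look like the sketch below. The model, dataset, split, and label mapping are placeholders, and the list-valued `metric` plus `metrics_kwargs` arguments assume this patch:

```py
from evaluate import evaluator

task_evaluator = evaluator("text-classification")
results = task_evaluator.compute(
    model_or_pipeline="distilbert-base-uncased-finetuned-sst-2-english",  # placeholder model
    data="imdb",           # placeholder dataset
    split="test[:100]",
    metric=["accuracy", "f1"],
    metrics_kwargs={"f1": [{"average": "macro"}, {"average": "micro"}]},
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
)
# Expected metric keys, per compute_metric above: "accuracy_", "f1_macro",
# "f1_micro", alongside the usual timing entries from call_pipeline.
```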