From 97056b395854e2993f292e7d1d0d03f4e5df98e6 Mon Sep 17 00:00:00 2001 From: ilanaliouchouche Date: Fri, 17 May 2024 20:58:32 +0200 Subject: [PATCH 1/2] textclassification evaluator now supports averaged metrics (such as macro micro ...) --- src/evaluate/evaluator/base.py | 79 ++++++++++++++----- src/evaluate/evaluator/text_classification.py | 42 +++++----- 2 files changed, 84 insertions(+), 37 deletions(-) diff --git a/src/evaluate/evaluator/base.py b/src/evaluate/evaluator/base.py index 09de31f19..ec4bc0392 100644 --- a/src/evaluate/evaluator/base.py +++ b/src/evaluate/evaluator/base.py @@ -14,7 +14,7 @@ from abc import ABC, abstractmethod from numbers import Number -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union, Tuple # Lint as: python3 from datasets import Dataset, load_dataset @@ -477,20 +477,23 @@ def prepare_pipeline( ) return pipe - def prepare_metric(self, metric: Union[str, EvaluationModule]): + def prepare_metric( + self, + metric: Union[str, EvaluationModule, + List[str], List[EvaluationModule]], + metrics_kwargs: Optional[Dict[str, Union[Dict, List]]] = None + ) -> List[Tuple[EvaluationModule, Dict[str, Any]]]: """ Prepare metric. - Args: - metric (`str` or [`EvaluationModule`], defaults to `None`): - Specifies the metric we use in evaluator. If it is of type `str`, we treat it as the metric name, and + metric (`str` or `EvaluationModule` or `List[str]` + or `List[EvaluationModule]`): + Specifies the metric(s) we use in evaluator. + If it is of type `str`, we treat it as the metric name, and load it. Otherwise we assume it represents a pre-loaded metric. - Returns: The loaded metric. - Example: - ```py >>> from evaluate import evaluator >>> evaluator("text-classification").prepare_metric("accuracy") @@ -500,13 +503,31 @@ def prepare_metric(self, metric: Union[str, EvaluationModule]): if metric is None: if self.default_metric_name is None: raise ValueError( - "`Evaluator` doesn't specify a default metric. Please specify a valid `metric` argument." + "`Evaluator` doesn't specify a default metric. " + "Please specify a valid `metric` argument." 
) metric = load(self.default_metric_name) - elif isinstance(metric, str): - metric = load(metric) - - return metric + elif isinstance(metric, str) or isinstance(metric, EvaluationModule): + em = load(metric) if isinstance(metric, str) else metric + if metrics_kwargs and metric in metrics_kwargs: + if isinstance(metrics_kwargs[metric], dict): + return [(em, metrics_kwargs[metric])] + elif isinstance(metrics_kwargs[metric], list): + return [(em, m_) for m_ in metrics_kwargs[metric]] + return [(em, {})] + else: + metric_ = [] + for m in metric: + em = load(m) if isinstance(m, str) else m + if metrics_kwargs and m in metrics_kwargs: + if isinstance(metrics_kwargs[m], dict): + metric_.append((em, metrics_kwargs[m])) + elif isinstance(metrics_kwargs[m], list): + metric_.extend([(em, m_) + for m_ in metrics_kwargs[m]]) + else: + metric_.append((m, {})) + return metric_ def call_pipeline(self, pipe, *args, **kwargs): start_time = perf_counter() @@ -516,16 +537,38 @@ def call_pipeline(self, pipe, *args, **kwargs): def compute_metric( self, - metric: EvaluationModule, + metric: Union[List[Tuple[EvaluationModule, Dict[str, Any]]], + EvaluationModule], metric_inputs: Dict, strategy: Literal["simple", "bootstrap"] = "simple", confidence_level: float = 0.95, n_resamples: int = 9999, random_state: Optional[int] = None, - ): + metrics_kwargs: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: """Compute and return metrics.""" - result = metric.compute(**metric_inputs, **self.METRIC_KWARGS) - + if isinstance(metric, list): + if strategy == "bootstrap": + raise ValueError("Bootstrap strategy is not supported " + "with multiple metrics.") + result = {} + for m, kwarg in metric: + result_m = self.compute_metric(m, + metric_inputs, + strategy, + confidence_level, + n_resamples, + random_state, + kwarg) + _values_str = "_".join([str(v) for v in kwarg.values()]) + result.update({f"{m.name}_{_values_str}": + list(result_m.values())}) + return result + + result = metric.compute( + **metric_inputs, + **metrics_kwargs + ) if strategy == "bootstrap": metric_keys = result.keys() bootstrap_dict = self._compute_confidence_interval( @@ -538,7 +581,5 @@ def compute_metric( ) for key in metric_keys: bootstrap_dict[key]["score"] = result[key] - return bootstrap_dict - return result diff --git a/src/evaluate/evaluator/text_classification.py b/src/evaluate/evaluator/text_classification.py index 200eb01d7..a1513aee8 100644 --- a/src/evaluate/evaluator/text_classification.py +++ b/src/evaluate/evaluator/text_classification.py @@ -13,7 +13,7 @@ # limitations under the License. 
from numbers import Number -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union, List from datasets import Dataset, load_dataset from typing_extensions import Literal @@ -91,14 +91,18 @@ def predictions_processor(self, predictions, label_mapping): def compute( self, model_or_pipeline: Union[ - str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821 + str, "Pipeline", Callable, "PreTrainedModel", # noqa: F821 + "TFPreTrainedModel" ] = None, data: Union[str, Dataset] = None, subset: Optional[str] = None, split: Optional[str] = None, - metric: Union[str, EvaluationModule] = None, - tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821 - feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None, # noqa: F821 + metric: Union[str, EvaluationModule, + List[str], List[EvaluationModule]] = None, + tokenizer: Optional[Union[str, # noqa: F821 + "PreTrainedTokenizer"]] = None, + feature_extractor: Optional[Union[str, # noqa: F821 + "FeatureExtractionMixin"]] = None, strategy: Literal["simple", "bootstrap"] = "simple", confidence_level: float = 0.95, n_resamples: int = 9999, @@ -108,28 +112,33 @@ def compute( second_input_column: Optional[str] = None, label_column: str = "label", label_mapping: Optional[Dict[str, Number]] = None, + metrics_kwargs: Optional[Dict[str, Any]] = None, ) -> Tuple[Dict[str, float], Any]: """ input_column (`str`, *optional*, defaults to `"text"`): - The name of the column containing the text feature in the dataset specified by `data`. + The name of the column containing the text feature + in the dataset specified by `data`. second_input_column (`str`, *optional*, defaults to `None`): - The name of the second column containing the text features. This may be useful for classification tasks + The name of the second column containing the text features. + This may be useful for classification tasks as MNLI, where two columns are used. label_column (`str`, defaults to `"label"`): - The name of the column containing the labels in the dataset specified by `data`. + The name of the column containing the labels in the dataset + specified by `data`. label_mapping (`Dict[str, Number]`, *optional*, defaults to `None`): - We want to map class labels defined by the model in the pipeline to values consistent with those + We want to map class labels defined by the model + in the pipeline to values consistent with those defined in the `label_column` of the `data` dataset. + metrics_kwargs (`Dict[str, Any]`, *optional*, defaults to `None`): + Additional keyword to pass to the metric(s). 
""" - result = {} - self.check_for_mismatch_in_device_setup(device, model_or_pipeline) - # Prepare inputs data = self.load_data(data=data, subset=subset, split=split) metric_inputs, pipe_inputs = self.prepare_data( - data=data, input_column=input_column, second_input_column=second_input_column, label_column=label_column + data=data, input_column=input_column, + second_input_column=second_input_column, label_column=label_column ) pipe = self.prepare_pipeline( model_or_pipeline=model_or_pipeline, @@ -137,13 +146,11 @@ def compute( feature_extractor=feature_extractor, device=device, ) - metric = self.prepare_metric(metric) - + metric = self.prepare_metric(metric, metrics_kwargs) # Compute predictions predictions, perf_results = self.call_pipeline(pipe, pipe_inputs) predictions = self.predictions_processor(predictions, label_mapping) metric_inputs.update(predictions) - # Compute metrics from references and predictions metric_results = self.compute_metric( metric=metric, @@ -151,9 +158,8 @@ def compute( strategy=strategy, confidence_level=confidence_level, n_resamples=n_resamples, - random_state=random_state, + random_state=random_state ) - result.update(metric_results) result.update(perf_results) From 809cca154a466e37c0cb4f2833ad3d92bb418f5c Mon Sep 17 00:00:00 2001 From: Ilyes Djerfaf <87201310+idjerfaf@users.noreply.github.com> Date: Wed, 5 Jun 2024 20:46:44 +0200 Subject: [PATCH 2/2] Refactoring and Review Co-authored-by: Ilan Aliouchouche --- src/evaluate/evaluator/base.py | 24 +++++++++++-------- src/evaluate/evaluator/text_classification.py | 9 ++++--- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/evaluate/evaluator/base.py b/src/evaluate/evaluator/base.py index ec4bc0392..9c2cafe80 100644 --- a/src/evaluate/evaluator/base.py +++ b/src/evaluate/evaluator/base.py @@ -479,8 +479,11 @@ def prepare_pipeline( def prepare_metric( self, - metric: Union[str, EvaluationModule, - List[str], List[EvaluationModule]], + metric: Union[str, + EvaluationModule, + List[str], + List[EvaluationModule] + ], metrics_kwargs: Optional[Dict[str, Union[Dict, List]]] = None ) -> List[Tuple[EvaluationModule, Dict[str, Any]]]: """ @@ -492,13 +495,14 @@ def prepare_metric( If it is of type `str`, we treat it as the metric name, and load it. Otherwise we assume it represents a pre-loaded metric. Returns: - The loaded metric. + The list of loaded metrics with their respective kwargs. Example: ```py >>> from evaluate import evaluator >>> evaluator("text-classification").prepare_metric("accuracy") ``` """ + # Prepare metric. if metric is None: if self.default_metric_name is None: @@ -507,7 +511,7 @@ def prepare_metric( "Please specify a valid `metric` argument." 
) metric = load(self.default_metric_name) - elif isinstance(metric, str) or isinstance(metric, EvaluationModule): + elif not isinstance(metric, list): em = load(metric) if isinstance(metric, str) else metric if metrics_kwargs and metric in metrics_kwargs: if isinstance(metrics_kwargs[metric], dict): @@ -516,18 +520,18 @@ def prepare_metric( return [(em, m_) for m_ in metrics_kwargs[metric]] return [(em, {})] else: - metric_ = [] + metric_list = [] for m in metric: em = load(m) if isinstance(m, str) else m if metrics_kwargs and m in metrics_kwargs: if isinstance(metrics_kwargs[m], dict): - metric_.append((em, metrics_kwargs[m])) + metric_list.append((em, metrics_kwargs[m])) elif isinstance(metrics_kwargs[m], list): - metric_.extend([(em, m_) - for m_ in metrics_kwargs[m]]) + metric_list.extend([(em, m_) + for m_ in metrics_kwargs[m]]) else: - metric_.append((m, {})) - return metric_ + metric_list.append((m, {})) + return metric_list def call_pipeline(self, pipe, *args, **kwargs): start_time = perf_counter() diff --git a/src/evaluate/evaluator/text_classification.py b/src/evaluate/evaluator/text_classification.py index a1513aee8..4db412dfe 100644 --- a/src/evaluate/evaluator/text_classification.py +++ b/src/evaluate/evaluator/text_classification.py @@ -13,7 +13,7 @@ # limitations under the License. from numbers import Number -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union, List +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union from datasets import Dataset, load_dataset from typing_extensions import Literal @@ -97,8 +97,11 @@ def compute( data: Union[str, Dataset] = None, subset: Optional[str] = None, split: Optional[str] = None, - metric: Union[str, EvaluationModule, - List[str], List[EvaluationModule]] = None, + metric: Union[str, + EvaluationModule, + List[str], + List[EvaluationModule] + ] = None, tokenizer: Optional[Union[str, # noqa: F821 "PreTrainedTokenizer"]] = None, feature_extractor: Optional[Union[str, # noqa: F821
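
A minimal end-to-end sketch of the API this series adds, assuming both patches are applied on top of `evaluate`. The IMDb slice, the `lvwerra/distilbert-imdb` checkpoint, and the label mapping below are illustrative choices, not part of the patch:

```python
from datasets import load_dataset
from evaluate import evaluator

task_evaluator = evaluator("text-classification")

# Small, shuffled slice of IMDb so the example runs quickly.
data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(200))

results = task_evaluator.compute(
    model_or_pipeline="lvwerra/distilbert-imdb",
    data=data,
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
    # Several metrics in one call; `metrics_kwargs` maps a metric name to a
    # kwargs dict, or to a list of dicts to compute the same metric under
    # several settings (macro and micro F1 here).
    metric=["accuracy", "f1"],
    metrics_kwargs={
        "accuracy": {},
        "f1": [{"average": "macro"}, {"average": "micro"}],
    },
)

# With the patched `compute_metric`, each result key is built as
# "<metric name>_<joined kwarg values>" (e.g. "f1_macro", "f1_micro";
# "accuracy" with empty kwargs comes out as "accuracy_"), and each value
# is the list of that metric's computed values.
print(results)
```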
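
At a lower level, `prepare_metric` now returns a list of `(EvaluationModule, kwargs)` pairs instead of a single module, one pair per kwargs dict when a list of kwargs is supplied. A sketch of that return contract as implemented in the first patch (the refactor in the second patch keeps the same shape; the metric choice is illustrative):

```python
from evaluate import evaluator

task_evaluator = evaluator("text-classification")

# One metric name with two kwargs dicts yields two (module, kwargs) pairs,
# which `compute_metric` then evaluates one by one.
prepared = task_evaluator.prepare_metric(
    "f1",
    metrics_kwargs={"f1": [{"average": "macro"}, {"average": "micro"}]},
)

for module, kwargs in prepared:
    print(module.name, kwargs)
# f1 {'average': 'macro'}
# f1 {'average': 'micro'}
```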