From f3decfd2fc0ed48921655c969c0ba43356e7ba0e Mon Sep 17 00:00:00 2001
From: Joel Niklaus
Date: Tue, 7 Jan 2025 08:47:39 -0800
Subject: [PATCH] Made judge response processing more robust.

---
Review notes (commentary only; this area after the three-dash marker is not
part of the commit message or of the diff):

* Purpose: each of the three judge-response parsers is wrapped in try/except
  so that a parsing failure logs a warning and yields 0 instead of raising.
  A module-level `logger` is added for those warnings.
* process_judge_response_freeform_gpt: the no-match fallback now returns the
  int 0. The removed code computed float(... else 0), i.e. 0.0. Callers that
  depend on a float result should be checked.
* process_judge_response_freeform_gpt: in the pattern \[\[(\d.\d)\]\] the "."
  is unescaped and matches any character, so e.g. "[[1x5]]" matches and
  float("1x5") raises ValueError. Before this patch that propagated; now it
  is logged and scored 0. Presumably \d\.\d was intended -- TODO confirm.
* All three handlers catch Exception. From the code shown, the failures that
  can occur are TypeError from re.search() when x is not str/bytes and
  ValueError from float(); narrowing the except clause would avoid hiding
  unrelated bugs. NOTE(review): confirm what types the judge can hand in.
* The first warning text says "for flow" while the function is the generic
  process_judge_response -- presumably refers to the flow judge; verify.
* NOTE(review): line breaks and indentation below were restored from the
  hunk line counts (6/7, 6/9, 19/30); verify against the commit if in doubt.

 src/lighteval/tasks/extended/mix_eval/main.py | 29 ++++++++++++++-----
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/lighteval/tasks/extended/mix_eval/main.py b/src/lighteval/tasks/extended/mix_eval/main.py
index 8684e910c..eaa58f2a5 100644
--- a/src/lighteval/tasks/extended/mix_eval/main.py
+++ b/src/lighteval/tasks/extended/mix_eval/main.py
@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import logging
 import re
 
 import numpy as np
@@ -37,6 +38,9 @@
 from lighteval.tasks.requests import Doc
 
 
+logger = logging.getLogger(__name__)
+
+
 def mixeval_freeform_prompt(line, task_name: str = ""):
     prompt = construct_prompt_freeform(line)
     return Doc(
@@ -71,19 +75,30 @@ def mixeval_multichoice_prompt(line, task_name: str = ""):
 
 
 def process_judge_response(x):
-    search = re.search(r"\s(\d)\s", x)
-    return int(search.group(1)) if search else 0
+    try:
+        search = re.search(r"\s(\d)\s", x)
+        return int(search.group(1)) if search else 0
+    except Exception as e:
+        logger.warning(f"Error processing judge response for flow: {e}")
+        return 0
 
 
 def process_judge_response_multichoice_gpt(x):
-    search = re.search(r"\[\[([01])\]\]", x)
-    return int(search.group(1)) if search else 0
+    try:
+        search = re.search(r"\[\[([01])\]\]", x)
+        return int(search.group(1)) if search else 0
+    except Exception as e:
+        logger.warning(f"Error processing judge response for multichoice GPT: {e}")
+        return 0
 
 
 def process_judge_response_freeform_gpt(x):
-    search = re.search(r"\[\[(\d.\d)\]\]", x)
-    answer = float(search.group(1) if search else 0)
-    return answer
+    try:
+        search = re.search(r"\[\[(\d.\d)\]\]", x)
+        return float(search.group(1)) if search else 0
+    except Exception as e:
+        logger.warning(f"Error processing judge response for freeform GPT: {e}")
+        return 0
 
 
 llm_judge_mixeval_multichoice_flow_judge = SampleLevelMetricGrouping(