Skip to content

Commit

Permalink
Made judge response processing more robust.
Browse files Browse the repository at this point in the history
  • Loading branch information
JoelNiklaus committed Jan 7, 2025
1 parent f6fee3a commit f3decfd
Showing 1 changed file with 22 additions and 7 deletions.
29 changes: 22 additions & 7 deletions src/lighteval/tasks/extended/mix_eval/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import logging
import re

import numpy as np
Expand All @@ -37,6 +38,9 @@
from lighteval.tasks.requests import Doc


# Module-level logger named after this module; used by the judge-response parsers below.
logger = logging.getLogger(__name__)


def mixeval_freeform_prompt(line, task_name: str = ""):
prompt = construct_prompt_freeform(line)
return Doc(
Expand Down Expand Up @@ -71,19 +75,30 @@ def mixeval_multichoice_prompt(line, task_name: str = ""):


def process_judge_response(x):
    """Extract the single-digit score from a ``<score>…</score>`` tag.

    Args:
        x: Raw judge response text.

    Returns:
        The digit found between ``<score>`` and ``</score>`` as an ``int``,
        or ``0`` when no such tag is present or ``x`` cannot be parsed.
    """
    try:
        # Allow any amount of whitespace (including none) around the digit so
        # that "<score>5</score>" and "<score>\n 5 \n</score>" are both read.
        search = re.search(r"<score>\s*(\d)\s*</score>", x)
        return int(search.group(1)) if search else 0
    except (TypeError, ValueError) as e:
        # TypeError: x is not a string (e.g. None); treat as "no score".
        logger.warning(f"Error processing judge response for flow: {e}")
        return 0


def process_judge_response_multichoice_gpt(x):
    """Extract a binary verdict written as ``[[0]]`` or ``[[1]]``.

    Args:
        x: Raw judge response text.

    Returns:
        ``0`` or ``1`` taken from the first ``[[0]]``/``[[1]]`` marker, or
        ``0`` when no marker is present or ``x`` cannot be parsed.
    """
    try:
        search = re.search(r"\[\[([01])\]\]", x)
        return int(search.group(1)) if search else 0
    except (TypeError, ValueError) as e:
        # TypeError: x is not a string (e.g. None); treat as "no verdict".
        logger.warning(f"Error processing judge response for multichoice GPT: {e}")
        return 0


def process_judge_response_freeform_gpt(x):
    """Extract a one-decimal score written as ``[[d.d]]`` (e.g. ``[[0.5]]``).

    Args:
        x: Raw judge response text.

    Returns:
        The score from the first ``[[d.d]]`` marker as a ``float``, or
        ``0.0`` when no marker is present or ``x`` cannot be parsed.
    """
    try:
        # The dot is escaped: an unescaped "." would also accept strings such
        # as "[[1x5]]", which then fail in float().
        search = re.search(r"\[\[(\d\.\d)\]\]", x)
        return float(search.group(1)) if search else 0.0
    except (TypeError, ValueError) as e:
        # TypeError: x is not a string (e.g. None); treat as "no score".
        logger.warning(f"Error processing judge response for freeform GPT: {e}")
        return 0.0


llm_judge_mixeval_multichoice_flow_judge = SampleLevelMetricGrouping(
Expand Down

0 comments on commit f3decfd

Please sign in to comment.