add llm as judge in metrics #146

Merged: 14 commits, Apr 11, 2024
30 changes: 30 additions & 0 deletions src/lighteval/metrics/metrics.py
@@ -39,6 +39,7 @@
BertScore,
ExactMatches,
F1_score,
JudgeLLM,
LoglikelihoodAcc,
Recall,
StringDistance,
@@ -224,6 +225,35 @@ class Metrics(Enum):
corpus_level_fn=np.mean,
higher_is_better=True,
)
llm_judge_multi_turn = SampleLevelMetricGrouping(
metric=["single_turn", "multi_turn"],
higher_is_better=True,
category=MetricCategory.GENERATIVE_MULTI_TURN,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="gpt-3.5-turbo",
template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl",
Member: Do we want to put the prompts somewhere else? Maybe have a folder for the prompt examples in the future?

Member Author: Yes, probably; we are going to move mt-bench to lighteval tasks anyway.

multi_turn=True,
).compute,
corpus_level_fn={
"single_turn": np.mean,
"multi_turn": np.mean,
},
)
llm_judge = SampleLevelMetricGrouping(
metric=["judge_score"],
higher_is_better=True,
category=MetricCategory.GENERATIVE,
use_case=MetricUseCase.SUMMARIZATION,
sample_level_fn=JudgeLLM(
judge_model_name="gpt-3.5-turbo",
template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl",
multi_turn=False,
).compute,
corpus_level_fn={
"judge_score": np.mean,
},
)
loglikelihood_acc = SampleLevelMetric(
metric="acc",
sample_level_fn=LoglikelihoodAcc().compute,
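For context, both new metric groupings pass corpus_level_fn as a dict that maps each sample-level key to an aggregation function (np.mean here). The sketch below illustrates the per-key aggregation this implies; the aggregate_scores helper and the example scores are hypothetical and not part of lighteval:

```python
import numpy as np

# Hypothetical per-sample outputs, shaped like what llm_judge_multi_turn returns.
sample_results = [
    {"single_turn": 8.0, "multi_turn": 7.0},
    {"single_turn": 6.0, "multi_turn": 9.0},
]

corpus_level_fn = {"single_turn": np.mean, "multi_turn": np.mean}


def aggregate_scores(results: list[dict], corpus_fns: dict) -> dict:
    """Apply each corpus-level function to the matching sample-level scores."""
    return {key: float(fn([r[key] for r in results])) for key, fn in corpus_fns.items()}


print(aggregate_scores(sample_results, corpus_level_fn))
# -> {'single_turn': 7.0, 'multi_turn': 8.0}
```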
57 changes: 57 additions & 0 deletions src/lighteval/metrics/metrics_sample.py
@@ -23,6 +23,7 @@
"""This module manages all the metrics occurring at the sample level. The results of said metrics are then aggregated
using simple functions (min, mean, max, ...) at the corpus level. Most metrics fall under this category.
"""
import os
from typing import Union

import nltk
Expand All @@ -38,6 +39,7 @@
from lighteval.metrics.imports.bert_scorer import BERTScorer
from lighteval.metrics.imports.data_stats_metric import DataStatsMetric
from lighteval.metrics.imports.summac import SummaCZS
from lighteval.metrics.llm_as_judge import JudgeOpenAI
from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip
from lighteval.tasks.requests import Doc
from lighteval.utils import as_list
@@ -616,3 +618,58 @@ def edit_similarity(self, s1, s2):
"""
edist = edit_distance(s1, s2)
return 1.0 - edist / max(len(s1), len(s2)) if len(s1) > 0 and len(s2) > 0 else 0


class JudgeLLM:
available_models = ["gpt-3.5-turbo"]

def __init__(self, judge_model_name: str, template_path: str, multi_turn: bool = False):
if judge_model_name not in self.available_models:
raise ValueError(f"{judge_model_name} not in available models for llm as a judge metric")

        self.template_path = template_path
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
self.multi_turn = multi_turn

try:
self.judge = JudgeOpenAI(
model=judge_model_name,
seed=42,
temperature=0.0,
templates_path=self.template_path,
openai_api_key=OPENAI_API_KEY,
multi_turn=multi_turn,
)
except Exception as e:
print(f"Could not initialize the JudgeOpenAI model:\n{e}")

def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:
"""Defines how to go from a list of predictions to a score.
Follow examples in src/lighteval/metrics/metrics.py, or get more info
about what this function should do in the README.
"""

        # If we are evaluating a multi-turn task, we need specific fields in the formatted doc
if self.multi_turn:
questions = formatted_doc.specific["multi_turn_queries"]
ref_answers = formatted_doc.specific.get("reference", None) if formatted_doc.specific is not None else None
else:
questions = [formatted_doc.query]
ref_answers = [formatted_doc.choices[formatted_doc.gold_index]]

scores, messages, judgements = self.judge.evaluate_answer(questions, predictions, ref_answers)

        # Multi-turn tasks only have two turns
if self.multi_turn:
return {
"single_turn": scores[0],
"multi_turn": scores[1],
"user_prompt": [messages[0], messages[1]],
"judgement": [judgements[0], judgements[1]],
}

return {
"judge_score": scores[0],
"user_prompt": messages[0],
"judgement": judgements[0],
}
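
A rough usage sketch for the single-turn path follows. It assumes OPENAI_API_KEY is set and makes a real OpenAI call; FakeDoc is a hypothetical stand-in for lighteval's Doc that exposes only the fields compute() reads above, and the question/answer strings are made up:

```python
from dataclasses import dataclass, field

from lighteval.metrics.metrics_sample import JudgeLLM


@dataclass
class FakeDoc:
    # Hypothetical stand-in exposing only the fields compute() reads in the single-turn path.
    query: str
    choices: list
    gold_index: int
    specific: dict = field(default_factory=dict)


doc = FakeDoc(
    query="What is the capital of France?",
    choices=["Paris is the capital of France."],
    gold_index=0,
)

judge_metric = JudgeLLM(
    judge_model_name="gpt-3.5-turbo",
    template_path="src/lighteval/tasks/extended/mt_bench/judge_prompts.jsonl",
    multi_turn=False,
)

# Returns {"judge_score": ..., "user_prompt": ..., "judgement": ...} for one model answer.
result = judge_metric.compute(
    predictions=["The capital of France is Paris."],
    formatted_doc=doc,
)
print(result["judge_score"])
```

For the multi-turn path, formatted_doc.specific would instead need a "multi_turn_queries" entry (and optionally "reference"), as read in compute() above.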
222 changes: 0 additions & 222 deletions src/lighteval/tasks/extended/mt_bench/judges.py

This file was deleted.
