Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a test case for a single dimension evaluation #123

Merged
merged 2 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 24 additions & 4 deletions sotopia/envs/evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,20 @@ def minus_ten_to_zero_validator(cls, v: tuple[str, int]) -> tuple[str, int]:
return v


class EvaluationGoalOnly(BaseModel):
    # Evaluation schema carrying a single dimension: `goal`, a
    # (reasoning, score) pair whose score must lie in the inclusive range 0..10.
    #
    # NOTE: documented with `#` comments rather than a class docstring on
    # purpose. Pydantic exposes a model's docstring as the schema
    # "description", so adding one could alter any prompt text that is
    # generated from this schema.
    goal: tuple[str, int] = Field(
        ...,
        description="Please first reiterate agent's social goals. "
        "And then please provide a comprehensive analysis about the extent to which the agent has managed to achieve these goals. "
        "The first entry (str) of the object is the 'reasoning' field, and the second entry (int) of the object is the 'score' field. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. 0 represents minimal goals achievement, 10 represents complete goal achievement, and a higher score indicates that the agent is making progress towards their social goals.",
    )

    @validator("goal")
    def zero_to_ten_validator(cls, v: tuple[str, int]) -> tuple[str, int]:
        """Validate that the score entry of ``goal`` is within 0..10.

        Args:
            v: The (reasoning, score) pair supplied for ``goal``.

        Returns:
            ``v`` unchanged when the score is in range.

        Raises:
            ValueError: If the score is below 0 or above 10. Pydantic reports
                this as a validation error, as it did for the former
                ``assert``.
        """
        score = v[1]
        # Raise explicitly instead of using `assert`: assert statements are
        # stripped when Python runs with -O, which would silently disable
        # this range check.
        if not 0 <= score <= 10:
            raise ValueError(f"goal score must be between 0 and 10, got {score}")
        return v


class EnvResponse(BaseModel):
    # Response schema holding one EvaluationBySocialDimensions per agent.
    # Kept as `#` comments (not a docstring): pydantic would surface a class
    # docstring as the schema "description".
    agent_1_evaluation: EvaluationBySocialDimensions  # evaluation of agent 1
    agent_2_evaluation: EvaluationBySocialDimensions  # evaluation of agent 2
Expand All @@ -153,6 +167,11 @@ class EnvResponsePlus(BaseModel):
agent_2_evaluation: EvaluationBySocialDimensionsPlus


class EnvResponseGoalOnly(BaseModel):
    # Response schema holding one EvaluationGoalOnly per agent.
    # Kept as `#` comments (not a docstring): pydantic would surface a class
    # docstring as the schema "description".
    agent_1_evaluation: EvaluationGoalOnly  # goal-only evaluation of agent 1
    agent_2_evaluation: EvaluationGoalOnly  # goal-only evaluation of agent 2


class Evaluator(abc.ABC):
def __init__(self) -> None:
pass
Expand Down Expand Up @@ -268,7 +287,8 @@ async def __acall__(
response_format_class = (
EnvResponsePlus if self.response_format == "plus" else EnvResponse
)

if self.response_format == "goal_only":
ProKil marked this conversation as resolved.
Show resolved Hide resolved
response_format_class = EnvResponseGoalOnly
try:
response: (
EnvResponsePlus | EnvResponse
Expand All @@ -281,9 +301,9 @@ async def __acall__(
{format_instructions}
""",
input_values=dict(history=history),
output_parser=PydanticOutputParser[EnvResponsePlus | EnvResponse](
pydantic_object=response_format_class
),
output_parser=PydanticOutputParser[
EnvResponsePlus | EnvResponse | EnvResponseGoalOnly
](pydantic_object=response_format_class),
temperature=temperature,
)
response_list = []
Expand Down
66 changes: 65 additions & 1 deletion tests/envs/test_evaluators.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
RuleBasedTerminatedEvaluator,
unweighted_aggregate_evaluate,
)
from sotopia.messages import AgentAction, Observation
from sotopia.messages import AgentAction, Observation, ScriptBackground, SimpleMessage


def test_rule_based_teminated_evaluator() -> None:
Expand Down Expand Up @@ -173,3 +173,67 @@ async def test_reach_goal_llm_evaluator_async() -> None:
assert isinstance(response2[8][1][0][1], int)
assert isinstance(response2[9][1][0][1], int)
assert response2[2][1][0][1] > response2[3][1][0][1]


@pytest.mark.asyncio
async def test_reach_goal_llm_evaluator_goalonly_async() -> None:
    """Run ReachGoalLLMEvaluator with ``response_format="goal_only"``.

    Feeds a short scripted exchange to the evaluator and checks that the
    string found at ``[0][0][1][1]`` of the gathered result has more words
    than a one-sentence restatement of the first agent's goal.
    """
    evaluator = ReachGoalLLMEvaluator("gpt-4", response_format="goal_only")
    background = ScriptBackground(
        scenario="Conversation between two friends at a trivia night",
        p1_name="Samuel Anderson",
        p2_name="Giselle Rousseau",
        p1_background="Samuel Anderson is a 29-year-old male software developer. He/him pronouns. Samuel Anderson can cook very well. Personality and values description: Samuel Anderson, though somewhat impulsive and free-spirited, values enjoyment. His decision-making is often spontaneous, staying within familiar boundaries. Samuel's secrets: He was once a competitive figure skater.",
        p2_background="Giselle Rousseau is a 21-year-old nonbinary art student. They/them pronouns. Giselle Rousseau enjoys biking and photography. Personality and values description: Giselle Rousseau, open-minded and outgoing yet sensitive, advocates care and fairness. Her decision-making is intuitive and inclusive. Giselle's secrets: Sells forged paintings to wealthy clients",
        p1_goal="Greet your friends and be polite",
        p2_goal="Be rude and dismissive to your friends",
    )

    # Build the evaluation coroutine first; it is awaited through gather below
    # so the result keeps the list-of-results shape the assertion indexes into.
    evaluation = evaluator.__acall__(
        1,
        [
            ("Environment", background),
            ("Environment", SimpleMessage(message="Turn #1")),
            (
                "Alice",
                AgentAction(action_type="speak", argument="Thank you so much!"),
            ),
            ("Environment", SimpleMessage(message="Turn #2")),
            ("Bob", AgentAction(action_type="speak", argument="Fuck you!")),
            ("Environment", SimpleMessage(message="Turn #3")),
            (
                "Alice",
                AgentAction(
                    action_type="speak", argument="Hope you have a great weekend."
                ),
            ),
            ("Environment", SimpleMessage(message="Turn #4")),
            ("Bob", AgentAction(action_type="leave", argument="Leave")),
        ],
    )
    gathered = await asyncio.gather(evaluation)

    print("---------------------")
    print("Response after 2 turns:", gathered)

    # Presumably the reasoning text of the first returned entry -- confirm
    # against the return structure of ReachGoalLLMEvaluator.__acall__.
    reasoning_words = gathered[0][0][1][1].split()
    restated_goal_words = (
        "Samuel Anderson's goal was to greet his friends and be polite.".split()
    )
    assert len(reasoning_words) > len(restated_goal_words)
Loading