From ab5796496c47d2dd4a9063b61ca5f0bbb2b16e10 Mon Sep 17 00:00:00 2001 From: Zhe Su <360307598@qq.com> Date: Sun, 23 Jun 2024 11:38:38 -0400 Subject: [PATCH 1/2] add a test case for a single dimension evaluation --- sotopia/envs/evaluators.py | 28 ++++++++++++--- tests/envs/test_evaluators.py | 64 ++++++++++++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 5 deletions(-) diff --git a/sotopia/envs/evaluators.py b/sotopia/envs/evaluators.py index 01d1cc4a8..41cd14a36 100644 --- a/sotopia/envs/evaluators.py +++ b/sotopia/envs/evaluators.py @@ -143,6 +143,20 @@ def minus_ten_to_zero_validator(cls, v: tuple[str, int]) -> tuple[str, int]: return v +class EvaluationGoalOnly(BaseModel): + goal: tuple[str, int] = Field( + ..., + description="Please first reiterate agent's social goals. " + "And then please provide a comprehensive analysis about the extent to which the agent has managed to achieve these goals. " + "In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. 0 represents minimal goals achievement, 10 represents complete goal achievement, and a higher score indicates that the agent is making progress towards their social goals.", + ) + + @validator("goal") + def zero_to_ten_validator(cls, v: tuple[str, int]) -> tuple[str, int]: + assert v[1] >= 0 and v[1] <= 10 + return v + + class EnvResponse(BaseModel): agent_1_evaluation: EvaluationBySocialDimensions agent_2_evaluation: EvaluationBySocialDimensions @@ -153,6 +167,11 @@ class EnvResponsePlus(BaseModel): agent_2_evaluation: EvaluationBySocialDimensionsPlus +class EnvResponseGoalOnly(BaseModel): + agent_1_evaluation: EvaluationGoalOnly + agent_2_evaluation: EvaluationGoalOnly + + class Evaluator(abc.ABC): def __init__(self) -> None: pass @@ -268,7 +287,8 @@ async def __acall__( response_format_class = ( EnvResponsePlus if self.response_format == "plus" else EnvResponse ) - + if self.response_format == "goal_only": + response_format_class = EnvResponseGoalOnly try: response: ( EnvResponsePlus | EnvResponse @@ -281,9 +301,9 @@ async def __acall__( {format_instructions} """, input_values=dict(history=history), - output_parser=PydanticOutputParser[EnvResponsePlus | EnvResponse]( - pydantic_object=response_format_class - ), + output_parser=PydanticOutputParser[ + EnvResponsePlus | EnvResponse | EnvResponseGoalOnly + ](pydantic_object=response_format_class), temperature=temperature, ) response_list = [] diff --git a/tests/envs/test_evaluators.py b/tests/envs/test_evaluators.py index b3fc6ba25..70049825b 100644 --- a/tests/envs/test_evaluators.py +++ b/tests/envs/test_evaluators.py @@ -7,7 +7,7 @@ RuleBasedTerminatedEvaluator, unweighted_aggregate_evaluate, ) -from sotopia.messages import AgentAction, Observation +from sotopia.messages import AgentAction, Observation, ScriptBackground, SimpleMessage def test_rule_based_teminated_evaluator() -> None: @@ -173,3 +173,65 @@ async def test_reach_goal_llm_evaluator_async() -> None: assert isinstance(response2[8][1][0][1], int) assert isinstance(response2[9][1][0][1], int) assert response2[2][1][0][1] > response2[3][1][0][1] + + +@pytest.mark.asyncio +async def test_reach_goal_llm_evaluator_goalonly_async() -> None: + evaluator = ReachGoalLLMEvaluator("gpt-4", response_format="goal_only") + background = ScriptBackground( + scenario="Conversation between two friends at a trivia night", + p1_name="Samuel Anderson", + p2_name="Giselle Rousseau", + p1_background="Samuel Anderson is a 29-year-old male software developer. He/him pronouns. Samuel Anderson can cook very well. Personality and values description: Samuel Anderson, though somewhat impulsive and free-spirited, values enjoyment. His decision-making is often spontaneous, staying within familiar boundaries. Samuel's secrets: He was once a competitive figure skater.", + p2_background="Giselle Rousseau is a 21-year-old nonbinary art student. They/them pronouns. Giselle Rousseau enjoys biking and photography. Personality and values description: Giselle Rousseau, open-minded and outgoing yet sensitive, advocates care and fairness. Her decision-making is intuitive and inclusive. Giselle's secrets: Sells forged paintings to wealthy clients", + p1_goal="Greet your friends and be polite", + p2_goal="Be rude and dismissive to your friends", + ) + + # response1, + response2 = await asyncio.gather( + evaluator.__acall__( + 1, + [ + ( + "Environment", + background, + ), + ( + "Environment", + SimpleMessage(message="Turn #1"), + ), + ( + "Alice", + AgentAction(action_type="speak", argument="Thank you so much!"), + ), + ( + "Environment", + SimpleMessage(message="Turn #2"), + ), + ( + "Bob", + AgentAction(action_type="speak", argument="Fuck you!"), + ), + ( + "Environment", + SimpleMessage(message="Turn #3"), + ), + ( + "Alice", + AgentAction( + action_type="speak", argument="Hope you have a great weekend." + ), + ), + ("Environment", SimpleMessage(message="Turn #4")), + ( + "Bob", + AgentAction(action_type="leave", argument="Leave"), + ), + ], + ), + ) + print("---------------------") + print("Response after 2 turns:", response2) + + assert False # Stop here to see all responses as we are not sure what should be the assertion here From 269b4f210e4f7e77cf10a8c15ab7e4c98b48ff2e Mon Sep 17 00:00:00 2001 From: XuhuiZhou Date: Thu, 27 Jun 2024 18:16:08 -0700 Subject: [PATCH 2/2] fix the single dimension bug --- sotopia/envs/evaluators.py | 2 +- tests/envs/test_evaluators.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sotopia/envs/evaluators.py b/sotopia/envs/evaluators.py index 41cd14a36..a0d56e655 100644 --- a/sotopia/envs/evaluators.py +++ b/sotopia/envs/evaluators.py @@ -148,7 +148,7 @@ class EvaluationGoalOnly(BaseModel): ..., description="Please first reiterate agent's social goals. " "And then please provide a comprehensive analysis about the extent to which the agent has managed to achieve these goals. " - "In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. 0 represents minimal goals achievement, 10 represents complete goal achievement, and a higher score indicates that the agent is making progress towards their social goals.", + "The first entry (str) of the object is the 'reasoning' field, and the second entry (int) of the object is the 'score' field. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. 0 represents minimal goals achievement, 10 represents complete goal achievement, and a higher score indicates that the agent is making progress towards their social goals.", ) @validator("goal") diff --git a/tests/envs/test_evaluators.py b/tests/envs/test_evaluators.py index 70049825b..eacba3c4f 100644 --- a/tests/envs/test_evaluators.py +++ b/tests/envs/test_evaluators.py @@ -234,4 +234,6 @@ async def test_reach_goal_llm_evaluator_goalonly_async() -> None: print("---------------------") print("Response after 2 turns:", response2) - assert False # Stop here to see all responses as we are not sure what should be the assertion here + assert len(response2[0][0][1][1].split()) > len( + "Samuel Anderson's goal was to greet his friends and be polite.".split() + )