From ab5796496c47d2dd4a9063b61ca5f0bbb2b16e10 Mon Sep 17 00:00:00 2001
From: Zhe Su <360307598@qq.com>
Date: Sun, 23 Jun 2024 11:38:38 -0400
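Subject: [PATCH 1/2] add a test case for a single dimension evaluation

Add the EvaluationGoalOnly and EnvResponseGoalOnly models and select them in
__acall__ (sotopia/envs/evaluators.py) when response_format is "goal_only".
The new async test runs a goal-only evaluation; it still ends in a
placeholder `assert False` because the right assertion is not decided yet.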

---
 sotopia/envs/evaluators.py    | 28 ++++++++++++---
 tests/envs/test_evaluators.py | 64 ++++++++++++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 5 deletions(-)

diff --git a/sotopia/envs/evaluators.py b/sotopia/envs/evaluators.py
index 01d1cc4a8..41cd14a36 100644
--- a/sotopia/envs/evaluators.py
+++ b/sotopia/envs/evaluators.py
@@ -143,6 +143,20 @@ def minus_ten_to_zero_validator(cls, v: tuple[str, int]) -> tuple[str, int]:
         return v
 
 
+class EvaluationGoalOnly(BaseModel):
+    goal: tuple[str, int] = Field(
+        ...,
+        description="Please first reiterate agent's social goals. "
+        "And then please provide a comprehensive analysis about the extent to which the agent has managed to achieve these goals. "
+        "In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. 0 represents minimal goals achievement, 10 represents complete goal achievement, and a higher score indicates that the agent is making progress towards their social goals.",
+    )
+
+    @validator("goal")
+    def zero_to_ten_validator(cls, v: tuple[str, int]) -> tuple[str, int]:
+        assert v[1] >= 0 and v[1] <= 10
+        return v
+
+
 class EnvResponse(BaseModel):
     agent_1_evaluation: EvaluationBySocialDimensions
     agent_2_evaluation: EvaluationBySocialDimensions
@@ -153,6 +167,11 @@ class EnvResponsePlus(BaseModel):
     agent_2_evaluation: EvaluationBySocialDimensionsPlus
 
 
+class EnvResponseGoalOnly(BaseModel):
+    agent_1_evaluation: EvaluationGoalOnly
+    agent_2_evaluation: EvaluationGoalOnly
+
+
 class Evaluator(abc.ABC):
     def __init__(self) -> None:
         pass
@@ -268,7 +287,8 @@ async def __acall__(
         response_format_class = (
             EnvResponsePlus if self.response_format == "plus" else EnvResponse
         )
-
+        if self.response_format == "goal_only":
+            response_format_class = EnvResponseGoalOnly
         try:
             response: (
                 EnvResponsePlus | EnvResponse
@@ -281,9 +301,9 @@ async def __acall__(
                     {format_instructions}
                 """,
                 input_values=dict(history=history),
-                output_parser=PydanticOutputParser[EnvResponsePlus | EnvResponse](
-                    pydantic_object=response_format_class
-                ),
+                output_parser=PydanticOutputParser[
+                    EnvResponsePlus | EnvResponse | EnvResponseGoalOnly
+                ](pydantic_object=response_format_class),
                 temperature=temperature,
             )
             response_list = []
diff --git a/tests/envs/test_evaluators.py b/tests/envs/test_evaluators.py
index b3fc6ba25..70049825b 100644
--- a/tests/envs/test_evaluators.py
+++ b/tests/envs/test_evaluators.py
@@ -7,7 +7,7 @@
     RuleBasedTerminatedEvaluator,
     unweighted_aggregate_evaluate,
 )
-from sotopia.messages import AgentAction, Observation
+from sotopia.messages import AgentAction, Observation, ScriptBackground, SimpleMessage
 
 
 def test_rule_based_teminated_evaluator() -> None:
@@ -173,3 +173,65 @@ async def test_reach_goal_llm_evaluator_async() -> None:
     assert isinstance(response2[8][1][0][1], int)
     assert isinstance(response2[9][1][0][1], int)
     assert response2[2][1][0][1] > response2[3][1][0][1]
+
+
+@pytest.mark.asyncio
+async def test_reach_goal_llm_evaluator_goalonly_async() -> None:
+    evaluator = ReachGoalLLMEvaluator("gpt-4", response_format="goal_only")
+    background = ScriptBackground(
+        scenario="Conversation between two friends at a trivia night",
+        p1_name="Samuel Anderson",
+        p2_name="Giselle Rousseau",
+        p1_background="Samuel Anderson is a 29-year-old male software developer. He/him pronouns. Samuel Anderson can cook very well. Personality and values description: Samuel Anderson, though somewhat impulsive and free-spirited, values enjoyment. His decision-making is often spontaneous, staying within familiar boundaries. Samuel's secrets: He was once a competitive figure skater.",
+        p2_background="Giselle Rousseau is a 21-year-old nonbinary art student. They/them pronouns. Giselle Rousseau enjoys biking and photography. Personality and values description: Giselle Rousseau, open-minded and outgoing yet sensitive, advocates care and fairness. Her decision-making is intuitive and inclusive. Giselle's secrets: Sells forged paintings to wealthy clients",
+        p1_goal="Greet your friends and be polite",
+        p2_goal="Be rude and dismissive to your friends",
+    )
+
+    # Only one evaluation is awaited, so gather() returns a one-element list.
+    response2 = await asyncio.gather(
+        evaluator.__acall__(
+            1,
+            [
+                (
+                    "Environment",
+                    background,
+                ),
+                (
+                    "Environment",
+                    SimpleMessage(message="Turn #1"),
+                ),
+                (
+                    "Alice",
+                    AgentAction(action_type="speak", argument="Thank you so much!"),
+                ),
+                (
+                    "Environment",
+                    SimpleMessage(message="Turn #2"),
+                ),
+                (
+                    "Bob",
+                    AgentAction(action_type="speak", argument="Fuck you!"),
+                ),
+                (
+                    "Environment",
+                    SimpleMessage(message="Turn #3"),
+                ),
+                (
+                    "Alice",
+                    AgentAction(
+                        action_type="speak", argument="Hope you have a great weekend."
+                    ),
+                ),
+                ("Environment", SimpleMessage(message="Turn #4")),
+                (
+                    "Bob",
+                    AgentAction(action_type="leave", argument="Leave"),
+                ),
+            ],
+        ),
+    )
+    print("---------------------")
+    print("Response after 2 turns:", response2)
+
+    assert False  # Stop here to see all responses as we are not sure what should be the assertion here

From 269b4f210e4f7e77cf10a8c15ab7e4c98b48ff2e Mon Sep 17 00:00:00 2001
From: XuhuiZhou <zhouxuhui2018@gmail.com>
Date: Thu, 27 Jun 2024 18:16:08 -0700
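Subject: [PATCH 2/2] fix the single dimension bug

Spell out in the EvaluationGoalOnly field description that the first tuple
entry (str) is the 'reasoning' field and the second entry (int) is the
'score' field. Replace the placeholder `assert False` in the goal-only test
with a word-count check on the evaluator's response.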

---
 sotopia/envs/evaluators.py    | 2 +-
 tests/envs/test_evaluators.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/sotopia/envs/evaluators.py b/sotopia/envs/evaluators.py
index 41cd14a36..a0d56e655 100644
--- a/sotopia/envs/evaluators.py
+++ b/sotopia/envs/evaluators.py
@@ -148,7 +148,7 @@ class EvaluationGoalOnly(BaseModel):
         ...,
         description="Please first reiterate agent's social goals. "
         "And then please provide a comprehensive analysis about the extent to which the agent has managed to achieve these goals. "
-        "In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. 0 represents minimal goals achievement, 10 represents complete goal achievement, and a higher score indicates that the agent is making progress towards their social goals.",
+        "The first entry (str) of the object is the 'reasoning' field, and the second entry (int) of the object is the 'score' field. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. 0 represents minimal goals achievement, 10 represents complete goal achievement, and a higher score indicates that the agent is making progress towards their social goals.",
     )
 
     @validator("goal")
diff --git a/tests/envs/test_evaluators.py b/tests/envs/test_evaluators.py
index 70049825b..eacba3c4f 100644
--- a/tests/envs/test_evaluators.py
+++ b/tests/envs/test_evaluators.py
@@ -234,4 +234,6 @@ async def test_reach_goal_llm_evaluator_goalonly_async() -> None:
     print("---------------------")
     print("Response after 2 turns:", response2)
 
-    assert False  # Stop here to see all responses as we are not sure what should be the assertion here
+    assert len(response2[0][0][1][1].split()) > len(
+        "Samuel Anderson's goal was to greet his friends and be polite.".split()
+    )