sotopia-lab · ProKil · Jul 2, 2024 · Jun 23, 2024 · Jun 28, 2024
diff --git a/sotopia/envs/evaluators.py b/sotopia/envs/evaluators.py
@@ -143,6 +143,20 @@ def minus_ten_to_zero_validator(cls, v: tuple[str, int]) -> tuple[str, int]:
         return v
 
 
+class EvaluationGoalOnly(BaseModel):  # Evaluation schema that carries a single dimension: `goal`.
+    goal: tuple[str, int] = Field(  # (reasoning, score) pair, as spelled out in the description text below.
+        ...,
+        description="Please first reiterate agent's social goals. "
+        "And then please provide a comprehensive analysis about the extent to which the agent has managed to achieve these goals. "
+        "The first entry (str) of the object is the 'reasoning' field, and the second entry (int) of the object is the 'score' field. In the 'reasoning' field, provide a comprehensive account of the logic or thought process that led you to your conclusion. Further, provide an integer score ranging from 0 and 10 in the 'score' field. 0 represents minimal goals achievement, 10 represents complete goal achievement, and a higher score indicates that the agent is making progress towards their social goals.",
+    )
+
+    @validator("goal")
+    def zero_to_ten_validator(cls, v: tuple[str, int]) -> tuple[str, int]:  # Range-check the score entry of `goal`.
+        assert v[1] >= 0 and v[1] <= 10  # v[1] is the int score and must lie in [0, 10]. NOTE(review): `assert` is skipped under `python -O`.
+        return v  # The value itself is returned unchanged.
+
+
 class EnvResponse(BaseModel):
     agent_1_evaluation: EvaluationBySocialDimensions
     agent_2_evaluation: EvaluationBySocialDimensions
@@ -153,6 +167,11 @@ class EnvResponsePlus(BaseModel):
     agent_2_evaluation: EvaluationBySocialDimensionsPlus
 
 
+class EnvResponseGoalOnly(BaseModel):  # Response schema pairing one goal-only evaluation per agent.
+    agent_1_evaluation: EvaluationGoalOnly  # Goal (reasoning, score) for agent 1.
+    agent_2_evaluation: EvaluationGoalOnly  # Goal (reasoning, score) for agent 2.
+
+
 class Evaluator(abc.ABC):
     def __init__(self) -> None:
         pass
@@ -268,7 +287,8 @@ async def __acall__(
         response_format_class = (
             EnvResponsePlus if self.response_format == "plus" else EnvResponse
         )
-
+        if self.response_format == "goal_only":
+            response_format_class = EnvResponseGoalOnly
         try:
             response: (
                 EnvResponsePlus | EnvResponse
@@ -281,9 +301,9 @@ async def __acall__(
                     {format_instructions}
                 """,
                 input_values=dict(history=history),
-                output_parser=PydanticOutputParser[EnvResponsePlus | EnvResponse](
-                    pydantic_object=response_format_class
-                ),
+                output_parser=PydanticOutputParser[
+                    EnvResponsePlus | EnvResponse | EnvResponseGoalOnly
+                ](pydantic_object=response_format_class),
                 temperature=temperature,
             )
             response_list = []

diff --git a/tests/envs/test_evaluators.py b/tests/envs/test_evaluators.py
@@ -7,7 +7,7 @@
     RuleBasedTerminatedEvaluator,
     unweighted_aggregate_evaluate,
 )
-from sotopia.messages import AgentAction, Observation
+from sotopia.messages import AgentAction, Observation, ScriptBackground, SimpleMessage
 
 
 def test_rule_based_teminated_evaluator() -> None:
@@ -173,3 +173,67 @@ async def test_reach_goal_llm_evaluator_async() -> None:
     assert isinstance(response2[8][1][0][1], int)
     assert isinstance(response2[9][1][0][1], int)
     assert response2[2][1][0][1] > response2[3][1][0][1]
+
+
+@pytest.mark.asyncio
+async def test_reach_goal_llm_evaluator_goalonly_async() -> None:  # Exercises ReachGoalLLMEvaluator with the "goal_only" response format.
+    evaluator = ReachGoalLLMEvaluator("gpt-4", response_format="goal_only")  # "goal_only" is the format string checked inside __acall__.
+    background = ScriptBackground(
+        scenario="Conversation between two friends at a trivia night",
+        p1_name="Samuel Anderson",
+        p2_name="Giselle Rousseau",
+        p1_background="Samuel Anderson is a 29-year-old male software developer. He/him pronouns. Samuel Anderson can cook very well. Personality and values description: Samuel Anderson, though somewhat impulsive and free-spirited, values enjoyment. His decision-making is often spontaneous, staying within familiar boundaries. Samuel's secrets: He was once a competitive figure skater.",
+        p2_background="Giselle Rousseau is a 21-year-old nonbinary art student. They/them pronouns. Giselle Rousseau enjoys biking and photography. Personality and values description: Giselle Rousseau, open-minded and outgoing yet sensitive, advocates care and fairness. Her decision-making is intuitive and inclusive. Giselle's secrets: Sells forged paintings to wealthy clients",
+        p1_goal="Greet your friends and be polite",
+        p2_goal="Be rude and dismissive to your friends",
+    )
+
+    # asyncio.gather returns a list of results, so response2[0] is the output of the single __acall__ below.
+    response2 = await asyncio.gather(
+        evaluator.__acall__(
+            1,  # First positional argument; presumably the turn number -- confirm against __acall__'s signature.
+            [
+                (
+                    "Environment",
+                    background,
+                ),
+                (
+                    "Environment",
+                    SimpleMessage(message="Turn #1"),
+                ),
+                (
+                    "Alice",  # NOTE(review): speakers are "Alice"/"Bob" while the background names Samuel/Giselle -- confirm this is intended.
+                    AgentAction(action_type="speak", argument="Thank you so much!"),
+                ),
+                (
+                    "Environment",
+                    SimpleMessage(message="Turn #2"),
+                ),
+                (
+                    "Bob",
+                    AgentAction(action_type="speak", argument="Fuck you!"),
+                ),
+                (
+                    "Environment",
+                    SimpleMessage(message="Turn #3"),
+                ),
+                (
+                    "Alice",
+                    AgentAction(
+                        action_type="speak", argument="Hope you have a great weekend."
+                    ),
+                ),
+                ("Environment", SimpleMessage(message="Turn #4")),
+                (
+                    "Bob",
+                    AgentAction(action_type="leave", argument="Leave"),
+                ),
+            ],
+        ),
+    )
+    print("---------------------")
+    print("Response after 2 turns:", response2)  # NOTE(review): label says "2 turns" but the history above has four "Turn #" markers.
+
+    assert len(response2[0][0][1][1].split()) > len(  # The indexed string (presumably the first entry's reasoning -- confirm) must have more words than the sentence below.
+        "Samuel Anderson's goal was to greet his friends and be polite.".split()
+    )