AbanteAI · mentatai · Jan 24, 2025 · Jan 24, 2025
diff --git a/benchmark/prompts.py b/benchmark/prompts.py
@@ -1,5 +1,6 @@
 from typing import List
 from benchmark.model_utils import Messages
+from benchmark.game import Game
 
 
 def create_player_messages(
@@ -34,28 +35,63 @@ def create_player_messages(
     return messages
 
 
-def create_judge_messages(green_card: str, played_cards: List[str]) -> Messages:
-    """Create messages for the judge to select a winning card.
+def create_judge_messages(game: "Game", judge_idx: int) -> Messages:
+    """Create messages for the judge to select a winning card, including game history.
 
     Args:
-        green_card: The green card (adjective) for this round
-        played_cards: List of red cards that were played
+        game: The current game state
+        judge_idx: Index of the judging player
 
     Returns:
         Messages object containing the system and user messages
     """
     messages = Messages()
     messages.add_system(
-        "You are the judge in Apples to Apples, a word association game. "
-        "In each round, there is a green card and players play red cards "
-        "that they think best match the green card. As the judge, you need to pick the best match. "
-        "IMPORTANT: Your response must be in the format: 'reasoning | card_name' where card_name "
-        "must exactly match one of the played cards."
+        "You are playing Apples to Apples, a word association game. "
+        "In each round, there is a green card (an adjective) and players play red cards (nouns) "
+        "that they think best match the green card. The judge picks the best match."
     )
 
+    # Add game history
+    for round in game.rounds[:-1]:  # All rounds except current
+        messages.add_user(
+            f"Round {round.round_number + 1} - Green Card: {round.green_card}"
+        )
+
+        # Show played cards and thinking
+        for player_idx, move in round.moves.items():
+            if player_idx == judge_idx:
+                messages.add_assistant(
+                    f"Player {player_idx + 1} (You) played: {move.played_card}\n"
+                    f"Your thinking: {move.thinking}"
+                )
+            else:
+                messages.add_user(
+                    f"Player {player_idx + 1} played: {move.played_card}\n"
+                    f"Their thinking: {move.thinking}"
+                )
+
+        # Show judge's decision
+        if round.decision:
+            if round.judge == judge_idx:
+                messages.add_assistant(
+                    f"You (as judge) selected '{round.decision.winning_card}' as the winner.\n"
+                    f"Your reasoning: {round.decision.reasoning}"
+                )
+            else:
+                messages.add_user(
+                    f"Player {round.judge + 1} (judge) selected '{round.decision.winning_card}' as the winner.\n"
+                    f"Their reasoning: {round.decision.reasoning}"
+                )
+
+    # Current round
+    current_round = game.rounds[-1]
+    played_cards = [move.played_card for move in current_round.moves.values()]
     cards_list = "\n".join(f"- {card}" for card in played_cards)
+
     messages.add_user(
-        f"The green card is: {green_card}\n"
+        f"Current Round {current_round.round_number + 1}\n"
+        f"You are the judge. The green card is: {current_round.green_card}\n"
         f"The played red cards are:\n{cards_list}\n"
         "Which red card best matches the green card? "
         "Respond with your reasoning followed by the card name, separated by ' | '. "

diff --git a/benchmark/run.py b/benchmark/run.py
@@ -113,11 +113,9 @@ def model_judge_move(game: Game, model: str) -> tuple[str, str]:
     from benchmark.prompts import create_judge_messages
 
     round = game.rounds[-1]
-    green_card = round.green_card
     moves = round.moves
     played_cards = [move.played_card for move in moves.values()]
-
-    messages = create_judge_messages(green_card, played_cards)
+    messages = create_judge_messages(game, round.judge)
 
     response = None
     try: