
Commit

update w feedback
XuhuiZhou committed Jun 14, 2024
1 parent df1bfde commit 8352ea7
Showing 3 changed files with 7 additions and 21 deletions.
7 changes: 6 additions & 1 deletion docs/pages/benchmark.md
@@ -1,6 +1,11 @@
# Benchmark your model as a social agent in Sotopia

```
sotopia_benchmark --model=<your_model_name>
```
or

```
python sotopia/benchmark/cli.py --model=<your_model_name>
```
- Currently this script would run over 100 simulations on the Sotopia Hard tasks. And the partner model is fixed to be `together_ai/meta-llama/Llama-2-70b-chat-hf`
+ Currently this script runs over 100 simulations on the Sotopia Hard tasks, with the partner model fixed to `meta-llama/Llama-3-70b-chat-hf`.
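
The diff to `sotopia/benchmark/cli.py` below shows that the partner model is a `typer.Option` named `partner_model`, so it can presumably be overridden on the command line. Assuming typer's standard underscore-to-dash flag conversion, a hypothetical invocation sketch (not from the original docs) would be:

```shell
# Hypothetical override of the default partner model; flag name assumed
# from the partner_model typer.Option in sotopia/benchmark/cli.py.
sotopia_benchmark --model=<your_model_name> --partner-model=meta-llama/Llama-3-70b-chat-hf
```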
19 changes: 0 additions & 19 deletions docs/pages/examples.mdx
@@ -19,22 +19,3 @@ python examples/benchmark_evaluator.py --push-to-db --model=<the model used to b

## Example 2: Generate script-like episodes
See `docs/simulation_modes.md` for more information.

## Example 3: Benchmarking the models as social agents

```bash
EVAL_MODEL="gpt-4o-2024-05-13"
python examples/benchmark_social_agents.py \
--gin_file sotopia_conf/generation_utils_conf/generate.gin \
--gin_file sotopia_conf/server_conf/server.gin \
--gin_file sotopia_conf/run_async_server_in_batch.gin \
'--gin.ENV_IDS=[]' \
'--gin.AGENT1_MODEL="groq/llama3-70b-8192"' \
"--gin.AGENT2_MODEL=${EVAL_MODEL}" \
'--gin.BATCH_SIZE=10' \
"--gin.TAG=benchmark_${EVAL_MODEL}" \
"--gin.TAG_TO_CHECK_EXISTING_EPISODES=benchmark_${EVAL_MODEL}" \
'--gin.PUSH_TO_DB=True' \
'--gin.OMNISCIENT=False' \
'--gin.VERBOSE=False'
```
2 changes: 1 addition & 1 deletion sotopia/benchmark/cli.py
@@ -259,7 +259,7 @@ def _set_up_logs(
def cli(
    model: str = typer.Option(..., help="The language model you want to benchmark."),
    partner_model: str = typer.Option(
-        "together_ai/meta-llama/Llama-2-70b-chat-hf",
+        "meta-llama/Llama-3-70b-chat-hf",
        help="The partner model you want to use.",
    ),
    evaluator_model: str = typer.Option(
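The change above is just a new default value for a CLI option: when the user omits the flag, the new partner model applies. As a rough stdlib illustration of that behavior (the real CLI uses typer, not argparse; only the flag names and defaults are taken from the diff), a minimal sketch:

```python
import argparse

# Hypothetical argparse rendering of the benchmark CLI's options; the real
# implementation uses typer.Option, as shown in the diff above.
parser = argparse.ArgumentParser(description="benchmark CLI sketch")
parser.add_argument(
    "--model",
    required=True,
    help="The language model you want to benchmark.",
)
parser.add_argument(
    "--partner-model",
    default="meta-llama/Llama-3-70b-chat-hf",
    help="The partner model you want to use.",
)

# With --partner-model omitted, the new default applies.
args = parser.parse_args(["--model", "gpt-4o"])
print(args.partner_model)
```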
