Skip to content

Commit

Permalink
Added a script to test gsm8k
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanleomk committed Aug 27, 2024
1 parent f889580 commit 3d8f678
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 2 deletions.
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ huggingface-hub==0.22.2
idna==3.7
iniconfig==2.0.0
inline-snapshot==0.8.0
instructor==1.2.0
instructor==1.4.0
langsmith==0.1.48
markdown-it-py==3.0.0
mdurl==0.1.2
multidict==6.0.5
mypy-extensions==1.0.0
numpy==1.26.4
openai==1.21.2
openai==1.40.1
orjson==3.10.1
packaging==24.0
pandas==2.2.2
Expand Down
67 changes: 67 additions & 0 deletions scripts/test_gsm8k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from braintrust import Eval, Score
from autoevals.value import ExactMatch
from datasets import load_dataset
from openai import AsyncOpenAI
from pydantic import BaseModel
import instructor
from asyncio import run
from uuid import uuid4

# NOTE(review): this module-level load pulls the dataset with no split and is
# shadowed by the split="test" reload inside main() -- confirm it is still needed.
dataset = load_dataset("567-labs/gsm8k")
# Shared async OpenAI client; main() wraps it with instructor once per eval mode.
oai = AsyncOpenAI()


class Answer(BaseModel):
    """Structured response schema instructor extracts from the model."""

    # Free-form reasoning the model produces before committing to an answer;
    # surfaced into the Braintrust trace via hooks.meta() in main().
    chain_of_thought: str
    # Final numeric GSM8K answer, scored with ExactMatch against the dataset.
    answer: int


modes = [instructor.Mode.TOOLS, instructor.Mode.TOOLS_STRICT]


async def main():
    """Run the GSM8K eval once per instructor mode, reporting to Braintrust.

    For each mode in ``modes``, wraps the shared OpenAI client with instructor,
    solves every test-split question with gpt-4o-mini into an ``Answer``, and
    scores the numeric answers with ExactMatch. No return value; results are
    uploaded by ``Eval`` and progress is printed to stdout.
    """
    # One id shared across all modes so this run's experiments group together.
    # (Named run_id rather than "uuid" to avoid shadowing the stdlib module.)
    run_id = uuid4()
    print(f"Running eval with uuid: {run_id}")

    # The test split is identical for every mode -- load it once, not per
    # loop iteration (the original re-fetched it inside the loop and also
    # shadowed the module-level `dataset`).
    test_rows = list(load_dataset("567-labs/gsm8k", split="test"))

    for eval_mode in modes:
        client = instructor.from_openai(oai, mode=eval_mode)

        async def task(question, hooks):
            # Extract a structured Answer; instructor retries/validates the schema.
            resp = await client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant that can solve math problems. Answer the question with the correct response",
                    },
                    {"role": "user", "content": question},
                ],
                response_model=Answer,
            )
            # Attach the model's reasoning to the Braintrust trace metadata.
            hooks.meta(
                reasoning=resp.chain_of_thought,
            )
            return resp.answer

        await Eval(
            name="567-labs/gsm8k",
            experiment_name=f"gsm8k-{eval_mode}-{run_id}",
            data=lambda: [
                {
                    "input": row["question"],
                    "expected": row["answer"],
                }
                for row in test_rows
            ],  # Replace with your eval dataset
            task=task,
            scores=[ExactMatch],
            metadata={
                "model": "gpt-4o-mini",
                "mode": str(eval_mode),
                "n_samples": len(test_rows),
            },
            max_concurrency=10,
        )


run(main())

0 comments on commit 3d8f678

Please sign in to comment.