diff --git a/requirements.txt b/requirements.txt
index 9e617bd..13ec322 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,14 +22,14 @@ huggingface-hub==0.22.2
 idna==3.7
 iniconfig==2.0.0
 inline-snapshot==0.8.0
-instructor==1.2.0
+instructor==1.4.0
 langsmith==0.1.48
 markdown-it-py==3.0.0
 mdurl==0.1.2
 multidict==6.0.5
 mypy-extensions==1.0.0
 numpy==1.26.4
-openai==1.21.2
+openai==1.40.1
 orjson==3.10.1
 packaging==24.0
 pandas==2.2.2
diff --git a/scripts/generate_gsm8k.py b/scripts/generate_gsm8k.py
new file mode 100644
index 0000000..54b6313
--- /dev/null
+++ b/scripts/generate_gsm8k.py
@@ -0,0 +1,25 @@
+from datasets import load_dataset, Dataset, DatasetDict
+
+splits = ["test", "train"]
+
+
+def generate_gsm8k(split):
+    ds = load_dataset("gsm8k", "main", split=split, streaming=True)
+    for row in ds:
+        reasoning, answer = row["answer"].split("####")
+        answer = int(answer.strip().replace(",", ""))
+        yield {
+            "question": row["question"],
+            "answer": answer,
+            "reasoning": reasoning,
+        }
+
+
+# Create the dataset for train and test splits
+train_dataset = Dataset.from_generator(lambda: generate_gsm8k("train"))
+test_dataset = Dataset.from_generator(lambda: generate_gsm8k("test"))
+
+# Combine them into a DatasetDict
+dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
+
+dataset.push_to_hub("567-labs/gsm8k")
diff --git a/scripts/test_gsm8k.py b/scripts/test_gsm8k.py
new file mode 100644
index 0000000..87ee677
--- /dev/null
+++ b/scripts/test_gsm8k.py
@@ -0,0 +1,67 @@
+from braintrust import Eval, Score
+from autoevals.value import ExactMatch
+from datasets import load_dataset
+from openai import AsyncOpenAI
+from pydantic import BaseModel
+import instructor
+from asyncio import run
+from uuid import uuid4
+
+dataset = load_dataset("567-labs/gsm8k")
+oai = AsyncOpenAI()
+
+
+class Answer(BaseModel):
+    chain_of_thought: str
+    answer: int
+
+
+modes = [instructor.Mode.TOOLS, instructor.Mode.TOOLS_STRICT]
+
+
+async def main():
+    uuid = uuid4()
+    print(f"Running eval with uuid: {uuid}")
+    for eval_mode in modes:
+        client = instructor.from_openai(oai, mode=eval_mode)
+        dataset = list(load_dataset("567-labs/gsm8k", split="test"))
+
+        async def task(question, hooks):
+            resp = await client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a helpful assistant that can solve math problems. Answer the question with the correct response",
+                    },
+                    {"role": "user", "content": question},
+                ],
+                response_model=Answer,
+            )
+            hooks.meta(
+                reasoning=resp.chain_of_thought,
+            )
+            return resp.answer
+
+        await Eval(
+            name="567-labs/gsm8k",
+            experiment_name=f"gsm8k-{eval_mode}-{uuid}",
+            data=lambda: [
+                {
+                    "input": row["question"],
+                    "expected": row["answer"],
+                }
+                for row in dataset
+            ],  # Replace with your eval dataset
+            task=task,
+            scores=[ExactMatch],
+            metadata={
+                "model": "gpt-4o-mini",
+                "mode": str(eval_mode),
+                "n_samples": len(dataset),
+            },
+            max_concurrency=10,
+        )
+
+
+run(main())