diff --git a/requirements.txt b/requirements.txt
index 9e617bd..13ec322 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,14 +22,14 @@ huggingface-hub==0.22.2
 idna==3.7
 iniconfig==2.0.0
 inline-snapshot==0.8.0
-instructor==1.2.0
+instructor==1.4.0
 langsmith==0.1.48
 markdown-it-py==3.0.0
 mdurl==0.1.2
 multidict==6.0.5
 mypy-extensions==1.0.0
 numpy==1.26.4
-openai==1.21.2
+openai==1.40.1
 orjson==3.10.1
 packaging==24.0
 pandas==2.2.2
diff --git a/scripts/generate_gsm8k.py b/scripts/generate_gsm8k.py
new file mode 100644
index 0000000..54b6313
--- /dev/null
+++ b/scripts/generate_gsm8k.py
@@ -0,0 +1,25 @@
+from datasets import load_dataset, Dataset, DatasetDict
+
+splits = ["test", "train"]
+
+
+def generate_gsm8k(split):
+    ds = load_dataset("gsm8k", "main", split=split, streaming=True)
+    for row in ds:
+        reasoning, answer = row["answer"].split("####")
+        answer = int(answer.strip().replace(",", ""))
+        yield {
+            "question": row["question"],
+            "answer": answer,
+            "reasoning": reasoning,
+        }
+
+
+# Create the dataset for train and test splits
+train_dataset = Dataset.from_generator(lambda: generate_gsm8k("train"))
+test_dataset = Dataset.from_generator(lambda: generate_gsm8k("test"))
+
+# Combine them into a DatasetDict
+dataset = DatasetDict({"train": train_dataset, "test": test_dataset})
+
+dataset.push_to_hub("567-labs/gsm8k")
diff --git a/scripts/test_gsm8k.py b/scripts/test_gsm8k.py
new file mode 100644
index 0000000..87ee677
--- /dev/null
+++ b/scripts/test_gsm8k.py
@@ -0,0 +1,67 @@
+from braintrust import Eval, Score
+from autoevals.value import ExactMatch
+from datasets import load_dataset
+from openai import AsyncOpenAI
+from pydantic import BaseModel
+import instructor
+from asyncio import run
+from uuid import uuid4
+
+dataset = load_dataset("567-labs/gsm8k")
+oai = AsyncOpenAI()
+
+
+class Answer(BaseModel):
+    chain_of_thought: str
+    answer: int
+
+
+modes = [instructor.Mode.TOOLS, instructor.Mode.TOOLS_STRICT]
+
+
+async def main():
+    uuid = uuid4()
+    print(f"Running eval with uuid: {uuid}")
+    for eval_mode in modes:
+        client = instructor.from_openai(oai, mode=eval_mode)
+        dataset = list(load_dataset("567-labs/gsm8k", split="test"))
+
+        async def task(question, hooks):
+            resp = await client.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a helpful assistant that can solve math problems. Answer the question with the correct response",
+                    },
+                    {"role": "user", "content": question},
+                ],
+                response_model=Answer,
+            )
+            hooks.meta(
+                reasoning=resp.chain_of_thought,
+            )
+            return resp.answer
+
+        await Eval(
+            name="567-labs/gsm8k",
+            experiment_name=f"gsm8k-{eval_mode}-{uuid}",
+            data=lambda: [
+                {
+                    "input": row["question"],
+                    "expected": row["answer"],
+                }
+                for row in dataset
+            ],  # Replace with your eval dataset
+            task=task,
+            scores=[ExactMatch],
+            metadata={
+                "model": "gpt-4o-mini",
+                "mode": str(eval_mode),
+                "n_samples": len(dataset),
+            },
+            max_concurrency=10,
+        )
+
+
+run(main())