Skip to content

Commit

Permalink
Merge pull request #9 from ivanleomk/gsm-8k
Browse files Browse the repository at this point in the history
Adding a GSM8K Script
  • Loading branch information
jxnl authored Aug 27, 2024
2 parents f889580 + a4f0b5d commit 04d2663
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 2 deletions.
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ huggingface-hub==0.22.2
idna==3.7
iniconfig==2.0.0
inline-snapshot==0.8.0
-instructor==1.2.0
+instructor==1.4.0
langsmith==0.1.48
markdown-it-py==3.0.0
mdurl==0.1.2
multidict==6.0.5
mypy-extensions==1.0.0
numpy==1.26.4
-openai==1.21.2
+openai==1.40.1
orjson==3.10.1
packaging==24.0
pandas==2.2.2
Expand Down
25 changes: 25 additions & 0 deletions scripts/generate_gsm8k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from datasets import load_dataset, Dataset, DatasetDict

# Names of the GSM8K splits.
# NOTE(review): this list is not referenced anywhere else in the visible
# script (the split names are passed as string literals further down) --
# confirm whether it can be dropped or should drive the dataset construction.
splits = ["test", "train"]


def generate_gsm8k(split):
    """Yield parsed GSM8K rows for one split.

    Streams the ``main`` configuration of the ``gsm8k`` dataset and divides
    each row's ``answer`` field at its ``####`` marker: the text before the
    marker is kept as the reasoning, the text after it is parsed as the final
    integer answer.

    Args:
        split: Name of the split to stream (this script passes "train" and
            "test").

    Yields:
        A dict with the keys "question", "answer" (an ``int``) and
        "reasoning" (the text before the marker, left unmodified).

    Raises:
        ValueError: If a row's answer has no ``####`` marker, or the text
            after the marker is not an integer once commas are removed.
    """
    ds = load_dataset("gsm8k", "main", split=split, streaming=True)
    for row in ds:
        # Split on the LAST marker only: the final answer always follows it,
        # and reasoning text that happens to contain "####" no longer breaks
        # the two-way unpacking.
        reasoning, answer = row["answer"].rsplit("####", 1)
        # Answers may carry thousands separators (e.g. "1,000").
        answer = int(answer.strip().replace(",", ""))
        yield {
            "question": row["question"],
            "answer": answer,
            "reasoning": reasoning,
        }


# Create the dataset for train and test splits.
# Each lambda is a zero-argument callable that returns a fresh
# generate_gsm8k(...) generator for the named split.
train_dataset = Dataset.from_generator(lambda: generate_gsm8k("train"))
test_dataset = Dataset.from_generator(lambda: generate_gsm8k("test"))

# Combine them into a DatasetDict keyed by split name.
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

# Publish the combined dataset under the "567-labs/gsm8k" repo id.
# NOTE(review): this runs at module import time (there is no __main__ guard),
# so merely importing this script triggers the upload -- confirm this is
# intended.
dataset.push_to_hub("567-labs/gsm8k")
67 changes: 67 additions & 0 deletions scripts/test_gsm8k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from braintrust import Eval, Score
from autoevals.value import ExactMatch
from datasets import load_dataset
from openai import AsyncOpenAI
from pydantic import BaseModel
import instructor
from asyncio import run
from uuid import uuid4

# NOTE(review): main() loads the "test" split into its own local variable
# named `dataset`, so this module-level load (called without a split argument)
# is never read in the visible script -- confirm whether it is still needed.
dataset = load_dataset("567-labs/gsm8k")
# Async OpenAI client shared by every eval; main() wraps it with instructor
# once per mode.
oai = AsyncOpenAI()


# Structured response requested from the model; passed as `response_model`
# in main().
# NOTE(review): documented with comments rather than a docstring on purpose --
# this class is handed to instructor, which may include a class docstring in
# the schema sent to the model. Confirm before adding one.
class Answer(BaseModel):
    # Free-text reasoning; main() records it as eval metadata only.
    chain_of_thought: str
    # Final integer answer; this is the value the eval task returns.
    answer: int


# instructor modes to compare; main() runs one eval per entry.
modes = [instructor.Mode.TOOLS, instructor.Mode.TOOLS_STRICT]


async def main():
    """Run the GSM8K eval once for each instructor mode in ``modes``.

    A single random id is generated per invocation and embedded in every
    experiment name, so the per-mode experiments of one run can be matched
    up afterwards. Each eval asks ``gpt-4o-mini`` every test question,
    requests an ``Answer`` response, and scores the returned integer with
    ``ExactMatch`` against the dataset's expected answer.
    """
    run_id = uuid4()
    print(f"Running eval with uuid: {run_id}")

    # The test split does not depend on the mode, so load it once instead of
    # once per loop iteration.
    test_rows = list(load_dataset("567-labs/gsm8k", split="test"))

    for eval_mode in modes:
        client = instructor.from_openai(oai, mode=eval_mode)

        # `task` closes over `client`; that is safe because each Eval is
        # awaited to completion before the next iteration rebinds it.
        async def task(question, hooks):
            """Ask the model one question and return its integer answer."""
            resp = await client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant that can solve math problems. Answer the question with the correct response",
                    },
                    {"role": "user", "content": question},
                ],
                response_model=Answer,
            )
            # Keep the reasoning as metadata; only the answer is returned
            # for scoring.
            hooks.meta(
                reasoning=resp.chain_of_thought,
            )
            return resp.answer

        await Eval(
            name="567-labs/gsm8k",
            experiment_name=f"gsm8k-{eval_mode}-{run_id}",
            # Fresh input/expected dicts are built each time the eval asks
            # for its data.
            data=lambda: [
                {
                    "input": row["question"],
                    "expected": row["answer"],
                }
                for row in test_rows
            ],
            task=task,
            scores=[ExactMatch],
            metadata={
                "model": "gpt-4o-mini",
                "mode": str(eval_mode),
                "n_samples": len(test_rows),
            },
            max_concurrency=10,
        )


# Script entry point: drive the async main() to completion.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so importing
# this module starts the full eval -- confirm this is intended.
run(main())

0 comments on commit 04d2663

Please sign in to comment.