Skip to content

Commit

Permalink
Added a script to test gsm8k
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanleomk committed Aug 27, 2024
1 parent f889580 commit 3d8f678
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 2 deletions.
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ huggingface-hub==0.22.2
idna==3.7
iniconfig==2.0.0
inline-snapshot==0.8.0
instructor==1.2.0
instructor==1.4.0
langsmith==0.1.48
markdown-it-py==3.0.0
mdurl==0.1.2
multidict==6.0.5
mypy-extensions==1.0.0
numpy==1.26.4
openai==1.21.2
openai==1.40.1
orjson==3.10.1
packaging==24.0
pandas==2.2.2
Expand Down
67 changes: 67 additions & 0 deletions scripts/test_gsm8k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from braintrust import Eval, Score
from autoevals.value import ExactMatch
from datasets import load_dataset
from openai import AsyncOpenAI
from pydantic import BaseModel
import instructor
from asyncio import run
from uuid import uuid4

# NOTE(review): this module-level load pulls the dataset with no split and is
# shadowed by the split="test" reload inside main() -- confirm it is still needed.
dataset = load_dataset("567-labs/gsm8k")
# Shared async OpenAI client; main() wraps it with instructor once per eval mode.
oai = AsyncOpenAI()


class Answer(BaseModel):
    """Structured response schema instructor extracts from the model."""

    # Free-form reasoning the model produces before committing to an answer;
    # surfaced into the Braintrust trace via hooks.meta() in main().
    chain_of_thought: str
    # Final numeric GSM8K answer, scored with ExactMatch against the dataset.
    answer: int


modes = [instructor.Mode.TOOLS, instructor.Mode.TOOLS_STRICT]


async def main():
    """Run the GSM8K eval once per instructor mode, reporting to Braintrust.

    For each mode in ``modes``, wraps the shared OpenAI client with instructor,
    solves every test-split question with gpt-4o-mini into an ``Answer``, and
    scores the numeric answers with ExactMatch. No return value; results are
    uploaded by ``Eval`` and progress is printed to stdout.
    """
    # One id shared across all modes so this run's experiments group together.
    # (Named run_id rather than "uuid" to avoid shadowing the stdlib module.)
    run_id = uuid4()
    print(f"Running eval with uuid: {run_id}")

    # The test split is identical for every mode -- load it once, not per
    # loop iteration (the original re-fetched it inside the loop and also
    # shadowed the module-level `dataset`).
    test_rows = list(load_dataset("567-labs/gsm8k", split="test"))

    for eval_mode in modes:
        client = instructor.from_openai(oai, mode=eval_mode)

        async def task(question, hooks):
            # Extract a structured Answer; instructor retries/validates the schema.
            resp = await client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {
                        "role": "system",
                        "content": "You are a helpful assistant that can solve math problems. Answer the question with the correct response",
                    },
                    {"role": "user", "content": question},
                ],
                response_model=Answer,
            )
            # Attach the model's reasoning to the Braintrust trace metadata.
            hooks.meta(
                reasoning=resp.chain_of_thought,
            )
            return resp.answer

        await Eval(
            name="567-labs/gsm8k",
            experiment_name=f"gsm8k-{eval_mode}-{run_id}",
            data=lambda: [
                {
                    "input": row["question"],
                    "expected": row["answer"],
                }
                for row in test_rows
            ],  # Replace with your eval dataset
            task=task,
            scores=[ExactMatch],
            metadata={
                "model": "gpt-4o-mini",
                "mode": str(eval_mode),
                "n_samples": len(test_rows),
            },
            max_concurrency=10,
        )


run(main())

0 comments on commit 3d8f678

Please sign in to comment.