Skip to content

Commit

Permalink
Adding a generate script
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanleomk committed Aug 27, 2024
1 parent 3d8f678 commit a4f0b5d
Showing 1 changed file with 25 additions and 0 deletions.
25 changes: 25 additions & 0 deletions scripts/generate_gsm8k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from datasets import load_dataset, Dataset, DatasetDict

# Names of the GSM8K splits.
# NOTE(review): nothing in the code visible here reads this list (the split
# names are hard-coded again further down) -- confirm whether it is needed.
splits = ["test", "train"]


def generate_gsm8k(split):
    """Yield GSM8K rows with the final answer separated from the reasoning.

    Streams the ``main`` configuration of the ``gsm8k`` dataset for the
    requested split. Each row's raw ``answer`` field is cut at its last
    ``####`` marker: the text before the marker is kept as the reasoning and
    the text after it is parsed as the integer answer.

    Args:
        split: Split name forwarded to ``load_dataset`` (this script passes
            ``"train"`` and ``"test"``).

    Yields:
        dict: ``{"question": str, "answer": int, "reasoning": str}``. The
        reasoning is the untouched text preceding the marker, so any trailing
        whitespace before ``####`` is preserved.

    Raises:
        ValueError: If a row's ``answer`` field has no ``####`` marker, or the
            text after the marker is not an integer once surrounding
            whitespace and thousands separators (``,``) are removed.
    """
    ds = load_dataset("gsm8k", "main", split=split, streaming=True)
    for row in ds:
        # Split on the LAST marker so a stray "####" inside the reasoning
        # cannot break the two-way unpack that a plain split() relies on.
        reasoning, marker, answer = row["answer"].rpartition("####")
        if not marker:
            # rpartition returns an empty separator when nothing matched.
            raise ValueError(
                f"GSM8K answer field has no '####' marker: {row['answer']!r}"
            )
        yield {
            "question": row["question"],
            # Answers may be written with thousands separators, e.g. "1,000".
            "answer": int(answer.strip().replace(",", "")),
            "reasoning": reasoning,
        }


# Materialise one Dataset per split by draining generate_gsm8k; each
# zero-argument lambda pins the split name it should stream.
train_dataset = Dataset.from_generator(lambda: generate_gsm8k("train"))
test_dataset = Dataset.from_generator(lambda: generate_gsm8k("test"))

# Bundle both splits under their split names, then upload the result to the
# "567-labs/gsm8k" repository on the Hub.
dataset = DatasetDict(
    {
        "train": train_dataset,
        "test": test_dataset,
    }
)
dataset.push_to_hub("567-labs/gsm8k")

0 comments on commit a4f0b5d

Please sign in to comment.