"""Build a url/caption dataset from local WIT JSON files and push it to the Hub.

Every file matching ``data/wit_*.json`` is read as one JSON object; each
key becomes a ``url`` value and each value a ``caption`` value. The resulting
dataset is printed and pushed to the ``nielsr/wit_300m`` repository.
"""
# Provenance: contents of upload_dataset.py as created by patch
# feed4779119e7d9d3057c1266a078d394ad44ec9 ("Add script", Niels, 2024-10-08).

import glob
import json

from datasets import Dataset, load_from_disk

# Glob pattern selecting the input JSON files.
INPUT_GLOB = "data/wit_*.json"
# Directory used for the intermediate on-disk copy of the path dataset.
TMP_DIR = "tmp"
# Hub repository the final dataset is pushed to.
HUB_REPO_ID = "nielsr/wit_300m"


def read(batch):
    """Expand a batch of JSON file paths into url/caption rows.

    Args:
        batch: Mapping with a ``"path"`` key holding a list of JSON file
            paths. Each file must contain a single JSON object.

    Returns:
        A dict with two equally long lists: ``"url"`` (the keys of every
        loaded object, in file order) and ``"caption"`` (the matching
        values). The output usually has more rows than the input batch.

    Raises:
        AttributeError: If a file's top-level JSON value is not an object.
    """
    urls = []
    captions = []
    for path in batch["path"]:
        # JSON text is UTF-8; do not depend on the platform's default
        # encoding, which would mis-decode non-ASCII captions.
        with open(path, encoding="utf-8") as f:
            entries = json.load(f)
        # keys() and values() of one dict iterate in the same order, so the
        # two lists stay aligned row by row.
        urls.extend(entries.keys())
        captions.extend(entries.values())

    return {"url": urls, "caption": captions}


def main():
    """Collect the input files, build the dataset, print it and push it.

    Raises:
        FileNotFoundError: If no file matches ``INPUT_GLOB``.
    """
    # glob returns matches in arbitrary order; sort so that the row order of
    # the resulting dataset is reproducible between runs.
    paths = sorted(glob.glob(INPUT_GLOB))
    if not paths:
        # Fail early rather than pushing a dataset that contains no rows.
        raise FileNotFoundError(
            f"no files match {INPUT_GLOB!r}; nothing to upload"
        )

    ds = Dataset.from_dict({"path": paths})
    ds.save_to_disk(TMP_DIR)
    # now reload from disk and read the files
    # NOTE(review): the save/reload round trip is presumably there so the
    # dataset is backed by on-disk files before map() runs -- confirm.
    ds = load_from_disk(TMP_DIR)

    # batch_size=1 hands read() one file at a time; the "path" column is
    # dropped because read() returns a different number of rows than it gets.
    ds = ds.map(read, batched=True, batch_size=1, remove_columns=["path"])

    print(ds)

    ds.push_to_hub(HUB_REPO_ID)


if __name__ == "__main__":
    main()