From 426c964b86f379d19c008aa050a0b9ec5690e710 Mon Sep 17 00:00:00 2001 From: ccl-core <91942859+ccl-core@users.noreply.github.com> Date: Wed, 23 Oct 2024 16:05:25 +0200 Subject: [PATCH] Make field more robust with None/nan repeated input (#757) * Changes in field (more modular, more robust with none/nan values). * Add test with csv with nan values. * Add https://huggingface.co/datasets/teknium/OpenHermes-2.5?row=5 as an example of a dataset with repeated Null values for e2e test. --- datasets/0.8/pass-mini/output/images.jsonl | 12 +- .../output/levanti_train.jsonl | 20 +- .../1.0/huggingface-open-hermes/metadata.json | 323 ++++++++++++++++++ .../output/default.jsonl | 3 + datasets/1.0/pass-mini/output/images.jsonl | 12 +- .../mlcroissant/_src/datasets_test.py | 1 + .../_src/operation_graph/operations/field.py | 60 +++- .../operation_graph/operations/field_test.py | 13 + 8 files changed, 404 insertions(+), 40 deletions(-) create mode 100644 datasets/1.0/huggingface-open-hermes/metadata.json create mode 100644 datasets/1.0/huggingface-open-hermes/output/default.jsonl diff --git a/datasets/0.8/pass-mini/output/images.jsonl b/datasets/0.8/pass-mini/output/images.jsonl index 3c76c74a8..dcf704f20 100644 --- a/datasets/0.8/pass-mini/output/images.jsonl +++ b/datasets/0.8/pass-mini/output/images.jsonl @@ -1,8 +1,8 @@ -{"creator_uname": "PaperBird+Photography%3C3", "latitude": null, "longitude": null, "date_taken": "2007-05-06 06:11:48", "hash": "75f7305b1fd94044e14bdcdde469dbb2", "image_content": ">"} +{"creator_uname": "PaperBird+Photography%3C3", "latitude": "None", "longitude": "None", "date_taken": "2007-05-06 06:11:48", "hash": "75f7305b1fd94044e14bdcdde469dbb2", "image_content": ">"} {"creator_uname": "Chiara+Marra", "latitude": 38.23818, "longitude": 13.183593, "date_taken": "2007-05-04 15:46:43", "hash": "dd571a41a015354d92a859f7ef31201", "image_content": ">"} -{"creator_uname": "maplesbranch", "latitude": null, "longitude": null, "date_taken": "2006-05-01 07:34:13", "hash": "598ad3bc7e6e876e61af116693c7ad9", "image_content": ">"} -{"creator_uname": "maplesbranch", "latitude": null, "longitude": null, "date_taken": "2006-04-23 19:20:40", "hash": "e48d6d552465c5728585b82a53d6e02c", "image_content": ">"} -{"creator_uname": "quinnums", "latitude": null, "longitude": null, "date_taken": "2004-05-17 00:44:29", "hash": "ffd3eb12a16cb83138f26e6f36dec967", "image_content": ">"} +{"creator_uname": "maplesbranch", "latitude": "None", "longitude": "None", "date_taken": "2006-05-01 07:34:13", "hash": "598ad3bc7e6e876e61af116693c7ad9", "image_content": ">"} +{"creator_uname": "maplesbranch", "latitude": "None", "longitude": "None", "date_taken": "2006-04-23 19:20:40", "hash": "e48d6d552465c5728585b82a53d6e02c", "image_content": ">"} +{"creator_uname": "quinnums", "latitude": "None", "longitude": "None", "date_taken": "2004-05-17 00:44:29", "hash": "ffd3eb12a16cb83138f26e6f36dec967", "image_content": ">"} {"creator_uname": "striatic", "latitude": 53.535233, "longitude": -113.565075, "date_taken": "2004-05-11 02:00:33", "hash": "fff0eece99cc71c2e91fe716051599", "image_content": ">"} -{"creator_uname": "striatic", "latitude": null, "longitude": null, "date_taken": "2004-05-27 10:34:28", "hash": "fedefe9f11bf2a749a749bfca8bf28", "image_content": ">"} -{"creator_uname": "quinnums", "latitude": null, "longitude": null, "date_taken": "2004-05-29 02:14:36", "hash": "ff379727f52bcec4dfb237ace41627", "image_content": ">"} +{"creator_uname": "striatic", "latitude": "None", "longitude": "None", "date_taken": "2004-05-27 10:34:28", "hash": "fedefe9f11bf2a749a749bfca8bf28", "image_content": ">"} +{"creator_uname": "quinnums", "latitude": "None", "longitude": "None", "date_taken": "2004-05-29 02:14:36", "hash": "ff379727f52bcec4dfb237ace41627", "image_content": ">"} diff --git a/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl b/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl index e6e859717..6dcef5645 100644 --- a/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl +++ b/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl @@ -1,10 +1,10 @@ -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0627\u0646\u0627 \u062d\u0627\u0628\u0628 \u0627\u062d\u0643\u064a \u0645\u0639\u0643.", "levanti/hebrew": "\u05d0\u05e0\u05d9 \u05e8\u05d5\u05e6\u05d4 \u05dc\u05d3\u05d1\u05e8 \u05d0\u05d9\u05ea\u05da.", "levanti/english": "I want to talk to you.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0645\u0631\u062d\u0628\u0627 \u0623\u0631\u064a\u062c. -\u0645\u0631\u062d\u0628\u0627 \u062c\u0644\u0627\u0644.", "levanti/hebrew": "\u05e9\u05dc\u05d5\u05dd, \u05d0\u05e8\u05d9\u05d2'. -\u05e9\u05dc\u05d5\u05dd, \u05d2'\u05dc\u05d0\u05dc.", "levanti/english": "Hello, Areej. -Hello, Jalal.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0648\u0625\u0630\u0627 \u0645\u0634 \u0646\u0633\u0627\u064a\u0628\u0646\u0627 \u0648\u0644\u0627\u062f \u0639\u0645\u0651\u0646\u0627!", "levanti/hebrew": "\u05d5\u05d0\u05dd \u05dc\u05d0 \u05e7\u05e8\u05d5\u05d1\u05d9\u05dd \u05e9\u05dc\u05e0\u05d5 \u05d4\u05dd \u05d1\u05e0\u05d9 \u05d3\u05d5\u05d3\u05d9\u05e0\u05d5!", "levanti/english": "And if they are not our relatives, they are our cousins!", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Jordanian", "levanti/arabic": "\u0647\u0645 \u0645\u0634 \u0639\u0627\u062c\u0628\u0647\u0645 \u0625\u0646\u0648 \u0645\u062f\u064a\u0631\u062a\u0647\u0645 \u0633\u062a", "levanti/hebrew": "\u05d4\u05dd \u05dc\u05d0 \u05de\u05e8\u05d5\u05e6\u05d9\u05dd \u05de\u05db\u05da \u05e9\u05d4\u05de\u05e0\u05d4\u05dc\u05ea \u05e9\u05dc\u05d4\u05dd \u05d0\u05d9\u05e9\u05d4", "levanti/english": "They don't like that their manager is a woman.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Syrian", "levanti/arabic": "\u200f\u0627\u0644\u0634\u0628\u0643\u0627\u062a \u0643\u0627\u0646\u062a \u0645\u0644\u0627\u0646\u0629 \u0633\u0645\u0643 \u200e", "levanti/hebrew": "\u05d4\u05e8\u05e9\u05ea\u05d5\u05ea \u05d4\u05d9\u05d5 \u05de\u05dc\u05d0\u05d5\u05ea \u05d3\u05d2\u05d9\u05dd", "levanti/english": "The nets were full of fish.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Palestinian", "levanti/arabic": "\u200f\u0641\u064a\u0647 \u0645\u0631\u0627\u0643\u0632 \u0644\u0644\u062a\u062f\u0631\u064a\u0628 \u0627\u0644\u0645\u0647\u0646\u064a \u200e", "levanti/hebrew": "\u05d9\u05e9\u05e0\u05dd \u05de\u05e8\u05db\u05d6\u05d9\u05dd \u05dc\u05d4\u05db\u05e9\u05e8\u05d4 \u05de\u05e7\u05e6\u05d5\u05e2\u05d9\u05ea", "levanti/english": "There are centers for vocational training.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0627\u0642\u0639\u062f\u064a \u0648\u0627\u0633\u0643\u062a\u064a! - \u0633\u062f\u064a \u0628\u0648\u0632\u0643.", "levanti/hebrew": "\u05ea\u05e1\u05ea\u05de\u05d9 \u05d0\u05ea \u05d4\u05e4\u05d4.", "levanti/english": "Sit down and shut up! - Shut your mouth.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0645\u0646\u0630 \u0632\u0645\u0646 \u0637\u0648\u064a\u0644 \u064a\u0628\u062d\u062b \u0627\u0644\u064a\u0647\u0648\u062f \u0627\u0644\u0645\u062a\u062f\u064a\u0651\u0646\u0648\u0646 \u0639\u0646 \u0627\u0644\u0628\u0642\u0631\u0629 \u0627\u0644\u062d\u0645\u0631\u0627\u0621", "levanti/hebrew": "\u05d6\u05d4 \u05d6\u05de\u05df \u05e8\u05d1 \u05de\u05d7\u05e4\u05e9\u05d9\u05dd \u05d4\u05d9\u05d4\u05d5\u05d3\u05d9\u05dd \u05d4\u05d3\u05ea\u05d9\u05d9\u05dd \u05d0\u05ea \u05d4\u05e4\u05e8\u05d4 \u05d4\u05d0\u05d3\u05d5\u05de\u05d4.", "levanti/english": "For a long time, religious Jews have been searching for the red heifer.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0623\u0648\u0651\u0644 \u0645\u0648\u0636\u0648\u0639 \u0628\u0627\u062e\u062a\u0635\u0627\u0631 \u0643\u0628\u064a\u0631\u060c", "levanti/hebrew": "\u05d4\u05e0\u05d5\u05e9\u05d0 \u05d4\u05e8\u05d0\u05e9\u05d5\u05df, \u05d1\u05e7\u05d9\u05e6\u05d5\u05e8 \u05e0\u05de\u05e8\u05e5,", "levanti/english": "The first topic, in brief,", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Jordanian", "levanti/arabic": "\u064a\u0627 \u0648\u0644\u0627\u062f \u064a\u0644\u0627 \u062a\u0623\u062e\u0631\u0646\u0627\u060c \u0634\u0648 \u0628\u062a\u0633\u0627\u0648\u0627\u061f", "levanti/hebrew": "\u05d9\u05dc\u05d3\u05d9\u05dd! \u05d1\u05d5\u05d0\u05d5, \u05d0\u05e0\u05d7\u05e0\u05d5 \u05de\u05d0\u05d7\u05e8\u05d9\u05dd! \u05de\u05d4 \u05d0\u05ea\u05dd \u05e2\u05d5\u05e9\u05d9\u05dd?", "levanti/english": "Kids, let's go, we're late! What are you doing?", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0627\u0646\u0627 \u062d\u0627\u0628\u0628 \u0627\u062d\u0643\u064a \u0645\u0639\u0643.", "levanti/hebrew": "\u05d0\u05e0\u05d9 \u05e8\u05d5\u05e6\u05d4 \u05dc\u05d3\u05d1\u05e8 \u05d0\u05d9\u05ea\u05da.", "levanti/english": "I want to talk to you.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0645\u0631\u062d\u0628\u0627 \u0623\u0631\u064a\u062c. -\u0645\u0631\u062d\u0628\u0627 \u062c\u0644\u0627\u0644.", "levanti/hebrew": "\u05e9\u05dc\u05d5\u05dd, \u05d0\u05e8\u05d9\u05d2'. -\u05e9\u05dc\u05d5\u05dd, \u05d2'\u05dc\u05d0\u05dc.", "levanti/english": "Hello, Areej. -Hello, Jalal.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0648\u0625\u0630\u0627 \u0645\u0634 \u0646\u0633\u0627\u064a\u0628\u0646\u0627 \u0648\u0644\u0627\u062f \u0639\u0645\u0651\u0646\u0627!", "levanti/hebrew": "\u05d5\u05d0\u05dd \u05dc\u05d0 \u05e7\u05e8\u05d5\u05d1\u05d9\u05dd \u05e9\u05dc\u05e0\u05d5 \u05d4\u05dd \u05d1\u05e0\u05d9 \u05d3\u05d5\u05d3\u05d9\u05e0\u05d5!", "levanti/english": "And if they are not our relatives, they are our cousins!", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Jordanian", "levanti/arabic": "\u0647\u0645 \u0645\u0634 \u0639\u0627\u062c\u0628\u0647\u0645 \u0625\u0646\u0648 \u0645\u062f\u064a\u0631\u062a\u0647\u0645 \u0633\u062a", "levanti/hebrew": "\u05d4\u05dd \u05dc\u05d0 \u05de\u05e8\u05d5\u05e6\u05d9\u05dd \u05de\u05db\u05da \u05e9\u05d4\u05de\u05e0\u05d4\u05dc\u05ea \u05e9\u05dc\u05d4\u05dd \u05d0\u05d9\u05e9\u05d4", "levanti/english": "They don't like that their manager is a woman.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Syrian", "levanti/arabic": "\u200f\u0627\u0644\u0634\u0628\u0643\u0627\u062a \u0643\u0627\u0646\u062a \u0645\u0644\u0627\u0646\u0629 \u0633\u0645\u0643 \u200e", "levanti/hebrew": "\u05d4\u05e8\u05e9\u05ea\u05d5\u05ea \u05d4\u05d9\u05d5 \u05de\u05dc\u05d0\u05d5\u05ea \u05d3\u05d2\u05d9\u05dd", "levanti/english": "The nets were full of fish.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Palestinian", "levanti/arabic": "\u200f\u0641\u064a\u0647 \u0645\u0631\u0627\u0643\u0632 \u0644\u0644\u062a\u062f\u0631\u064a\u0628 \u0627\u0644\u0645\u0647\u0646\u064a \u200e", "levanti/hebrew": "\u05d9\u05e9\u05e0\u05dd \u05de\u05e8\u05db\u05d6\u05d9\u05dd \u05dc\u05d4\u05db\u05e9\u05e8\u05d4 \u05de\u05e7\u05e6\u05d5\u05e2\u05d9\u05ea", "levanti/english": "There are centers for vocational training.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0627\u0642\u0639\u062f\u064a \u0648\u0627\u0633\u0643\u062a\u064a! - \u0633\u062f\u064a \u0628\u0648\u0632\u0643.", "levanti/hebrew": "\u05ea\u05e1\u05ea\u05de\u05d9 \u05d0\u05ea \u05d4\u05e4\u05d4.", "levanti/english": "Sit down and shut up! - Shut your mouth.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0645\u0646\u0630 \u0632\u0645\u0646 \u0637\u0648\u064a\u0644 \u064a\u0628\u062d\u062b \u0627\u0644\u064a\u0647\u0648\u062f \u0627\u0644\u0645\u062a\u062f\u064a\u0651\u0646\u0648\u0646 \u0639\u0646 \u0627\u0644\u0628\u0642\u0631\u0629 \u0627\u0644\u062d\u0645\u0631\u0627\u0621", "levanti/hebrew": "\u05d6\u05d4 \u05d6\u05de\u05df \u05e8\u05d1 \u05de\u05d7\u05e4\u05e9\u05d9\u05dd \u05d4\u05d9\u05d4\u05d5\u05d3\u05d9\u05dd \u05d4\u05d3\u05ea\u05d9\u05d9\u05dd \u05d0\u05ea \u05d4\u05e4\u05e8\u05d4 \u05d4\u05d0\u05d3\u05d5\u05de\u05d4.", "levanti/english": "For a long time, religious Jews have been searching for the red heifer.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0623\u0648\u0651\u0644 \u0645\u0648\u0636\u0648\u0639 \u0628\u0627\u062e\u062a\u0635\u0627\u0631 \u0643\u0628\u064a\u0631\u060c", "levanti/hebrew": "\u05d4\u05e0\u05d5\u05e9\u05d0 \u05d4\u05e8\u05d0\u05e9\u05d5\u05df, \u05d1\u05e7\u05d9\u05e6\u05d5\u05e8 \u05e0\u05de\u05e8\u05e5,", "levanti/english": "The first topic, in brief,", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Jordanian", "levanti/arabic": "\u064a\u0627 \u0648\u0644\u0627\u062f \u064a\u0644\u0627 \u062a\u0623\u062e\u0631\u0646\u0627\u060c \u0634\u0648 \u0628\u062a\u0633\u0627\u0648\u0627\u061f", "levanti/hebrew": "\u05d9\u05dc\u05d3\u05d9\u05dd! \u05d1\u05d5\u05d0\u05d5, \u05d0\u05e0\u05d7\u05e0\u05d5 \u05de\u05d0\u05d7\u05e8\u05d9\u05dd! \u05de\u05d4 \u05d0\u05ea\u05dd \u05e2\u05d5\u05e9\u05d9\u05dd?", "levanti/english": "Kids, let's go, we're late! What are you doing?", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} diff --git a/datasets/1.0/huggingface-open-hermes/metadata.json b/datasets/1.0/huggingface-open-hermes/metadata.json new file mode 100644 index 000000000..2f4f4b0fc --- /dev/null +++ b/datasets/1.0/huggingface-open-hermes/metadata.json @@ -0,0 +1,323 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataBiases": "cr:dataBiases", + "dataCollection": "cr:dataCollection", + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "personalSensitiveInformation": "cr:personalSensitiveInformation", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform" + }, + "@type": "sc:Dataset", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "repo", + "name": "repo", + "description": "The Hugging Face git repository.", + "contentUrl": "https://huggingface.co/datasets/teknium/OpenHermes-2.5/tree/refs%2Fconvert%2Fparquet", + "encodingFormat": "git+https", + "sha256": "https://github.com/mlcommons/croissant/issues/80" + }, + { + "@type": "cr:FileSet", + "@id": "parquet-files-for-config-default", + "name": "parquet-files-for-config-default", + "description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).", + "containedIn": { + "@id": "repo" + }, + "encodingFormat": "application/x-parquet", + "includes": "default/*/*.parquet" + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "@id": "default", + "name": "default", + "description": "teknium/OpenHermes-2.5 - 'default' subset\n\nAdditional information:\n- 1 skipped column: conversations", + "field": [ + { + "@type": "cr:Field", + "@id": "default/custom_instruction", + "name": "default/custom_instruction", + "description": "Column 'custom_instruction' from the Hugging Face parquet file.", + "dataType": "sc:Boolean", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "custom_instruction" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/topic", + "name": "default/topic", + "description": "Column 'topic' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "topic" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/model_name", + "name": "default/model_name", + "description": "Column 'model_name' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "model_name" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/model", + "name": "default/model", + "description": "Column 'model' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "model" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/skip_prompt_formatting", + "name": "default/skip_prompt_formatting", + "description": "Column 'skip_prompt_formatting' from the Hugging Face parquet file.", + "dataType": "sc:Boolean", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "skip_prompt_formatting" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/category", + "name": "default/category", + "description": "Column 'category' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "category" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/views", + "name": "default/views", + "description": "Column 'views' from the Hugging Face parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "views" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/language", + "name": "default/language", + "description": "Column 'language' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "language" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/id", + "name": "default/id", + "description": "Column 'id' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "id" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/title", + "name": "default/title", + "description": "Column 'title' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "title" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/idx", + "name": "default/idx", + "description": "Column 'idx' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "idx" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/hash", + "name": "default/hash", + "description": "Column 'hash' from the Hugging Face parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "hash" + } + }, + "repeated": true + }, + { + "@type": "cr:Field", + "@id": "default/avatarUrl", + "name": "default/avatarUrl", + "description": "Column 'avatarUrl' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "avatarUrl" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/system_prompt", + "name": "default/system_prompt", + "description": "Column 'system_prompt' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "system_prompt" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/source", + "name": "default/source", + "description": "Column 'source' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "source" + } + } + } + ] + } + ], + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "OpenHermes-2.5", + "description": "\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Card for Dataset Name\n\t\n\nThis is the dataset that made OpenHermes 2.5 and Nous Hermes 2 series of models.\nSupport me on GitHub sponsors \u003C3 : https://github.com/sponsors/teknium1\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Details\n\t\n\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Description\n\t\n\nThe Open Hermes 2/2.5 and Nous Hermes 2 models have made significant advancements of SOTA LLM's over recent months, and are underpinned by this exact compilation and curation of many open source datasets and custom created synthetic… See the full description on the dataset page: https://huggingface.co/datasets/teknium/OpenHermes-2.5.", + "keywords": [ + "English", + "1M - 10M", + "json", + "Text", + "Datasets", + "pandas", + "Croissant", + "Polars", + "🇺🇸 Region: US", + "Synthetic", + "GPT-4", + "Distillation", + "Compilation" + ], + "url": "https://huggingface.co/datasets/teknium/OpenHermes-2.5" +} diff --git a/datasets/1.0/huggingface-open-hermes/output/default.jsonl b/datasets/1.0/huggingface-open-hermes/output/default.jsonl new file mode 100644 index 000000000..9e7724c86 --- /dev/null +++ b/datasets/1.0/huggingface-open-hermes/output/default.jsonl @@ -0,0 +1,3 @@ +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": "None", "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "multiple_choice", "default/views": "None", "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": "None", "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} diff --git a/datasets/1.0/pass-mini/output/images.jsonl b/datasets/1.0/pass-mini/output/images.jsonl index 588e01917..377f2cd99 100644 --- a/datasets/1.0/pass-mini/output/images.jsonl +++ b/datasets/1.0/pass-mini/output/images.jsonl @@ -1,8 +1,8 @@ -{"images/hash": "75f7305b1fd94044e14bdcdde469dbb2", "images/image_content": ">", "images/creator_uname": "PaperBird+Photography%3C3", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2007-05-06 06:11:48"} +{"images/hash": "75f7305b1fd94044e14bdcdde469dbb2", "images/image_content": ">", "images/creator_uname": "PaperBird+Photography%3C3", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2007-05-06 06:11:48"} {"images/hash": "dd571a41a015354d92a859f7ef31201", "images/image_content": ">", "images/creator_uname": "Chiara+Marra", "images/gps_coordinates": {"images/latitude": 38.23818, "images/longitude": 13.183593}, "images/date_taken": "2007-05-04 15:46:43"} -{"images/hash": "598ad3bc7e6e876e61af116693c7ad9", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2006-05-01 07:34:13"} -{"images/hash": "e48d6d552465c5728585b82a53d6e02c", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2006-04-23 19:20:40"} -{"images/hash": "ffd3eb12a16cb83138f26e6f36dec967", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-17 00:44:29"} +{"images/hash": "598ad3bc7e6e876e61af116693c7ad9", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2006-05-01 07:34:13"} +{"images/hash": "e48d6d552465c5728585b82a53d6e02c", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2006-04-23 19:20:40"} +{"images/hash": "ffd3eb12a16cb83138f26e6f36dec967", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2004-05-17 00:44:29"} {"images/hash": "fff0eece99cc71c2e91fe716051599", "images/image_content": ">", "images/creator_uname": "striatic", "images/gps_coordinates": {"images/latitude": 53.535233, "images/longitude": -113.565075}, "images/date_taken": "2004-05-11 02:00:33"} -{"images/hash": "fedefe9f11bf2a749a749bfca8bf28", "images/image_content": ">", "images/creator_uname": "striatic", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-27 10:34:28"} -{"images/hash": "ff379727f52bcec4dfb237ace41627", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-29 02:14:36"} +{"images/hash": "fedefe9f11bf2a749a749bfca8bf28", "images/image_content": ">", "images/creator_uname": "striatic", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2004-05-27 10:34:28"} +{"images/hash": "ff379727f52bcec4dfb237ace41627", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2004-05-29 02:14:36"} diff --git a/python/mlcroissant/mlcroissant/_src/datasets_test.py b/python/mlcroissant/mlcroissant/_src/datasets_test.py index 79fc1beef..89f52e41b 100644 --- a/python/mlcroissant/mlcroissant/_src/datasets_test.py +++ b/python/mlcroissant/mlcroissant/_src/datasets_test.py @@ -247,6 +247,7 @@ def test_nonhermetic_loading(version, dataset_name, record_set_name, num_records ["huggingface-anthropic-hh-rlhf/metadata.json", "red-team-attempts", 10, None], ["huggingface-c4/metadata.json", "data", 1, {"data/variant": "en"}], ["huggingface-levanti/metadata.json", "levanti_train", 10, None], + ["huggingface-open-hermes/metadata.json", "default", 3, None], ], ) def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, filters): diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index d84a6f65b..5abea2c27 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -38,6 +38,8 @@ def _is_repeated_field(field: Field | None) -> bool | None: def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: """Applies one transform to `value`.""" + if _is_na(value): + return value if transform.regex is not None: source_regex = re.compile(transform.regex) if isinstance(value, pathlib.PurePath): @@ -76,10 +78,13 @@ def apply_transforms_fn(value: Any, field: Field, repeated: bool = False) -> Any return value +def _is_na(value: Any) -> bool: + return not isinstance(value, (list, np.ndarray)) and pd.isna(value) + + def _cast_value(ctx: Context, value: Any, data_type: type | term.URIRef | None): """Casts the value `value` to the desired target data type `data_type`.""" - is_na = not isinstance(value, (list, np.ndarray)) and pd.isna(value) - if is_na: + if _is_na(value): return value elif data_type == DataType.IMAGE_OBJECT: if isinstance(value, deps.PIL_Image.Image): @@ -145,6 +150,31 @@ def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame: return df +def _populate_repeated_nested_subfield( + value: Any, field: Field, result: dict[str, Any] +) -> dict[str, Any]: + """Populates result with a field's nested subfields.""" + if not field.parent: + raise ValueError( + "Nested subfields can only be populated when the parent field exists!" + ) + parent_id = field.parent.id + existing_values = result.get(parent_id, None) + if existing_values: + if not _is_na(value) and len(value) != len(existing_values): + raise ValueError( + f"Lenghts of {field.id} doesn't match already stored items for " + f" {parent_id}" + ) + for i in range(len(existing_values)): + existing_values[i][field.id] = None if _is_na(value) else value[i] + else: + result[parent_id] = ( + [{field.id: None}] if _is_na(value) else [{field.id: v} for v in value] + ) + return result + + @dataclasses.dataclass(frozen=True, repr=False) class ReadFields(Operation): """Reads fields in a RecordSet from a Pandas DataFrame and applies transformations. @@ -187,17 +217,21 @@ def _get_result(row): f'Column "{column}" does not exist. Inspect the ancestors of the' f" field {field} to understand why. Possible fields: {df.columns}" ) - value = row[column] is_repeated = field.repeated or ( field.parent and _is_repeated_field(field.parent) ) - value = apply_transforms_fn(value, field=field, repeated=is_repeated) - if is_repeated: + value = apply_transforms_fn( + row[column], field=field, repeated=is_repeated + ) + if _is_na(value): + value = None + elif is_repeated: value = [ _cast_value(self.node.ctx, v, field.data_type) for v in value ] else: value = _cast_value(self.node.ctx, value, field.data_type) + if self.node.ctx.is_v0(): result[field.name] = value else: @@ -207,19 +241,9 @@ def _get_result(row): # Repeated nested sub-fields render as a list of dictionaries. if field.parent: if _is_repeated_field(field.parent): - if field.parent.id not in result: - result[field.parent.id] = [ - {field.id: v} for v in value - ] - else: - if len(value) != len(result[field.parent.id]): - raise ValueError( - f"Lenghts of {field.id} doesn't match" - " already stored items for" - f" {field.parent.id}" - ) - for i, v in enumerate(value): - result[field.parent.id][i][field.id] = v + result = _populate_repeated_nested_subfield( + value=value, field=field, result=result + ) # Non-repeated subfields render as a single dictionary. else: if field.parent.id not in result: diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py index 9d76430c9..eff8d36c7 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py @@ -83,6 +83,7 @@ def test_readfield_with_subfields(): f.write("latitude,longitude,names,surnames\n") f.write("1,1,Anna-Maria,Rossi-Bianchi\n") f.write("2,2,Giulia,Ferrari\n") + f.write("1,3,,\n") # Nodes to define metadata. distribution = [ FileObject( @@ -195,6 +196,18 @@ def test_readfield_with_subfields(): }, ], }, + { + "main/coordinates": { + "main/coordinates/latitude": 1, + "main/coordinates/longitude": 3, + }, + "main/checked_users": [ + { + "main/checked_users/name": None, + "main/checked_users/surname": None, + }, + ], + }, ] result = list(read_field.call(df)) assert result == expected