diff --git a/datasets/0.8/pass-mini/output/images.jsonl b/datasets/0.8/pass-mini/output/images.jsonl index 3c76c74a8..dcf704f20 100644 --- a/datasets/0.8/pass-mini/output/images.jsonl +++ b/datasets/0.8/pass-mini/output/images.jsonl @@ -1,8 +1,8 @@ -{"creator_uname": "PaperBird+Photography%3C3", "latitude": null, "longitude": null, "date_taken": "2007-05-06 06:11:48", "hash": "75f7305b1fd94044e14bdcdde469dbb2", "image_content": ">"} +{"creator_uname": "PaperBird+Photography%3C3", "latitude": "None", "longitude": "None", "date_taken": "2007-05-06 06:11:48", "hash": "75f7305b1fd94044e14bdcdde469dbb2", "image_content": ">"} {"creator_uname": "Chiara+Marra", "latitude": 38.23818, "longitude": 13.183593, "date_taken": "2007-05-04 15:46:43", "hash": "dd571a41a015354d92a859f7ef31201", "image_content": ">"} -{"creator_uname": "maplesbranch", "latitude": null, "longitude": null, "date_taken": "2006-05-01 07:34:13", "hash": "598ad3bc7e6e876e61af116693c7ad9", "image_content": ">"} -{"creator_uname": "maplesbranch", "latitude": null, "longitude": null, "date_taken": "2006-04-23 19:20:40", "hash": "e48d6d552465c5728585b82a53d6e02c", "image_content": ">"} -{"creator_uname": "quinnums", "latitude": null, "longitude": null, "date_taken": "2004-05-17 00:44:29", "hash": "ffd3eb12a16cb83138f26e6f36dec967", "image_content": ">"} +{"creator_uname": "maplesbranch", "latitude": "None", "longitude": "None", "date_taken": "2006-05-01 07:34:13", "hash": "598ad3bc7e6e876e61af116693c7ad9", "image_content": ">"} +{"creator_uname": "maplesbranch", "latitude": "None", "longitude": "None", "date_taken": "2006-04-23 19:20:40", "hash": "e48d6d552465c5728585b82a53d6e02c", "image_content": ">"} +{"creator_uname": "quinnums", "latitude": "None", "longitude": "None", "date_taken": "2004-05-17 00:44:29", "hash": "ffd3eb12a16cb83138f26e6f36dec967", "image_content": ">"} {"creator_uname": "striatic", "latitude": 53.535233, "longitude": -113.565075, "date_taken": "2004-05-11 02:00:33", "hash": "fff0eece99cc71c2e91fe716051599", "image_content": ">"} -{"creator_uname": "striatic", "latitude": null, "longitude": null, "date_taken": "2004-05-27 10:34:28", "hash": "fedefe9f11bf2a749a749bfca8bf28", "image_content": ">"} -{"creator_uname": "quinnums", "latitude": null, "longitude": null, "date_taken": "2004-05-29 02:14:36", "hash": "ff379727f52bcec4dfb237ace41627", "image_content": ">"} +{"creator_uname": "striatic", "latitude": "None", "longitude": "None", "date_taken": "2004-05-27 10:34:28", "hash": "fedefe9f11bf2a749a749bfca8bf28", "image_content": ">"} +{"creator_uname": "quinnums", "latitude": "None", "longitude": "None", "date_taken": "2004-05-29 02:14:36", "hash": "ff379727f52bcec4dfb237ace41627", "image_content": ">"} diff --git a/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl b/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl index e6e859717..6dcef5645 100644 --- a/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl +++ b/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl @@ -1,10 +1,10 @@ -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0627\u0646\u0627 \u062d\u0627\u0628\u0628 \u0627\u062d\u0643\u064a \u0645\u0639\u0643.", "levanti/hebrew": "\u05d0\u05e0\u05d9 \u05e8\u05d5\u05e6\u05d4 \u05dc\u05d3\u05d1\u05e8 \u05d0\u05d9\u05ea\u05da.", "levanti/english": "I want to talk to you.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0645\u0631\u062d\u0628\u0627 \u0623\u0631\u064a\u062c. -\u0645\u0631\u062d\u0628\u0627 \u062c\u0644\u0627\u0644.", "levanti/hebrew": "\u05e9\u05dc\u05d5\u05dd, \u05d0\u05e8\u05d9\u05d2'. -\u05e9\u05dc\u05d5\u05dd, \u05d2'\u05dc\u05d0\u05dc.", "levanti/english": "Hello, Areej. -Hello, Jalal.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0648\u0625\u0630\u0627 \u0645\u0634 \u0646\u0633\u0627\u064a\u0628\u0646\u0627 \u0648\u0644\u0627\u062f \u0639\u0645\u0651\u0646\u0627!", "levanti/hebrew": "\u05d5\u05d0\u05dd \u05dc\u05d0 \u05e7\u05e8\u05d5\u05d1\u05d9\u05dd \u05e9\u05dc\u05e0\u05d5 \u05d4\u05dd \u05d1\u05e0\u05d9 \u05d3\u05d5\u05d3\u05d9\u05e0\u05d5!", "levanti/english": "And if they are not our relatives, they are our cousins!", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Jordanian", "levanti/arabic": "\u0647\u0645 \u0645\u0634 \u0639\u0627\u062c\u0628\u0647\u0645 \u0625\u0646\u0648 \u0645\u062f\u064a\u0631\u062a\u0647\u0645 \u0633\u062a", "levanti/hebrew": "\u05d4\u05dd \u05dc\u05d0 \u05de\u05e8\u05d5\u05e6\u05d9\u05dd \u05de\u05db\u05da \u05e9\u05d4\u05de\u05e0\u05d4\u05dc\u05ea \u05e9\u05dc\u05d4\u05dd \u05d0\u05d9\u05e9\u05d4", "levanti/english": "They don't like that their manager is a woman.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Syrian", "levanti/arabic": "\u200f\u0627\u0644\u0634\u0628\u0643\u0627\u062a \u0643\u0627\u0646\u062a \u0645\u0644\u0627\u0646\u0629 \u0633\u0645\u0643 \u200e", "levanti/hebrew": "\u05d4\u05e8\u05e9\u05ea\u05d5\u05ea \u05d4\u05d9\u05d5 \u05de\u05dc\u05d0\u05d5\u05ea \u05d3\u05d2\u05d9\u05dd", "levanti/english": "The nets were full of fish.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Palestinian", "levanti/arabic": "\u200f\u0641\u064a\u0647 \u0645\u0631\u0627\u0643\u0632 \u0644\u0644\u062a\u062f\u0631\u064a\u0628 \u0627\u0644\u0645\u0647\u0646\u064a \u200e", "levanti/hebrew": "\u05d9\u05e9\u05e0\u05dd \u05de\u05e8\u05db\u05d6\u05d9\u05dd \u05dc\u05d4\u05db\u05e9\u05e8\u05d4 \u05de\u05e7\u05e6\u05d5\u05e2\u05d9\u05ea", "levanti/english": "There are centers for vocational training.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0627\u0642\u0639\u062f\u064a \u0648\u0627\u0633\u0643\u062a\u064a! - \u0633\u062f\u064a \u0628\u0648\u0632\u0643.", "levanti/hebrew": "\u05ea\u05e1\u05ea\u05de\u05d9 \u05d0\u05ea \u05d4\u05e4\u05d4.", "levanti/english": "Sit down and shut up! - Shut your mouth.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0645\u0646\u0630 \u0632\u0645\u0646 \u0637\u0648\u064a\u0644 \u064a\u0628\u062d\u062b \u0627\u0644\u064a\u0647\u0648\u062f \u0627\u0644\u0645\u062a\u062f\u064a\u0651\u0646\u0648\u0646 \u0639\u0646 \u0627\u0644\u0628\u0642\u0631\u0629 \u0627\u0644\u062d\u0645\u0631\u0627\u0621", "levanti/hebrew": "\u05d6\u05d4 \u05d6\u05de\u05df \u05e8\u05d1 \u05de\u05d7\u05e4\u05e9\u05d9\u05dd \u05d4\u05d9\u05d4\u05d5\u05d3\u05d9\u05dd \u05d4\u05d3\u05ea\u05d9\u05d9\u05dd \u05d0\u05ea \u05d4\u05e4\u05e8\u05d4 \u05d4\u05d0\u05d3\u05d5\u05de\u05d4.", "levanti/english": "For a long time, religious Jews have been searching for the red heifer.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0623\u0648\u0651\u0644 \u0645\u0648\u0636\u0648\u0639 \u0628\u0627\u062e\u062a\u0635\u0627\u0631 \u0643\u0628\u064a\u0631\u060c", "levanti/hebrew": "\u05d4\u05e0\u05d5\u05e9\u05d0 \u05d4\u05e8\u05d0\u05e9\u05d5\u05df, \u05d1\u05e7\u05d9\u05e6\u05d5\u05e8 \u05e0\u05de\u05e8\u05e5,", "levanti/english": "The first topic, in brief,", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Jordanian", "levanti/arabic": "\u064a\u0627 \u0648\u0644\u0627\u062f \u064a\u0644\u0627 \u062a\u0623\u062e\u0631\u0646\u0627\u060c \u0634\u0648 \u0628\u062a\u0633\u0627\u0648\u0627\u061f", "levanti/hebrew": "\u05d9\u05dc\u05d3\u05d9\u05dd! \u05d1\u05d5\u05d0\u05d5, \u05d0\u05e0\u05d7\u05e0\u05d5 \u05de\u05d0\u05d7\u05e8\u05d9\u05dd! \u05de\u05d4 \u05d0\u05ea\u05dd \u05e2\u05d5\u05e9\u05d9\u05dd?", "levanti/english": "Kids, let's go, we're late! What are you doing?", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0627\u0646\u0627 \u062d\u0627\u0628\u0628 \u0627\u062d\u0643\u064a \u0645\u0639\u0643.", "levanti/hebrew": "\u05d0\u05e0\u05d9 \u05e8\u05d5\u05e6\u05d4 \u05dc\u05d3\u05d1\u05e8 \u05d0\u05d9\u05ea\u05da.", "levanti/english": "I want to talk to you.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0645\u0631\u062d\u0628\u0627 \u0623\u0631\u064a\u062c. -\u0645\u0631\u062d\u0628\u0627 \u062c\u0644\u0627\u0644.", "levanti/hebrew": "\u05e9\u05dc\u05d5\u05dd, \u05d0\u05e8\u05d9\u05d2'. -\u05e9\u05dc\u05d5\u05dd, \u05d2'\u05dc\u05d0\u05dc.", "levanti/english": "Hello, Areej. -Hello, Jalal.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0648\u0625\u0630\u0627 \u0645\u0634 \u0646\u0633\u0627\u064a\u0628\u0646\u0627 \u0648\u0644\u0627\u062f \u0639\u0645\u0651\u0646\u0627!", "levanti/hebrew": "\u05d5\u05d0\u05dd \u05dc\u05d0 \u05e7\u05e8\u05d5\u05d1\u05d9\u05dd \u05e9\u05dc\u05e0\u05d5 \u05d4\u05dd \u05d1\u05e0\u05d9 \u05d3\u05d5\u05d3\u05d9\u05e0\u05d5!", "levanti/english": "And if they are not our relatives, they are our cousins!", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Jordanian", "levanti/arabic": "\u0647\u0645 \u0645\u0634 \u0639\u0627\u062c\u0628\u0647\u0645 \u0625\u0646\u0648 \u0645\u062f\u064a\u0631\u062a\u0647\u0645 \u0633\u062a", "levanti/hebrew": "\u05d4\u05dd \u05dc\u05d0 \u05de\u05e8\u05d5\u05e6\u05d9\u05dd \u05de\u05db\u05da \u05e9\u05d4\u05de\u05e0\u05d4\u05dc\u05ea \u05e9\u05dc\u05d4\u05dd \u05d0\u05d9\u05e9\u05d4", "levanti/english": "They don't like that their manager is a woman.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Syrian", "levanti/arabic": "\u200f\u0627\u0644\u0634\u0628\u0643\u0627\u062a \u0643\u0627\u0646\u062a \u0645\u0644\u0627\u0646\u0629 \u0633\u0645\u0643 \u200e", "levanti/hebrew": "\u05d4\u05e8\u05e9\u05ea\u05d5\u05ea \u05d4\u05d9\u05d5 \u05de\u05dc\u05d0\u05d5\u05ea \u05d3\u05d2\u05d9\u05dd", "levanti/english": "The nets were full of fish.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Palestinian", "levanti/arabic": "\u200f\u0641\u064a\u0647 \u0645\u0631\u0627\u0643\u0632 \u0644\u0644\u062a\u062f\u0631\u064a\u0628 \u0627\u0644\u0645\u0647\u0646\u064a \u200e", "levanti/hebrew": "\u05d9\u05e9\u05e0\u05dd \u05de\u05e8\u05db\u05d6\u05d9\u05dd \u05dc\u05d4\u05db\u05e9\u05e8\u05d4 \u05de\u05e7\u05e6\u05d5\u05e2\u05d9\u05ea", "levanti/english": "There are centers for vocational training.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0627\u0642\u0639\u062f\u064a \u0648\u0627\u0633\u0643\u062a\u064a! - \u0633\u062f\u064a \u0628\u0648\u0632\u0643.", "levanti/hebrew": "\u05ea\u05e1\u05ea\u05de\u05d9 \u05d0\u05ea \u05d4\u05e4\u05d4.", "levanti/english": "Sit down and shut up! - Shut your mouth.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0645\u0646\u0630 \u0632\u0645\u0646 \u0637\u0648\u064a\u0644 \u064a\u0628\u062d\u062b \u0627\u0644\u064a\u0647\u0648\u062f \u0627\u0644\u0645\u062a\u062f\u064a\u0651\u0646\u0648\u0646 \u0639\u0646 \u0627\u0644\u0628\u0642\u0631\u0629 \u0627\u0644\u062d\u0645\u0631\u0627\u0621", "levanti/hebrew": "\u05d6\u05d4 \u05d6\u05de\u05df \u05e8\u05d1 \u05de\u05d7\u05e4\u05e9\u05d9\u05dd \u05d4\u05d9\u05d4\u05d5\u05d3\u05d9\u05dd \u05d4\u05d3\u05ea\u05d9\u05d9\u05dd \u05d0\u05ea \u05d4\u05e4\u05e8\u05d4 \u05d4\u05d0\u05d3\u05d5\u05de\u05d4.", "levanti/english": "For a long time, religious Jews have been searching for the red heifer.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0623\u0648\u0651\u0644 \u0645\u0648\u0636\u0648\u0639 \u0628\u0627\u062e\u062a\u0635\u0627\u0631 \u0643\u0628\u064a\u0631\u060c", "levanti/hebrew": "\u05d4\u05e0\u05d5\u05e9\u05d0 \u05d4\u05e8\u05d0\u05e9\u05d5\u05df, \u05d1\u05e7\u05d9\u05e6\u05d5\u05e8 \u05e0\u05de\u05e8\u05e5,", "levanti/english": "The first topic, in brief,", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Jordanian", "levanti/arabic": "\u064a\u0627 \u0648\u0644\u0627\u062f \u064a\u0644\u0627 \u062a\u0623\u062e\u0631\u0646\u0627\u060c \u0634\u0648 \u0628\u062a\u0633\u0627\u0648\u0627\u061f", "levanti/hebrew": "\u05d9\u05dc\u05d3\u05d9\u05dd! \u05d1\u05d5\u05d0\u05d5, \u05d0\u05e0\u05d7\u05e0\u05d5 \u05de\u05d0\u05d7\u05e8\u05d9\u05dd! \u05de\u05d4 \u05d0\u05ea\u05dd \u05e2\u05d5\u05e9\u05d9\u05dd?", "levanti/english": "Kids, let's go, we're late! What are you doing?", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} diff --git a/datasets/1.0/huggingface-open-hermes/metadata.json b/datasets/1.0/huggingface-open-hermes/metadata.json new file mode 100644 index 000000000..2f4f4b0fc --- /dev/null +++ b/datasets/1.0/huggingface-open-hermes/metadata.json @@ -0,0 +1,323 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataBiases": "cr:dataBiases", + "dataCollection": "cr:dataCollection", + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "personalSensitiveInformation": "cr:personalSensitiveInformation", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform" + }, + "@type": "sc:Dataset", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "repo", + "name": "repo", + "description": "The Hugging Face git repository.", + "contentUrl": "https://huggingface.co/datasets/teknium/OpenHermes-2.5/tree/refs%2Fconvert%2Fparquet", + "encodingFormat": "git+https", + "sha256": "https://github.com/mlcommons/croissant/issues/80" + }, + { + "@type": "cr:FileSet", + "@id": "parquet-files-for-config-default", + "name": "parquet-files-for-config-default", + "description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).", + "containedIn": { + "@id": "repo" + }, + "encodingFormat": "application/x-parquet", + "includes": "default/*/*.parquet" + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "@id": "default", + "name": "default", + "description": "teknium/OpenHermes-2.5 - 'default' subset\n\nAdditional information:\n- 1 skipped column: conversations", + "field": [ + { + "@type": "cr:Field", + "@id": "default/custom_instruction", + "name": "default/custom_instruction", + "description": "Column 'custom_instruction' from the Hugging Face parquet file.", + "dataType": "sc:Boolean", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "custom_instruction" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/topic", + "name": "default/topic", + "description": "Column 'topic' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "topic" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/model_name", + "name": "default/model_name", + "description": "Column 'model_name' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "model_name" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/model", + "name": "default/model", + "description": "Column 'model' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "model" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/skip_prompt_formatting", + "name": "default/skip_prompt_formatting", + "description": "Column 'skip_prompt_formatting' from the Hugging Face parquet file.", + "dataType": "sc:Boolean", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "skip_prompt_formatting" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/category", + "name": "default/category", + "description": "Column 'category' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "category" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/views", + "name": "default/views", + "description": "Column 'views' from the Hugging Face parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "views" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/language", + "name": "default/language", + "description": "Column 'language' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "language" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/id", + "name": "default/id", + "description": "Column 'id' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "id" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/title", + "name": "default/title", + "description": "Column 'title' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "title" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/idx", + "name": "default/idx", + "description": "Column 'idx' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "idx" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/hash", + "name": "default/hash", + "description": "Column 'hash' from the Hugging Face parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "hash" + } + }, + "repeated": true + }, + { + "@type": "cr:Field", + "@id": "default/avatarUrl", + "name": "default/avatarUrl", + "description": "Column 'avatarUrl' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "avatarUrl" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/system_prompt", + "name": "default/system_prompt", + "description": "Column 'system_prompt' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "system_prompt" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/source", + "name": "default/source", + "description": "Column 'source' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "source" + } + } + } + ] + } + ], + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "OpenHermes-2.5", + "description": "\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Card for Dataset Name\n\t\n\nThis is the dataset that made OpenHermes 2.5 and Nous Hermes 2 series of models.\nSupport me on GitHub sponsors \u003C3 : https://github.com/sponsors/teknium1\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Details\n\t\n\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Description\n\t\n\nThe Open Hermes 2/2.5 and Nous Hermes 2 models have made significant advancements of SOTA LLM's over recent months, and are underpinned by this exact compilation and curation of many open source datasets and custom created synthetic… See the full description on the dataset page: https://huggingface.co/datasets/teknium/OpenHermes-2.5.", + "keywords": [ + "English", + "1M - 10M", + "json", + "Text", + "Datasets", + "pandas", + "Croissant", + "Polars", + "🇺🇸 Region: US", + "Synthetic", + "GPT-4", + "Distillation", + "Compilation" + ], + "url": "https://huggingface.co/datasets/teknium/OpenHermes-2.5" +} diff --git a/datasets/1.0/huggingface-open-hermes/output/default.jsonl b/datasets/1.0/huggingface-open-hermes/output/default.jsonl new file mode 100644 index 000000000..9e7724c86 --- /dev/null +++ b/datasets/1.0/huggingface-open-hermes/output/default.jsonl @@ -0,0 +1,3 @@ +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": "None", "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "multiple_choice", "default/views": "None", "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": "None", "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} diff --git a/datasets/1.0/pass-mini/output/images.jsonl b/datasets/1.0/pass-mini/output/images.jsonl index 588e01917..377f2cd99 100644 --- a/datasets/1.0/pass-mini/output/images.jsonl +++ b/datasets/1.0/pass-mini/output/images.jsonl @@ -1,8 +1,8 @@ -{"images/hash": "75f7305b1fd94044e14bdcdde469dbb2", "images/image_content": ">", "images/creator_uname": "PaperBird+Photography%3C3", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2007-05-06 06:11:48"} +{"images/hash": "75f7305b1fd94044e14bdcdde469dbb2", "images/image_content": ">", "images/creator_uname": "PaperBird+Photography%3C3", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2007-05-06 06:11:48"} {"images/hash": "dd571a41a015354d92a859f7ef31201", "images/image_content": ">", "images/creator_uname": "Chiara+Marra", "images/gps_coordinates": {"images/latitude": 38.23818, "images/longitude": 13.183593}, "images/date_taken": "2007-05-04 15:46:43"} -{"images/hash": "598ad3bc7e6e876e61af116693c7ad9", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2006-05-01 07:34:13"} -{"images/hash": "e48d6d552465c5728585b82a53d6e02c", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2006-04-23 19:20:40"} -{"images/hash": "ffd3eb12a16cb83138f26e6f36dec967", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-17 00:44:29"} +{"images/hash": "598ad3bc7e6e876e61af116693c7ad9", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2006-05-01 07:34:13"} +{"images/hash": "e48d6d552465c5728585b82a53d6e02c", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2006-04-23 19:20:40"} +{"images/hash": "ffd3eb12a16cb83138f26e6f36dec967", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2004-05-17 00:44:29"} {"images/hash": "fff0eece99cc71c2e91fe716051599", "images/image_content": ">", "images/creator_uname": "striatic", "images/gps_coordinates": {"images/latitude": 53.535233, "images/longitude": -113.565075}, "images/date_taken": "2004-05-11 02:00:33"} -{"images/hash": "fedefe9f11bf2a749a749bfca8bf28", "images/image_content": ">", "images/creator_uname": "striatic", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-27 10:34:28"} -{"images/hash": "ff379727f52bcec4dfb237ace41627", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-29 02:14:36"} +{"images/hash": "fedefe9f11bf2a749a749bfca8bf28", "images/image_content": ">", "images/creator_uname": "striatic", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2004-05-27 10:34:28"} +{"images/hash": "ff379727f52bcec4dfb237ace41627", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2004-05-29 02:14:36"} diff --git a/python/mlcroissant/mlcroissant/_src/datasets_test.py b/python/mlcroissant/mlcroissant/_src/datasets_test.py index 79fc1beef..89f52e41b 100644 --- a/python/mlcroissant/mlcroissant/_src/datasets_test.py +++ b/python/mlcroissant/mlcroissant/_src/datasets_test.py @@ -247,6 +247,7 @@ def test_nonhermetic_loading(version, dataset_name, record_set_name, num_records ["huggingface-anthropic-hh-rlhf/metadata.json", "red-team-attempts", 10, None], ["huggingface-c4/metadata.json", "data", 1, {"data/variant": "en"}], ["huggingface-levanti/metadata.json", "levanti_train", 10, None], + ["huggingface-open-hermes/metadata.json", "default", 3, None], ], ) def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, filters): diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index d84a6f65b..5abea2c27 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -38,6 +38,8 @@ def _is_repeated_field(field: Field | None) -> bool | None: def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: """Applies one transform to `value`.""" + if _is_na(value): + return value if transform.regex is not None: source_regex = re.compile(transform.regex) if isinstance(value, pathlib.PurePath): @@ -76,10 +78,13 @@ def apply_transforms_fn(value: Any, field: Field, repeated: bool = False) -> Any return value +def _is_na(value: Any) -> bool: + return not isinstance(value, (list, np.ndarray)) and pd.isna(value) + + def _cast_value(ctx: Context, value: Any, data_type: type | term.URIRef | None): """Casts the value `value` to the desired target data type `data_type`.""" - is_na = not isinstance(value, (list, np.ndarray)) and pd.isna(value) - if is_na: + if _is_na(value): return value elif data_type == DataType.IMAGE_OBJECT: if isinstance(value, deps.PIL_Image.Image): @@ -145,6 +150,31 @@ def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame: return df +def _populate_repeated_nested_subfield( + value: Any, field: Field, result: dict[str, Any] +) -> dict[str, Any]: + """Populates result with a field's nested subfields.""" + if not field.parent: + raise ValueError( + "Nested subfields can only be populated when the parent field exists!" + ) + parent_id = field.parent.id + existing_values = result.get(parent_id, None) + if existing_values: + if not _is_na(value) and len(value) != len(existing_values): + raise ValueError( + f"Lenghts of {field.id} doesn't match already stored items for " + f" {parent_id}" + ) + for i in range(len(existing_values)): + existing_values[i][field.id] = None if _is_na(value) else value[i] + else: + result[parent_id] = ( + [{field.id: None}] if _is_na(value) else [{field.id: v} for v in value] + ) + return result + + @dataclasses.dataclass(frozen=True, repr=False) class ReadFields(Operation): """Reads fields in a RecordSet from a Pandas DataFrame and applies transformations. @@ -187,17 +217,21 @@ def _get_result(row): f'Column "{column}" does not exist. Inspect the ancestors of the' f" field {field} to understand why. Possible fields: {df.columns}" ) - value = row[column] is_repeated = field.repeated or ( field.parent and _is_repeated_field(field.parent) ) - value = apply_transforms_fn(value, field=field, repeated=is_repeated) - if is_repeated: + value = apply_transforms_fn( + row[column], field=field, repeated=is_repeated + ) + if _is_na(value): + value = None + elif is_repeated: value = [ _cast_value(self.node.ctx, v, field.data_type) for v in value ] else: value = _cast_value(self.node.ctx, value, field.data_type) + if self.node.ctx.is_v0(): result[field.name] = value else: @@ -207,19 +241,9 @@ def _get_result(row): # Repeated nested sub-fields render as a list of dictionaries. if field.parent: if _is_repeated_field(field.parent): - if field.parent.id not in result: - result[field.parent.id] = [ - {field.id: v} for v in value - ] - else: - if len(value) != len(result[field.parent.id]): - raise ValueError( - f"Lenghts of {field.id} doesn't match" - " already stored items for" - f" {field.parent.id}" - ) - for i, v in enumerate(value): - result[field.parent.id][i][field.id] = v + result = _populate_repeated_nested_subfield( + value=value, field=field, result=result + ) # Non-repeated subfields render as a single dictionary. else: if field.parent.id not in result: diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py index 9d76430c9..eff8d36c7 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py @@ -83,6 +83,7 @@ def test_readfield_with_subfields(): f.write("latitude,longitude,names,surnames\n") f.write("1,1,Anna-Maria,Rossi-Bianchi\n") f.write("2,2,Giulia,Ferrari\n") + f.write("1,3,,\n") # Nodes to define metadata. distribution = [ FileObject( @@ -195,6 +196,18 @@ def test_readfield_with_subfields(): }, ], }, + { + "main/coordinates": { + "main/coordinates/latitude": 1, + "main/coordinates/longitude": 3, + }, + "main/checked_users": [ + { + "main/checked_users/name": None, + "main/checked_users/surname": None, + }, + ], + }, ] result = list(read_field.call(df)) assert result == expected