diff --git a/datasets/0.8/pass-mini/output/images.jsonl b/datasets/0.8/pass-mini/output/images.jsonl index 3c76c74a8..dcf704f20 100644 --- a/datasets/0.8/pass-mini/output/images.jsonl +++ b/datasets/0.8/pass-mini/output/images.jsonl @@ -1,8 +1,8 @@ -{"creator_uname": "PaperBird+Photography%3C3", "latitude": null, "longitude": null, "date_taken": "2007-05-06 06:11:48", "hash": "75f7305b1fd94044e14bdcdde469dbb2", "image_content": ">"} +{"creator_uname": "PaperBird+Photography%3C3", "latitude": "None", "longitude": "None", "date_taken": "2007-05-06 06:11:48", "hash": "75f7305b1fd94044e14bdcdde469dbb2", "image_content": ">"} {"creator_uname": "Chiara+Marra", "latitude": 38.23818, "longitude": 13.183593, "date_taken": "2007-05-04 15:46:43", "hash": "dd571a41a015354d92a859f7ef31201", "image_content": ">"} -{"creator_uname": "maplesbranch", "latitude": null, "longitude": null, "date_taken": "2006-05-01 07:34:13", "hash": "598ad3bc7e6e876e61af116693c7ad9", "image_content": ">"} -{"creator_uname": "maplesbranch", "latitude": null, "longitude": null, "date_taken": "2006-04-23 19:20:40", "hash": "e48d6d552465c5728585b82a53d6e02c", "image_content": ">"} -{"creator_uname": "quinnums", "latitude": null, "longitude": null, "date_taken": "2004-05-17 00:44:29", "hash": "ffd3eb12a16cb83138f26e6f36dec967", "image_content": ">"} +{"creator_uname": "maplesbranch", "latitude": "None", "longitude": "None", "date_taken": "2006-05-01 07:34:13", "hash": "598ad3bc7e6e876e61af116693c7ad9", "image_content": ">"} +{"creator_uname": "maplesbranch", "latitude": "None", "longitude": "None", "date_taken": "2006-04-23 19:20:40", "hash": "e48d6d552465c5728585b82a53d6e02c", "image_content": ">"} +{"creator_uname": "quinnums", "latitude": "None", "longitude": "None", "date_taken": "2004-05-17 00:44:29", "hash": "ffd3eb12a16cb83138f26e6f36dec967", "image_content": ">"} {"creator_uname": "striatic", "latitude": 53.535233, "longitude": -113.565075, "date_taken": "2004-05-11 02:00:33", "hash": "fff0eece99cc71c2e91fe716051599", "image_content": ">"} -{"creator_uname": "striatic", "latitude": null, "longitude": null, "date_taken": "2004-05-27 10:34:28", "hash": "fedefe9f11bf2a749a749bfca8bf28", "image_content": ">"} -{"creator_uname": "quinnums", "latitude": null, "longitude": null, "date_taken": "2004-05-29 02:14:36", "hash": "ff379727f52bcec4dfb237ace41627", "image_content": ">"} +{"creator_uname": "striatic", "latitude": "None", "longitude": "None", "date_taken": "2004-05-27 10:34:28", "hash": "fedefe9f11bf2a749a749bfca8bf28", "image_content": ">"} +{"creator_uname": "quinnums", "latitude": "None", "longitude": "None", "date_taken": "2004-05-29 02:14:36", "hash": "ff379727f52bcec4dfb237ace41627", "image_content": ">"} diff --git a/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl b/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl index e6e859717..6dcef5645 100644 --- a/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl +++ b/datasets/1.0/huggingface-levanti/output/levanti_train.jsonl @@ -1,10 +1,10 @@ -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0627\u0646\u0627 \u062d\u0627\u0628\u0628 \u0627\u062d\u0643\u064a \u0645\u0639\u0643.", "levanti/hebrew": "\u05d0\u05e0\u05d9 \u05e8\u05d5\u05e6\u05d4 \u05dc\u05d3\u05d1\u05e8 \u05d0\u05d9\u05ea\u05da.", "levanti/english": "I want to talk to you.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0645\u0631\u062d\u0628\u0627 \u0623\u0631\u064a\u062c. -\u0645\u0631\u062d\u0628\u0627 \u062c\u0644\u0627\u0644.", "levanti/hebrew": "\u05e9\u05dc\u05d5\u05dd, \u05d0\u05e8\u05d9\u05d2'. -\u05e9\u05dc\u05d5\u05dd, \u05d2'\u05dc\u05d0\u05dc.", "levanti/english": "Hello, Areej. -Hello, Jalal.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0648\u0625\u0630\u0627 \u0645\u0634 \u0646\u0633\u0627\u064a\u0628\u0646\u0627 \u0648\u0644\u0627\u062f \u0639\u0645\u0651\u0646\u0627!", "levanti/hebrew": "\u05d5\u05d0\u05dd \u05dc\u05d0 \u05e7\u05e8\u05d5\u05d1\u05d9\u05dd \u05e9\u05dc\u05e0\u05d5 \u05d4\u05dd \u05d1\u05e0\u05d9 \u05d3\u05d5\u05d3\u05d9\u05e0\u05d5!", "levanti/english": "And if they are not our relatives, they are our cousins!", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Jordanian", "levanti/arabic": "\u0647\u0645 \u0645\u0634 \u0639\u0627\u062c\u0628\u0647\u0645 \u0625\u0646\u0648 \u0645\u062f\u064a\u0631\u062a\u0647\u0645 \u0633\u062a", "levanti/hebrew": "\u05d4\u05dd \u05dc\u05d0 \u05de\u05e8\u05d5\u05e6\u05d9\u05dd \u05de\u05db\u05da \u05e9\u05d4\u05de\u05e0\u05d4\u05dc\u05ea \u05e9\u05dc\u05d4\u05dd \u05d0\u05d9\u05e9\u05d4", "levanti/english": "They don't like that their manager is a woman.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Syrian", "levanti/arabic": "\u200f\u0627\u0644\u0634\u0628\u0643\u0627\u062a \u0643\u0627\u0646\u062a \u0645\u0644\u0627\u0646\u0629 \u0633\u0645\u0643 \u200e", "levanti/hebrew": "\u05d4\u05e8\u05e9\u05ea\u05d5\u05ea \u05d4\u05d9\u05d5 \u05de\u05dc\u05d0\u05d5\u05ea \u05d3\u05d2\u05d9\u05dd", "levanti/english": "The nets were full of fish.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Palestinian", "levanti/arabic": "\u200f\u0641\u064a\u0647 \u0645\u0631\u0627\u0643\u0632 \u0644\u0644\u062a\u062f\u0631\u064a\u0628 \u0627\u0644\u0645\u0647\u0646\u064a \u200e", "levanti/hebrew": "\u05d9\u05e9\u05e0\u05dd \u05de\u05e8\u05db\u05d6\u05d9\u05dd \u05dc\u05d4\u05db\u05e9\u05e8\u05d4 \u05de\u05e7\u05e6\u05d5\u05e2\u05d9\u05ea", "levanti/english": "There are centers for vocational training.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0627\u0642\u0639\u062f\u064a \u0648\u0627\u0633\u0643\u062a\u064a! - \u0633\u062f\u064a \u0628\u0648\u0632\u0643.", "levanti/hebrew": "\u05ea\u05e1\u05ea\u05de\u05d9 \u05d0\u05ea \u05d4\u05e4\u05d4.", "levanti/english": "Sit down and shut up! - Shut your mouth.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0645\u0646\u0630 \u0632\u0645\u0646 \u0637\u0648\u064a\u0644 \u064a\u0628\u062d\u062b \u0627\u0644\u064a\u0647\u0648\u062f \u0627\u0644\u0645\u062a\u062f\u064a\u0651\u0646\u0648\u0646 \u0639\u0646 \u0627\u0644\u0628\u0642\u0631\u0629 \u0627\u0644\u062d\u0645\u0631\u0627\u0621", "levanti/hebrew": "\u05d6\u05d4 \u05d6\u05de\u05df \u05e8\u05d1 \u05de\u05d7\u05e4\u05e9\u05d9\u05dd \u05d4\u05d9\u05d4\u05d5\u05d3\u05d9\u05dd \u05d4\u05d3\u05ea\u05d9\u05d9\u05dd \u05d0\u05ea \u05d4\u05e4\u05e8\u05d4 \u05d4\u05d0\u05d3\u05d5\u05de\u05d4.", "levanti/english": "For a long time, religious Jews have been searching for the red heifer.", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Levantine", "levanti/arabic": "\u0623\u0648\u0651\u0644 \u0645\u0648\u0636\u0648\u0639 \u0628\u0627\u062e\u062a\u0635\u0627\u0631 \u0643\u0628\u064a\u0631\u060c", "levanti/hebrew": "\u05d4\u05e0\u05d5\u05e9\u05d0 \u05d4\u05e8\u05d0\u05e9\u05d5\u05df, \u05d1\u05e7\u05d9\u05e6\u05d5\u05e8 \u05e0\u05de\u05e8\u05e5,", "levanti/english": "The first topic, in brief,", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} -{"levanti/dialect": "Jordanian", "levanti/arabic": "\u064a\u0627 \u0648\u0644\u0627\u062f \u064a\u0644\u0627 \u062a\u0623\u062e\u0631\u0646\u0627\u060c \u0634\u0648 \u0628\u062a\u0633\u0627\u0648\u0627\u061f", "levanti/hebrew": "\u05d9\u05dc\u05d3\u05d9\u05dd! \u05d1\u05d5\u05d0\u05d5, \u05d0\u05e0\u05d7\u05e0\u05d5 \u05de\u05d0\u05d7\u05e8\u05d9\u05dd! \u05de\u05d4 \u05d0\u05ea\u05dd \u05e2\u05d5\u05e9\u05d9\u05dd?", "levanti/english": "Kids, let's go, we're late! What are you doing?", "levanti/synthesized": false, "levanti/diacritized": null, "levanti/hebrew_taatik_EXP": null, "levanti/english_translit_EXP": null} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0627\u0646\u0627 \u062d\u0627\u0628\u0628 \u0627\u062d\u0643\u064a \u0645\u0639\u0643.", "levanti/hebrew": "\u05d0\u05e0\u05d9 \u05e8\u05d5\u05e6\u05d4 \u05dc\u05d3\u05d1\u05e8 \u05d0\u05d9\u05ea\u05da.", "levanti/english": "I want to talk to you.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0645\u0631\u062d\u0628\u0627 \u0623\u0631\u064a\u062c. -\u0645\u0631\u062d\u0628\u0627 \u062c\u0644\u0627\u0644.", "levanti/hebrew": "\u05e9\u05dc\u05d5\u05dd, \u05d0\u05e8\u05d9\u05d2'. -\u05e9\u05dc\u05d5\u05dd, \u05d2'\u05dc\u05d0\u05dc.", "levanti/english": "Hello, Areej. -Hello, Jalal.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0648\u0625\u0630\u0627 \u0645\u0634 \u0646\u0633\u0627\u064a\u0628\u0646\u0627 \u0648\u0644\u0627\u062f \u0639\u0645\u0651\u0646\u0627!", "levanti/hebrew": "\u05d5\u05d0\u05dd \u05dc\u05d0 \u05e7\u05e8\u05d5\u05d1\u05d9\u05dd \u05e9\u05dc\u05e0\u05d5 \u05d4\u05dd \u05d1\u05e0\u05d9 \u05d3\u05d5\u05d3\u05d9\u05e0\u05d5!", "levanti/english": "And if they are not our relatives, they are our cousins!", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Jordanian", "levanti/arabic": "\u0647\u0645 \u0645\u0634 \u0639\u0627\u062c\u0628\u0647\u0645 \u0625\u0646\u0648 \u0645\u062f\u064a\u0631\u062a\u0647\u0645 \u0633\u062a", "levanti/hebrew": "\u05d4\u05dd \u05dc\u05d0 \u05de\u05e8\u05d5\u05e6\u05d9\u05dd \u05de\u05db\u05da \u05e9\u05d4\u05de\u05e0\u05d4\u05dc\u05ea \u05e9\u05dc\u05d4\u05dd \u05d0\u05d9\u05e9\u05d4", "levanti/english": "They don't like that their manager is a woman.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Syrian", "levanti/arabic": "\u200f\u0627\u0644\u0634\u0628\u0643\u0627\u062a \u0643\u0627\u0646\u062a \u0645\u0644\u0627\u0646\u0629 \u0633\u0645\u0643 \u200e", "levanti/hebrew": "\u05d4\u05e8\u05e9\u05ea\u05d5\u05ea \u05d4\u05d9\u05d5 \u05de\u05dc\u05d0\u05d5\u05ea \u05d3\u05d2\u05d9\u05dd", "levanti/english": "The nets were full of fish.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Palestinian", "levanti/arabic": "\u200f\u0641\u064a\u0647 \u0645\u0631\u0627\u0643\u0632 \u0644\u0644\u062a\u062f\u0631\u064a\u0628 \u0627\u0644\u0645\u0647\u0646\u064a \u200e", "levanti/hebrew": "\u05d9\u05e9\u05e0\u05dd \u05de\u05e8\u05db\u05d6\u05d9\u05dd \u05dc\u05d4\u05db\u05e9\u05e8\u05d4 \u05de\u05e7\u05e6\u05d5\u05e2\u05d9\u05ea", "levanti/english": "There are centers for vocational training.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Palestinian", "levanti/arabic": "\u0627\u0642\u0639\u062f\u064a \u0648\u0627\u0633\u0643\u062a\u064a! - \u0633\u062f\u064a \u0628\u0648\u0632\u0643.", "levanti/hebrew": "\u05ea\u05e1\u05ea\u05de\u05d9 \u05d0\u05ea \u05d4\u05e4\u05d4.", "levanti/english": "Sit down and shut up! - Shut your mouth.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0645\u0646\u0630 \u0632\u0645\u0646 \u0637\u0648\u064a\u0644 \u064a\u0628\u062d\u062b \u0627\u0644\u064a\u0647\u0648\u062f \u0627\u0644\u0645\u062a\u062f\u064a\u0651\u0646\u0648\u0646 \u0639\u0646 \u0627\u0644\u0628\u0642\u0631\u0629 \u0627\u0644\u062d\u0645\u0631\u0627\u0621", "levanti/hebrew": "\u05d6\u05d4 \u05d6\u05de\u05df \u05e8\u05d1 \u05de\u05d7\u05e4\u05e9\u05d9\u05dd \u05d4\u05d9\u05d4\u05d5\u05d3\u05d9\u05dd \u05d4\u05d3\u05ea\u05d9\u05d9\u05dd \u05d0\u05ea \u05d4\u05e4\u05e8\u05d4 \u05d4\u05d0\u05d3\u05d5\u05de\u05d4.", "levanti/english": "For a long time, religious Jews have been searching for the red heifer.", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Levantine", "levanti/arabic": "\u0623\u0648\u0651\u0644 \u0645\u0648\u0636\u0648\u0639 \u0628\u0627\u062e\u062a\u0635\u0627\u0631 \u0643\u0628\u064a\u0631\u060c", "levanti/hebrew": "\u05d4\u05e0\u05d5\u05e9\u05d0 \u05d4\u05e8\u05d0\u05e9\u05d5\u05df, \u05d1\u05e7\u05d9\u05e6\u05d5\u05e8 \u05e0\u05de\u05e8\u05e5,", "levanti/english": "The first topic, in brief,", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} +{"levanti/dialect": "Jordanian", "levanti/arabic": "\u064a\u0627 \u0648\u0644\u0627\u062f \u064a\u0644\u0627 \u062a\u0623\u062e\u0631\u0646\u0627\u060c \u0634\u0648 \u0628\u062a\u0633\u0627\u0648\u0627\u061f", "levanti/hebrew": "\u05d9\u05dc\u05d3\u05d9\u05dd! \u05d1\u05d5\u05d0\u05d5, \u05d0\u05e0\u05d7\u05e0\u05d5 \u05de\u05d0\u05d7\u05e8\u05d9\u05dd! \u05de\u05d4 \u05d0\u05ea\u05dd \u05e2\u05d5\u05e9\u05d9\u05dd?", "levanti/english": "Kids, let's go, we're late! What are you doing?", "levanti/synthesized": false, "levanti/diacritized": "None", "levanti/hebrew_taatik_EXP": "None", "levanti/english_translit_EXP": "None"} diff --git a/datasets/1.0/huggingface-open-hermes/output/default.jsonl b/datasets/1.0/huggingface-open-hermes/output/default.jsonl index 8cc470060..9e7724c86 100644 --- a/datasets/1.0/huggingface-open-hermes/output/default.jsonl +++ b/datasets/1.0/huggingface-open-hermes/output/default.jsonl @@ -1,3 +1,3 @@ -{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} -{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "multiple_choice", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} -{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": "None", "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "multiple_choice", "default/views": "None", "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": "None", "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} diff --git a/datasets/1.0/pass-mini/output/images.jsonl b/datasets/1.0/pass-mini/output/images.jsonl index 588e01917..377f2cd99 100644 --- a/datasets/1.0/pass-mini/output/images.jsonl +++ b/datasets/1.0/pass-mini/output/images.jsonl @@ -1,8 +1,8 @@ -{"images/hash": "75f7305b1fd94044e14bdcdde469dbb2", "images/image_content": ">", "images/creator_uname": "PaperBird+Photography%3C3", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2007-05-06 06:11:48"} +{"images/hash": "75f7305b1fd94044e14bdcdde469dbb2", "images/image_content": ">", "images/creator_uname": "PaperBird+Photography%3C3", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2007-05-06 06:11:48"} {"images/hash": "dd571a41a015354d92a859f7ef31201", "images/image_content": ">", "images/creator_uname": "Chiara+Marra", "images/gps_coordinates": {"images/latitude": 38.23818, "images/longitude": 13.183593}, "images/date_taken": "2007-05-04 15:46:43"} -{"images/hash": "598ad3bc7e6e876e61af116693c7ad9", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2006-05-01 07:34:13"} -{"images/hash": "e48d6d552465c5728585b82a53d6e02c", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2006-04-23 19:20:40"} -{"images/hash": "ffd3eb12a16cb83138f26e6f36dec967", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-17 00:44:29"} +{"images/hash": "598ad3bc7e6e876e61af116693c7ad9", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2006-05-01 07:34:13"} +{"images/hash": "e48d6d552465c5728585b82a53d6e02c", "images/image_content": ">", "images/creator_uname": "maplesbranch", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2006-04-23 19:20:40"} +{"images/hash": "ffd3eb12a16cb83138f26e6f36dec967", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2004-05-17 00:44:29"} {"images/hash": "fff0eece99cc71c2e91fe716051599", "images/image_content": ">", "images/creator_uname": "striatic", "images/gps_coordinates": {"images/latitude": 53.535233, "images/longitude": -113.565075}, "images/date_taken": "2004-05-11 02:00:33"} -{"images/hash": "fedefe9f11bf2a749a749bfca8bf28", "images/image_content": ">", "images/creator_uname": "striatic", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-27 10:34:28"} -{"images/hash": "ff379727f52bcec4dfb237ace41627", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": null, "images/longitude": null}, "images/date_taken": "2004-05-29 02:14:36"} +{"images/hash": "fedefe9f11bf2a749a749bfca8bf28", "images/image_content": ">", "images/creator_uname": "striatic", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2004-05-27 10:34:28"} +{"images/hash": "ff379727f52bcec4dfb237ace41627", "images/image_content": ">", "images/creator_uname": "quinnums", "images/gps_coordinates": {"images/latitude": "None", "images/longitude": "None"}, "images/date_taken": "2004-05-29 02:14:36"} diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index ce7beb16f..5abea2c27 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -38,6 +38,8 @@ def _is_repeated_field(field: Field | None) -> bool | None: def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: """Applies one transform to `value`.""" + if _is_na(value): + return value if transform.regex is not None: source_regex = re.compile(transform.regex) if isinstance(value, pathlib.PurePath): @@ -57,7 +59,7 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: return pd.Timestamp(value).strftime(transform.format) else: raise ValueError(f"`format` only applies to dates. Got {field.data_type}") - elif transform.separator is not None and not _is_na(value): + elif transform.separator is not None: return value.split(transform.separator) return value @@ -157,18 +159,19 @@ def _populate_repeated_nested_subfield( "Nested subfields can only be populated when the parent field exists!" ) parent_id = field.parent.id - if parent_id not in result: - result[parent_id] = ( - [{field.id: v} for v in value] if not _is_na(value) else [{field.id: value}] - ) - else: - if not _is_na(value) and len(value) != len(result[parent_id]): + existing_values = result.get(parent_id, None) + if existing_values: + if not _is_na(value) and len(value) != len(existing_values): raise ValueError( f"Lenghts of {field.id} doesn't match already stored items for " f" {parent_id}" ) - for i in range(len(result[parent_id])): - result[parent_id][i][field.id] = value[i] if not _is_na(value) else value + for i in range(len(existing_values)): + existing_values[i][field.id] = None if _is_na(value) else value[i] + else: + result[parent_id] = ( + [{field.id: None}] if _is_na(value) else [{field.id: v} for v in value] + ) return result @@ -214,19 +217,21 @@ def _get_result(row): f'Column "{column}" does not exist. Inspect the ancestors of the' f" field {field} to understand why. Possible fields: {df.columns}" ) - value = row[column] is_repeated = field.repeated or ( field.parent and _is_repeated_field(field.parent) ) - value = apply_transforms_fn(value, field=field, repeated=is_repeated) - if is_repeated: - value = ( - [_cast_value(self.node.ctx, v, field.data_type) for v in value] - if not _is_na(value) - else value - ) + value = apply_transforms_fn( + row[column], field=field, repeated=is_repeated + ) + if _is_na(value): + value = None + elif is_repeated: + value = [ + _cast_value(self.node.ctx, v, field.data_type) for v in value + ] else: value = _cast_value(self.node.ctx, value, field.data_type) + if self.node.ctx.is_v0(): result[field.name] = value else: diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py index 33fe04cfa..eff8d36c7 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py @@ -203,34 +203,14 @@ def test_readfield_with_subfields(): }, "main/checked_users": [ { - "main/checked_users/name": float("nan"), - "main/checked_users/surname": float("nan"), + "main/checked_users/name": None, + "main/checked_users/surname": None, }, ], }, ] result = list(read_field.call(df)) - for i in range(len(result)): - assert result[i]["main/coordinates"] == expected[i]["main/coordinates"] - if not field._is_na( - result[i]["main/checked_users"][0]["main/checked_users/name"] - ): - assert ( - result[i]["main/checked_users"] - == expected[i]["main/checked_users"] - ) - else: - assert field._is_na( - expected[i]["main/checked_users"][0]["main/checked_users/name"] - ) - assert field._is_na( - expected[i]["main/checked_users"][0][ - "main/checked_users/surname" - ] - ) - assert field._is_na( - result[i]["main/checked_users"][0]["main/checked_users/surname"] - ) + assert result == expected @pytest.mark.parametrize(