Add dataset with none/nan repeated values

mlcommons · Oct 22, 2024 · ce6f8ab
1 parent 60233a4
commit ce6f8ab
Show file tree

Hide file tree

Showing 5 changed files with 399 additions and 20 deletions.
diff --git a/datasets/1.0/huggingface-open-hermes/metadata.json b/datasets/1.0/huggingface-open-hermes/metadata.json
@@ -0,0 +1,323 @@
+{
+  "@context": {
+    "@language": "en",
+    "@vocab": "https://schema.org/",
+    "citeAs": "cr:citeAs",
+    "column": "cr:column",
+    "conformsTo": "dct:conformsTo",
+    "cr": "http://mlcommons.org/croissant/",
+    "data": {
+      "@id": "cr:data",
+      "@type": "@json"
+    },
+    "dataBiases": "cr:dataBiases",
+    "dataCollection": "cr:dataCollection",
+    "dataType": {
+      "@id": "cr:dataType",
+      "@type": "@vocab"
+    },
+    "dct": "http://purl.org/dc/terms/",
+    "extract": "cr:extract",
+    "field": "cr:field",
+    "fileProperty": "cr:fileProperty",
+    "fileObject": "cr:fileObject",
+    "fileSet": "cr:fileSet",
+    "format": "cr:format",
+    "includes": "cr:includes",
+    "isLiveDataset": "cr:isLiveDataset",
+    "jsonPath": "cr:jsonPath",
+    "key": "cr:key",
+    "md5": "cr:md5",
+    "parentField": "cr:parentField",
+    "path": "cr:path",
+    "personalSensitiveInformation": "cr:personalSensitiveInformation",
+    "recordSet": "cr:recordSet",
+    "references": "cr:references",
+    "regex": "cr:regex",
+    "repeated": "cr:repeated",
+    "replace": "cr:replace",
+    "sc": "https://schema.org/",
+    "separator": "cr:separator",
+    "source": "cr:source",
+    "subField": "cr:subField",
+    "transform": "cr:transform"
+  },
+  "@type": "sc:Dataset",
+  "distribution": [
+    {
+      "@type": "cr:FileObject",
+      "@id": "repo",
+      "name": "repo",
+      "description": "The Hugging Face git repository.",
+      "contentUrl": "https://huggingface.co/datasets/teknium/OpenHermes-2.5/tree/refs%2Fconvert%2Fparquet",
+      "encodingFormat": "git+https",
+      "sha256": "https://github.com/mlcommons/croissant/issues/80"
+    },
+    {
+      "@type": "cr:FileSet",
+      "@id": "parquet-files-for-config-default",
+      "name": "parquet-files-for-config-default",
+      "description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).",
+      "containedIn": {
+        "@id": "repo"
+      },
+      "encodingFormat": "application/x-parquet",
+      "includes": "default/*/*.parquet"
+    }
+  ],
+  "recordSet": [
+    {
+      "@type": "cr:RecordSet",
+      "@id": "default",
+      "name": "default",
+      "description": "teknium/OpenHermes-2.5 - 'default' subset\n\nAdditional information:\n- 1 skipped column: conversations",
+      "field": [
+        {
+          "@type": "cr:Field",
+          "@id": "default/custom_instruction",
+          "name": "default/custom_instruction",
+          "description": "Column 'custom_instruction' from the Hugging Face parquet file.",
+          "dataType": "sc:Boolean",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "custom_instruction"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/topic",
+          "name": "default/topic",
+          "description": "Column 'topic' from the Hugging Face parquet file.",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "topic"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/model_name",
+          "name": "default/model_name",
+          "description": "Column 'model_name' from the Hugging Face parquet file.",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "model_name"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/model",
+          "name": "default/model",
+          "description": "Column 'model' from the Hugging Face parquet file.",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "model"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/skip_prompt_formatting",
+          "name": "default/skip_prompt_formatting",
+          "description": "Column 'skip_prompt_formatting' from the Hugging Face parquet file.",
+          "dataType": "sc:Boolean",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "skip_prompt_formatting"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/category",
+          "name": "default/category",
+          "description": "Column 'category' from the Hugging Face parquet file.",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "category"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/views",
+          "name": "default/views",
+          "description": "Column 'views' from the Hugging Face parquet file.",
+          "dataType": "sc:Integer",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "views"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/language",
+          "name": "default/language",
+          "description": "Column 'language' from the Hugging Face parquet file.",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "language"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/id",
+          "name": "default/id",
+          "description": "Column 'id' from the Hugging Face parquet file.",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "id"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/title",
+          "name": "default/title",
+          "description": "Column 'title' from the Hugging Face parquet file.",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "title"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/idx",
+          "name": "default/idx",
+          "description": "Column 'idx' from the Hugging Face parquet file.",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "idx"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/hash",
+          "name": "default/hash",
+          "description": "Column 'hash' from the Hugging Face parquet file.",
+          "dataType": "sc:Integer",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "hash"
+            }
+          },
+          "repeated": true
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/avatarUrl",
+          "name": "default/avatarUrl",
+          "description": "Column 'avatarUrl' from the Hugging Face parquet file.",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "avatarUrl"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/system_prompt",
+          "name": "default/system_prompt",
+          "description": "Column 'system_prompt' from the Hugging Face parquet file.",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "system_prompt"
+            }
+          }
+        },
+        {
+          "@type": "cr:Field",
+          "@id": "default/source",
+          "name": "default/source",
+          "description": "Column 'source' from the Hugging Face parquet file.",
+          "dataType": "sc:Text",
+          "source": {
+            "fileSet": {
+              "@id": "parquet-files-for-config-default"
+            },
+            "extract": {
+              "column": "source"
+            }
+          }
+        }
+      ]
+    }
+  ],
+  "conformsTo": "http://mlcommons.org/croissant/1.0",
+  "name": "OpenHermes-2.5",
+  "description": "Dataset Card for Dataset Name\n\nThis is the dataset that made OpenHermes 2.5 and Nous Hermes 2 series of models.\nSupport me on GitHub sponsors \u003C3 : https://github.com/sponsors/teknium1\n\nDataset Details\n\nDataset Description\n\nThe Open Hermes 2/2.5 and Nous Hermes 2 models have made significant advancements of SOTA LLM's over recent months, and are underpinned by this exact compilation and curation of many open source datasets and custom created synthetic… See the full description on the dataset page: https://huggingface.co/datasets/teknium/OpenHermes-2.5.",
+  "keywords": [
+    "English",
+    "1M - 10M",
+    "json",
+    "Text",
+    "Datasets",
+    "pandas",
+    "Croissant",
+    "Polars",
+    "🇺🇸 Region: US",
+    "Synthetic",
+    "GPT-4",
+    "Distillation",
+    "Compilation"
+  ],
+  "url": "https://huggingface.co/datasets/teknium/OpenHermes-2.5"
+}
diff --git a/datasets/1.0/huggingface-open-hermes/output/default.jsonl b/datasets/1.0/huggingface-open-hermes/output/default.jsonl
@@ -0,0 +1,3 @@
+{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"}
+{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "multiple_choice", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"}
+{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"}
diff --git a/python/mlcroissant/mlcroissant/_src/datasets_test.py b/python/mlcroissant/mlcroissant/_src/datasets_test.py
@@ -247,6 +247,7 @@ def test_nonhermetic_loading(version, dataset_name, record_set_name, num_records
         ["huggingface-anthropic-hh-rlhf/metadata.json", "red-team-attempts", 10, None],
         ["huggingface-c4/metadata.json", "data", 1, {"data/variant": "en"}],
         ["huggingface-levanti/metadata.json", "levanti_train", 10, None],
+        ["huggingface-open-hermes/metadata.json", "default", 3, None],
     ],
 )
 def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, filters):