Skip to content

Commit

Permalink
Add dataset with none/nan repeated values
Browse files Browse the repository at this point in the history
  • Loading branch information
ccl-core committed Oct 22, 2024
1 parent 60233a4 commit ce6f8ab
Show file tree
Hide file tree
Showing 5 changed files with 399 additions and 20 deletions.
323 changes: 323 additions & 0 deletions datasets/1.0/huggingface-open-hermes/metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,323 @@
{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"dataBiases": "cr:dataBiases",
"dataCollection": "cr:dataCollection",
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
},
"dct": "http://purl.org/dc/terms/",
"extract": "cr:extract",
"field": "cr:field",
"fileProperty": "cr:fileProperty",
"fileObject": "cr:fileObject",
"fileSet": "cr:fileSet",
"format": "cr:format",
"includes": "cr:includes",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"personalSensitiveInformation": "cr:personalSensitiveInformation",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
"repeated": "cr:repeated",
"replace": "cr:replace",
"sc": "https://schema.org/",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform"
},
"@type": "sc:Dataset",
"distribution": [
{
"@type": "cr:FileObject",
"@id": "repo",
"name": "repo",
"description": "The Hugging Face git repository.",
"contentUrl": "https://huggingface.co/datasets/teknium/OpenHermes-2.5/tree/refs%2Fconvert%2Fparquet",
"encodingFormat": "git+https",
"sha256": "https://github.com/mlcommons/croissant/issues/80"
},
{
"@type": "cr:FileSet",
"@id": "parquet-files-for-config-default",
"name": "parquet-files-for-config-default",
"description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).",
"containedIn": {
"@id": "repo"
},
"encodingFormat": "application/x-parquet",
"includes": "default/*/*.parquet"
}
],
"recordSet": [
{
"@type": "cr:RecordSet",
"@id": "default",
"name": "default",
"description": "teknium/OpenHermes-2.5 - 'default' subset\n\nAdditional information:\n- 1 skipped column: conversations",
"field": [
{
"@type": "cr:Field",
"@id": "default/custom_instruction",
"name": "default/custom_instruction",
"description": "Column 'custom_instruction' from the Hugging Face parquet file.",
"dataType": "sc:Boolean",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "custom_instruction"
}
}
},
{
"@type": "cr:Field",
"@id": "default/topic",
"name": "default/topic",
"description": "Column 'topic' from the Hugging Face parquet file.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "topic"
}
}
},
{
"@type": "cr:Field",
"@id": "default/model_name",
"name": "default/model_name",
"description": "Column 'model_name' from the Hugging Face parquet file.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "model_name"
}
}
},
{
"@type": "cr:Field",
"@id": "default/model",
"name": "default/model",
"description": "Column 'model' from the Hugging Face parquet file.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "model"
}
}
},
{
"@type": "cr:Field",
"@id": "default/skip_prompt_formatting",
"name": "default/skip_prompt_formatting",
"description": "Column 'skip_prompt_formatting' from the Hugging Face parquet file.",
"dataType": "sc:Boolean",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "skip_prompt_formatting"
}
}
},
{
"@type": "cr:Field",
"@id": "default/category",
"name": "default/category",
"description": "Column 'category' from the Hugging Face parquet file.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "category"
}
}
},
{
"@type": "cr:Field",
"@id": "default/views",
"name": "default/views",
"description": "Column 'views' from the Hugging Face parquet file.",
"dataType": "sc:Integer",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "views"
}
}
},
{
"@type": "cr:Field",
"@id": "default/language",
"name": "default/language",
"description": "Column 'language' from the Hugging Face parquet file.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "language"
}
}
},
{
"@type": "cr:Field",
"@id": "default/id",
"name": "default/id",
"description": "Column 'id' from the Hugging Face parquet file.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "id"
}
}
},
{
"@type": "cr:Field",
"@id": "default/title",
"name": "default/title",
"description": "Column 'title' from the Hugging Face parquet file.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "title"
}
}
},
{
"@type": "cr:Field",
"@id": "default/idx",
"name": "default/idx",
"description": "Column 'idx' from the Hugging Face parquet file.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "idx"
}
}
},
{
"@type": "cr:Field",
"@id": "default/hash",
"name": "default/hash",
"description": "Column 'hash' from the Hugging Face parquet file.",
"dataType": "sc:Integer",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "hash"
}
},
"repeated": true
},
{
"@type": "cr:Field",
"@id": "default/avatarUrl",
"name": "default/avatarUrl",
"description": "Column 'avatarUrl' from the Hugging Face parquet file.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "avatarUrl"
}
}
},
{
"@type": "cr:Field",
"@id": "default/system_prompt",
"name": "default/system_prompt",
"description": "Column 'system_prompt' from the Hugging Face parquet file.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "system_prompt"
}
}
},
{
"@type": "cr:Field",
"@id": "default/source",
"name": "default/source",
"description": "Column 'source' from the Hugging Face parquet file.",
"dataType": "sc:Text",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-default"
},
"extract": {
"column": "source"
}
}
}
]
}
],
"conformsTo": "http://mlcommons.org/croissant/1.0",
"name": "OpenHermes-2.5",
"description": "\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Card for Dataset Name\n\t\n\nThis is the dataset that made OpenHermes 2.5 and Nous Hermes 2 series of models.\nSupport me on GitHub sponsors \u003C3 : https://github.com/sponsors/teknium1\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Details\n\t\n\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Description\n\t\n\nThe Open Hermes 2/2.5 and Nous Hermes 2 models have made significant advancements of SOTA LLM's over recent months, and are underpinned by this exact compilation and curation of many open source datasets and custom created synthetic… See the full description on the dataset page: https://huggingface.co/datasets/teknium/OpenHermes-2.5.",
"keywords": [
"English",
"1M - 10M",
"json",
"Text",
"Datasets",
"pandas",
"Croissant",
"Polars",
"🇺🇸 Region: US",
"Synthetic",
"GPT-4",
"Distillation",
"Compilation"
],
"url": "https://huggingface.co/datasets/teknium/OpenHermes-2.5"
}
3 changes: 3 additions & 0 deletions datasets/1.0/huggingface-open-hermes/output/default.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"}
{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "multiple_choice", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"}
{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"}
1 change: 1 addition & 0 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ def test_nonhermetic_loading(version, dataset_name, record_set_name, num_records
["huggingface-anthropic-hh-rlhf/metadata.json", "red-team-attempts", 10, None],
["huggingface-c4/metadata.json", "data", 1, {"data/variant": "en"}],
["huggingface-levanti/metadata.json", "levanti_train", 10, None],
["huggingface-open-hermes/metadata.json", "default", 3, None],
],
)
def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, filters):
Expand Down
Loading

0 comments on commit ce6f8ab

Please sign in to comment.