diff --git a/datasets/1.0/huggingface-open-hermes/metadata.json b/datasets/1.0/huggingface-open-hermes/metadata.json new file mode 100644 index 000000000..2f4f4b0fc --- /dev/null +++ b/datasets/1.0/huggingface-open-hermes/metadata.json @@ -0,0 +1,323 @@ +{ + "@context": { + "@language": "en", + "@vocab": "https://schema.org/", + "citeAs": "cr:citeAs", + "column": "cr:column", + "conformsTo": "dct:conformsTo", + "cr": "http://mlcommons.org/croissant/", + "data": { + "@id": "cr:data", + "@type": "@json" + }, + "dataBiases": "cr:dataBiases", + "dataCollection": "cr:dataCollection", + "dataType": { + "@id": "cr:dataType", + "@type": "@vocab" + }, + "dct": "http://purl.org/dc/terms/", + "extract": "cr:extract", + "field": "cr:field", + "fileProperty": "cr:fileProperty", + "fileObject": "cr:fileObject", + "fileSet": "cr:fileSet", + "format": "cr:format", + "includes": "cr:includes", + "isLiveDataset": "cr:isLiveDataset", + "jsonPath": "cr:jsonPath", + "key": "cr:key", + "md5": "cr:md5", + "parentField": "cr:parentField", + "path": "cr:path", + "personalSensitiveInformation": "cr:personalSensitiveInformation", + "recordSet": "cr:recordSet", + "references": "cr:references", + "regex": "cr:regex", + "repeated": "cr:repeated", + "replace": "cr:replace", + "sc": "https://schema.org/", + "separator": "cr:separator", + "source": "cr:source", + "subField": "cr:subField", + "transform": "cr:transform" + }, + "@type": "sc:Dataset", + "distribution": [ + { + "@type": "cr:FileObject", + "@id": "repo", + "name": "repo", + "description": "The Hugging Face git repository.", + "contentUrl": "https://huggingface.co/datasets/teknium/OpenHermes-2.5/tree/refs%2Fconvert%2Fparquet", + "encodingFormat": "git+https", + "sha256": "https://github.com/mlcommons/croissant/issues/80" + }, + { + "@type": "cr:FileSet", + "@id": "parquet-files-for-config-default", + "name": "parquet-files-for-config-default", + "description": "The underlying Parquet files as converted by Hugging Face (see: 
https://huggingface.co/docs/dataset-viewer/parquet).", + "containedIn": { + "@id": "repo" + }, + "encodingFormat": "application/x-parquet", + "includes": "default/*/*.parquet" + } + ], + "recordSet": [ + { + "@type": "cr:RecordSet", + "@id": "default", + "name": "default", + "description": "teknium/OpenHermes-2.5 - 'default' subset\n\nAdditional information:\n- 1 skipped column: conversations", + "field": [ + { + "@type": "cr:Field", + "@id": "default/custom_instruction", + "name": "default/custom_instruction", + "description": "Column 'custom_instruction' from the Hugging Face parquet file.", + "dataType": "sc:Boolean", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "custom_instruction" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/topic", + "name": "default/topic", + "description": "Column 'topic' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "topic" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/model_name", + "name": "default/model_name", + "description": "Column 'model_name' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "model_name" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/model", + "name": "default/model", + "description": "Column 'model' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "model" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/skip_prompt_formatting", + "name": "default/skip_prompt_formatting", + "description": "Column 'skip_prompt_formatting' from the Hugging Face parquet file.", + "dataType": "sc:Boolean", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + 
}, + "extract": { + "column": "skip_prompt_formatting" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/category", + "name": "default/category", + "description": "Column 'category' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "category" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/views", + "name": "default/views", + "description": "Column 'views' from the Hugging Face parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "views" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/language", + "name": "default/language", + "description": "Column 'language' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "language" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/id", + "name": "default/id", + "description": "Column 'id' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "id" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/title", + "name": "default/title", + "description": "Column 'title' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "title" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/idx", + "name": "default/idx", + "description": "Column 'idx' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "idx" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/hash", + "name": "default/hash", + "description": "Column 'hash' 
from the Hugging Face parquet file.", + "dataType": "sc:Integer", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "hash" + } + }, + "repeated": true + }, + { + "@type": "cr:Field", + "@id": "default/avatarUrl", + "name": "default/avatarUrl", + "description": "Column 'avatarUrl' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "avatarUrl" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/system_prompt", + "name": "default/system_prompt", + "description": "Column 'system_prompt' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "system_prompt" + } + } + }, + { + "@type": "cr:Field", + "@id": "default/source", + "name": "default/source", + "description": "Column 'source' from the Hugging Face parquet file.", + "dataType": "sc:Text", + "source": { + "fileSet": { + "@id": "parquet-files-for-config-default" + }, + "extract": { + "column": "source" + } + } + } + ] + } + ], + "conformsTo": "http://mlcommons.org/croissant/1.0", + "name": "OpenHermes-2.5", + "description": "\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Card for Dataset Name\n\t\n\nThis is the dataset that made OpenHermes 2.5 and Nous Hermes 2 series of models.\nSupport me on GitHub sponsors \u003C3 : https://github.com/sponsors/teknium1\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Details\n\t\n\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Description\n\t\n\nThe Open Hermes 2/2.5 and Nous Hermes 2 models have made significant advancements of SOTA LLM's over recent months, and are underpinned by this exact compilation and curation of many open source datasets and custom created synthetic… See the full description on the dataset page: https://huggingface.co/datasets/teknium/OpenHermes-2.5.", + "keywords": [ + "English", + "1M - 10M", + "json", + 
"Text", + "Datasets", + "pandas", + "Croissant", + "Polars", + "🇺🇸 Region: US", + "Synthetic", + "GPT-4", + "Distillation", + "Compilation" + ], + "url": "https://huggingface.co/datasets/teknium/OpenHermes-2.5" +} diff --git a/datasets/1.0/huggingface-open-hermes/output/default.jsonl b/datasets/1.0/huggingface-open-hermes/output/default.jsonl new file mode 100644 index 000000000..8cc470060 --- /dev/null +++ b/datasets/1.0/huggingface-open-hermes/output/default.jsonl @@ -0,0 +1,3 @@ +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "multiple_choice", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} +{"default/custom_instruction": "None", "default/topic": "None", "default/model_name": "None", "default/model": "None", "default/skip_prompt_formatting": false, "default/category": "orca", "default/views": null, "default/language": "None", "default/id": "None", "default/title": "None", "default/idx": "None", "default/hash": "None", "default/avatarUrl": "None", "default/system_prompt": "None", "default/source": "airoboros2.2"} diff --git a/python/mlcroissant/mlcroissant/_src/datasets_test.py b/python/mlcroissant/mlcroissant/_src/datasets_test.py index 79fc1beef..89f52e41b 100644 --- 
a/python/mlcroissant/mlcroissant/_src/datasets_test.py +++ b/python/mlcroissant/mlcroissant/_src/datasets_test.py @@ -247,6 +247,7 @@ def test_nonhermetic_loading(version, dataset_name, record_set_name, num_records ["huggingface-anthropic-hh-rlhf/metadata.json", "red-team-attempts", 10, None], ["huggingface-c4/metadata.json", "data", 1, {"data/variant": "en"}], ["huggingface-levanti/metadata.json", "levanti_train", 10, None], + ["huggingface-open-hermes/metadata.json", "default", 3, None], ], ) def test_nonhermetic_loading_1_0(dataset_name, record_set_name, num_records, filters): diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index d84a6f65b..ce7beb16f 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -57,7 +57,7 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: return pd.Timestamp(value).strftime(transform.format) else: raise ValueError(f"`format` only applies to dates. 
Got {field.data_type}") - elif transform.separator is not None: + elif transform.separator is not None and not _is_na(value): return value.split(transform.separator) return value @@ -76,10 +76,13 @@ def apply_transforms_fn(value: Any, field: Field, repeated: bool = False) -> Any return value +def _is_na(value: Any) -> bool: + return not isinstance(value, (list, np.ndarray)) and pd.isna(value) + + def _cast_value(ctx: Context, value: Any, data_type: type | term.URIRef | None): """Casts the value `value` to the desired target data type `data_type`.""" - is_na = not isinstance(value, (list, np.ndarray)) and pd.isna(value) - if is_na: + if _is_na(value): return value elif data_type == DataType.IMAGE_OBJECT: if isinstance(value, deps.PIL_Image.Image): @@ -145,6 +148,30 @@ def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame: return df +def _populate_repeated_nested_subfield( + value: Any, field: Field, result: dict[str, Any] +) -> dict[str, Any]: + """Populates result with a field's nested subfields.""" + if not field.parent: + raise ValueError( + "Nested subfields can only be populated when the parent field exists!" + ) + parent_id = field.parent.id + if parent_id not in result: + result[parent_id] = ( + [{field.id: v} for v in value] if not _is_na(value) else [{field.id: value}] + ) + else: + if not _is_na(value) and len(value) != len(result[parent_id]): + raise ValueError( + f"Length of {field.id} doesn't match already stored items for " + f"{parent_id}" + ) + for i in range(len(result[parent_id])): + result[parent_id][i][field.id] = value[i] if not _is_na(value) else value + return result + + @dataclasses.dataclass(frozen=True, repr=False) class ReadFields(Operation): """Reads fields in a RecordSet from a Pandas DataFrame and applies transformations. 
@@ -193,9 +220,11 @@ def _get_result(row): ) value = apply_transforms_fn(value, field=field, repeated=is_repeated) if is_repeated: - value = [ - _cast_value(self.node.ctx, v, field.data_type) for v in value - ] + value = ( + [_cast_value(self.node.ctx, v, field.data_type) for v in value] + if not _is_na(value) + else value + ) else: value = _cast_value(self.node.ctx, value, field.data_type) if self.node.ctx.is_v0(): @@ -207,19 +236,9 @@ def _get_result(row): # Repeated nested sub-fields render as a list of dictionaries. if field.parent: if _is_repeated_field(field.parent): - if field.parent.id not in result: - result[field.parent.id] = [ - {field.id: v} for v in value - ] - else: - if len(value) != len(result[field.parent.id]): - raise ValueError( - f"Lenghts of {field.id} doesn't match" - " already stored items for" - f" {field.parent.id}" - ) - for i, v in enumerate(value): - result[field.parent.id][i][field.id] = v + result = _populate_repeated_nested_subfield( + value=value, field=field, result=result + ) # Non-repeated subfields render as a single dictionary. else: if field.parent.id not in result: diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py index 9d76430c9..33fe04cfa 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py @@ -83,6 +83,7 @@ def test_readfield_with_subfields(): f.write("latitude,longitude,names,surnames\n") f.write("1,1,Anna-Maria,Rossi-Bianchi\n") f.write("2,2,Giulia,Ferrari\n") + f.write("1,3,,\n") # Nodes to define metadata. 
distribution = [ FileObject( @@ -195,9 +196,41 @@ def test_readfield_with_subfields(): }, ], }, + { + "main/coordinates": { + "main/coordinates/latitude": 1, + "main/coordinates/longitude": 3, + }, + "main/checked_users": [ + { + "main/checked_users/name": float("nan"), + "main/checked_users/surname": float("nan"), + }, + ], + }, ] result = list(read_field.call(df)) - assert result == expected + for i in range(len(result)): + assert result[i]["main/coordinates"] == expected[i]["main/coordinates"] + if not field._is_na( + result[i]["main/checked_users"][0]["main/checked_users/name"] + ): + assert ( + result[i]["main/checked_users"] + == expected[i]["main/checked_users"] + ) + else: + assert field._is_na( + expected[i]["main/checked_users"][0]["main/checked_users/name"] + ) + assert field._is_na( + expected[i]["main/checked_users"][0][ + "main/checked_users/surname" + ] + ) + assert field._is_na( + result[i]["main/checked_users"][0]["main/checked_users/surname"] + ) @pytest.mark.parametrize(