Fix bug with repeated fields. #763

Merged: 2 commits, Nov 7, 2024
209 changes: 209 additions & 0 deletions datasets/1.0/huggingface-the-cauldron/metadata.json
@@ -0,0 +1,209 @@
{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"dataBiases": "cr:dataBiases",
"dataCollection": "cr:dataCollection",
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
},
"dct": "http://purl.org/dc/terms/",
"extract": "cr:extract",
"field": "cr:field",
"fileProperty": "cr:fileProperty",
"fileObject": "cr:fileObject",
"fileSet": "cr:fileSet",
"format": "cr:format",
"includes": "cr:includes",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"personalSensitiveInformation": "cr:personalSensitiveInformation",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
"repeated": "cr:repeated",
"replace": "cr:replace",
"sc": "https://schema.org/",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform"
},
"@type": "sc:Dataset",
"distribution": [
{
"@type": "cr:FileObject",
"@id": "repo",
"name": "repo",
"description": "The Hugging Face git repository.",
"contentUrl": "https://huggingface.co/datasets/HuggingFaceM4/the_cauldron/tree/refs%2Fconvert%2Fparquet",
"encodingFormat": "git+https",
"sha256": "https://github.com/mlcommons/croissant/issues/80"
},
{
"@type": "cr:FileSet",
"@id": "parquet-files-for-config-ai2d",
"name": "parquet-files-for-config-ai2d",
"description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).",
"containedIn": {
"@id": "repo"
},
"encodingFormat": "application/x-parquet",
"includes": "ai2d/*/*.parquet"
},
{
"@type": "cr:FileSet",
"@id": "parquet-files-for-config-aokvqa",
"name": "parquet-files-for-config-aokvqa",
"description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).",
"containedIn": {
"@id": "repo"
},
"encodingFormat": "application/x-parquet",
"includes": "aokvqa/*/*.parquet"
},
{
"@type": "cr:FileSet",
"@id": "parquet-files-for-config-chart2text",
"name": "parquet-files-for-config-chart2text",
"description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).",
"containedIn": {
"@id": "repo"
},
"encodingFormat": "application/x-parquet",
"includes": "chart2text/*/*.parquet"
}
],
"recordSet": [
{
"@type": "cr:RecordSet",
"@id": "ai2d",
"name": "ai2d",
"description": "HuggingFaceM4/the_cauldron - 'ai2d' subset\n\nAdditional information:\n- 1 skipped column: texts",
"field": [
{
"@type": "cr:Field",
"@id": "ai2d/images",
"name": "ai2d/images",
"description": "Image column 'images' from the Hugging Face parquet file.",
"dataType": "sc:ImageObject",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-ai2d"
},
"extract": {
"column": "images"
},
"transform": {
"jsonPath": "bytes"
}
},
"repeated": true
}
]
},
{
"@type": "cr:RecordSet",
"@id": "aokvqa",
"name": "aokvqa",
"description": "HuggingFaceM4/the_cauldron - 'aokvqa' subset\n\nAdditional information:\n- 1 skipped column: texts",
"field": [
{
"@type": "cr:Field",
"@id": "aokvqa/images",
"name": "aokvqa/images",
"description": "Image column 'images' from the Hugging Face parquet file.",
"dataType": "sc:ImageObject",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-aokvqa"
},
"extract": {
"column": "images"
},
"transform": {
"jsonPath": "bytes"
}
},
"repeated": true
}
]
},
{
"@type": "cr:RecordSet",
"@id": "chart2text",
"name": "chart2text",
"description": "HuggingFaceM4/the_cauldron - 'chart2text' subset\n\nAdditional information:\n- 1 skipped column: texts",
"field": [
{
"@type": "cr:Field",
"@id": "chart2text/images",
"name": "chart2text/images",
"description": "Image column 'images' from the Hugging Face parquet file.",
"dataType": "sc:ImageObject",
"source": {
"fileSet": {
"@id": "parquet-files-for-config-chart2text"
},
"extract": {
"column": "images"
},
"transform": {
"jsonPath": "bytes"
}
},
"repeated": true
}
]
}
],
"conformsTo": "http://mlcommons.org/croissant/1.0",
"name": "the_cauldron",
"description": "\n\t\n\t\t\n\t\n\t\n\t\tDataset Card for The Cauldron\n\t\n\n\n\n\t\n\t\t\n\t\n\t\n\t\tDataset description\n\t\n\nThe Cauldron is part of the Idefics2 release.\nIt is a massive collection of 50 vision-language datasets (training sets only) that were used for the fine-tuning of the vision-language model Idefics2.\n\n\t\n\t\t\n\t\n\t\n\t\tLoad the dataset\n\t\n\nTo load the dataset, install the library datasets with pip install datasets. Then,\nfrom datasets import load_dataset\nds = load_dataset(\"HuggingFaceM4/the_cauldron\", \"ai2d\")\n\nto download… See the full description on the dataset page: https://huggingface.co/datasets/HuggingFaceM4/the_cauldron.",
"creator": {
"@type": "sc:Organization",
"name": "HuggingFaceM4",
"url": "https://huggingface.co/HuggingFaceM4"
},
"keywords": [
"1M - 10M",
"parquet",
"Image",
"Text",
"Datasets",
"Dask",
"Croissant",
"Polars",
"arxiv:1603.07396",
"arxiv:2206.01718",
"arxiv:2208.05358",
"arxiv:1612.06890",
"arxiv:2310.00367",
"arxiv:1710.07300",
"arxiv:2312.12241",
"arxiv:1912.03098",
"arxiv:2211.08545",
"arxiv:2306.05425",
"arxiv:1709.00103",
"arxiv:2003.12462",
"arxiv:1612.00837",
"arxiv:2205.00363",
"arxiv:2403.09029",
"arxiv:2405.02246",
"🇺🇸 Region: US"
],
"url": "https://huggingface.co/datasets/HuggingFaceM4/the_cauldron"
}
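The record sets above declare each images column as a repeated sc:ImageObject field (note "repeated": true), which is exactly the shape of data this PR's fix exercises. A minimal sketch of materializing one record set with the mlcroissant Python client, assuming the relative path of the metadata.json added here:

import mlcroissant as mlc

# Load the Croissant metadata added in this PR; the relative path is an assumption.
dataset = mlc.Dataset(jsonld="datasets/1.0/huggingface-the-cauldron/metadata.json")

# Because "ai2d/images" is declared with "repeated": true, each record holds a
# list of images rather than a single image.
for record in dataset.records(record_set="ai2d"):
    print(record["ai2d/images"])
    break

The expected records for the ai2d record set are captured in output/ai2d.jsonl below.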
3 changes: 3 additions & 0 deletions datasets/1.0/huggingface-the-cauldron/output/ai2d.jsonl
@@ -0,0 +1,3 @@
{"ai2d/images": "[<PIL.PngImagePlugin.PngImageFile image mode=RGB size=299x227 at <MEMORY_ADDRESS>>]"}
{"ai2d/images": "[<PIL.PngImagePlugin.PngImageFile image mode=RGB size=715x517 at <MEMORY_ADDRESS>>]"}
{"ai2d/images": "[<PIL.PngImagePlugin.PngImageFile image mode=RGB size=32<MEMORY_ADDRESS> at <MEMORY_ADDRESS>>]"}
@@ -71,7 +71,7 @@ def apply_transforms_fn(value: Any, field: Field, repeated: bool = False) -> Any
return value
transforms = source.transforms
for transform in transforms:
- if repeated and isinstance(value, list):
+ if repeated and isinstance(value, (list, np.ndarray)):
value = [_apply_transform_fn(v, transform, field) for v in value]
else:
value = _apply_transform_fn(value, transform, field)
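For context, repeated values read from Parquet often surface as NumPy arrays rather than Python lists, so the previous isinstance(value, list) check fell through to the scalar branch and skipped the per-item transform. A standalone sketch of the behavior the widened check enables (not the project's actual code; bytes.decode stands in for a Croissant transform):

import numpy as np

def apply_item_transform(value, repeated: bool):
    transform = bytes.decode  # stand-in for a per-item transform
    if repeated and isinstance(value, (list, np.ndarray)):
        # Repeated field: apply the transform element-wise.
        return [transform(v) for v in value]
    return transform(value)

# A repeated column read from Parquet may arrive as an object ndarray; with the
# old list-only check this value would have hit the scalar branch instead.
print(apply_item_transform(np.array([b"a", b"b"], dtype=object), repeated=True))  # ['a', 'b']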
10 changes: 5 additions & 5 deletions python/mlcroissant/recipes/tfds_croissant_builder.ipynb
@@ -522,7 +522,7 @@
" image = image.view(image.size()[0], -1).to(torch.float32)\n",
" return self.classifier(image)\n",
"\n",
"shape = train[0][\"fashion_mnist/image\"].shape\n",
"shape = train[0][\"image\"].shape\n",
"num_classes = 10\n",
"model = LinearClassifier(shape, num_classes)\n",
"optimizer = torch.optim.Adam(model.parameters())\n",
@@ -531,8 +531,8 @@
"print('Training...')\n",
"model.train()\n",
"for example in tqdm(train_loader):\n",
" image = example['fashion_mnist/image']\n",
" label = example['fashion_mnist/label']\n",
" image = example['image']\n",
" label = example['label']\n",
" prediction = model(image)\n",
" loss = loss_function(prediction, label)\n",
" optimizer.zero_grad()\n",
@@ -544,8 +544,8 @@
"num_examples = 0\n",
"true_positives = 0\n",
"for example in tqdm(test_loader):\n",
" image = example['fashion_mnist/image']\n",
" label = example['fashion_mnist/label']\n",
" image = example['image']\n",
" label = example['label']\n",
" prediction = model(image)\n",
" num_examples += image.shape[0]\n",
" predicted_label = prediction.argmax(dim=1)\n",