diff --git a/datasets/0.8/audio_test/metadata.json b/datasets/0.8/audio_test/metadata.json index c777bf9bd..ee2c7e999 100644 --- a/datasets/0.8/audio_test/metadata.json +++ b/datasets/0.8/audio_test/metadata.json @@ -40,7 +40,7 @@ "@type": "sc:Dataset", "name": "audio_test", "description": "This is the basic test case for audio files", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", "distribution": [ { diff --git a/datasets/0.8/bigcode-the-stack/metadata.json b/datasets/0.8/bigcode-the-stack/metadata.json index 2e771cc15..c3a004f52 100644 --- a/datasets/0.8/bigcode-the-stack/metadata.json +++ b/datasets/0.8/bigcode-the-stack/metadata.json @@ -41,7 +41,7 @@ "description": "The Stack contains over 6TB of permissively-licensed source code files covering 358 programming languages. The dataset was created as part of the BigCode Project, an open scientific collaboration working on the responsible development of Large Language Models for Code (Code LLMs). The Stack serves as a pre-training dataset for Code LLMs, i.e., code-generating AI systems which enable the synthesis of programs from natural language descriptions as well as other from code snippets.", "citation": "@article{Kocetkov2022TheStack, title={The Stack: 3 TB of permissively licensed source code}, author={Kocetkov, Denis and Li, Raymond and Ben Allal, Loubna and Li, Jia and Mou,Chenghao and Mu\u00f1oz Ferrandis, Carlos and Jernite, Yacine and Mitchell, Margaret and Hughes, Sean and Wolf, Thomas and Bahdanau, Dzmitry and von Werra, Leandro and de Vries, Harm}, journal={Preprint}, year={2022} }", "license": "other", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/bigcode/the-stack", "distribution": [ { diff --git a/datasets/0.8/coco2014-mini/metadata.json b/datasets/0.8/coco2014-mini/metadata.json index b32da4723..dc5847aea 100644 --- a/datasets/0.8/coco2014-mini/metadata.json +++ b/datasets/0.8/coco2014-mini/metadata.json @@ -42,7 +42,7 @@ "description": "Smaller downloadable version of COCO to be used in unit tests.", "citation": "None", "license": "cc-by-4.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", "version": "1.0.0", "distribution": [ diff --git a/datasets/0.8/coco2014/metadata.json b/datasets/0.8/coco2014/metadata.json index 94ceb530c..2007832a7 100644 --- a/datasets/0.8/coco2014/metadata.json +++ b/datasets/0.8/coco2014/metadata.json @@ -42,7 +42,7 @@ "description": "COCO is a large-scale object detection, segmentation, and captioning dataset. WARNING: `metadata.json` is incomplete and does not fully define the COCO2014 dataset. It lacks `recordSet` definitions that would enable automatic loading of all the annotations.", "citation": "@article{DBLP:journals/corr/LinMBHPRDZ14,\n author = {Tsung{-}Yi Lin and\n Michael Maire and\n Serge J. Belongie and\n Lubomir D. Bourdev and\n Ross B. Girshick and\n James Hays and\n Pietro Perona and\n Deva Ramanan and\n Piotr Doll{'{a}}r and\n C. Lawrence Zitnick},\n title = {Microsoft {COCO:} Common Objects in Context},\n journal = {CoRR},\n volume = {abs/1405.0312},\n year = {2014},\n url = {http://arxiv.org/abs/1405.0312},\n archivePrefix = {arXiv},\n eprint = {1405.0312},\n timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}", "license": "cc-by-4.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://cocodataset.org/", "distribution": [ { diff --git a/datasets/0.8/fashion-mnist/metadata.json b/datasets/0.8/fashion-mnist/metadata.json index 0683c78c4..e79d99bad 100644 --- a/datasets/0.8/fashion-mnist/metadata.json +++ b/datasets/0.8/fashion-mnist/metadata.json @@ -41,7 +41,7 @@ "description": "Fashion-MNIST is a dataset of Zalando's article images\u2014consisting of a training set of\n60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image,\nassociated with a label from 10 classes. We intend Fashion-MNIST to serve as a direct drop-in\nreplacement for the original MNIST dataset for benchmarking machine learning algorithms.\nIt shares the same image size and structure of training and testing splits.\n", "citation": "@article{DBLP:journals/corr/abs-1708-07747,\n author = {Han Xiao and\n Kashif Rasul and\n Roland Vollgraf},\n title = {Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning\n Algorithms},\n journal = {CoRR},\n volume = {abs/1708.07747},\n year = {2017},\n url = {http://arxiv.org/abs/1708.07747},\n archivePrefix = {arXiv},\n eprint = {1708.07747},\n timestamp = {Mon, 13 Aug 2018 16:47:27 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/abs-1708-07747},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "license": "mit", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/fashion_mnist", "distribution": [ { diff --git a/datasets/0.8/flores-200/metadata.json b/datasets/0.8/flores-200/metadata.json index 538cc12fa..bbd74a8d6 100644 --- a/datasets/0.8/flores-200/metadata.json +++ b/datasets/0.8/flores-200/metadata.json @@ -45,7 +45,7 @@ "@inproceedings{twoeval, title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, author={Guzm\\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, journal={arXiv preprint arXiv:1902.01382}, year={2019}}" ], "license": "cc-by-sa-4.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/facebookresearch/flores", "version": "0.0.1", "distribution": [ diff --git a/datasets/0.8/gpt-3/metadata.json b/datasets/0.8/gpt-3/metadata.json index c0e4b9885..6656dd570 100644 --- a/datasets/0.8/gpt-3/metadata.json +++ b/datasets/0.8/gpt-3/metadata.json @@ -41,7 +41,7 @@ "description": "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions \u2013 something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.", "citation": "@article{brown2020language, title={Language Models are Few-Shot Learners}, author={Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei}, year={2020}, eprint={2005.14165}, archivePrefix={arXiv}, primaryClass={cs.CL} }", "license": "unknown", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/openai/gpt-3", "distribution": [ { diff --git a/datasets/0.8/huggingface-c4/metadata.json b/datasets/0.8/huggingface-c4/metadata.json index 80eceda61..f3f3f2c41 100644 --- a/datasets/0.8/huggingface-c4/metadata.json +++ b/datasets/0.8/huggingface-c4/metadata.json @@ -41,7 +41,7 @@ "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "citation": "\n@article{2019t5,\n author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},\n title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},\n journal = {arXiv e-prints},\n year = {2019},\n archivePrefix = {arXiv},\n eprint = {1910.10683},\n}\n", "license": "odc-by", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/c4", "version": "0.0.0", "distribution": [ diff --git a/datasets/0.8/huggingface-mnist/metadata.json b/datasets/0.8/huggingface-mnist/metadata.json index 25f124f1d..5b491960b 100644 --- a/datasets/0.8/huggingface-mnist/metadata.json +++ b/datasets/0.8/huggingface-mnist/metadata.json @@ -41,7 +41,7 @@ "description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.\n", "citation": "@article{lecun2010mnist,\n title={MNIST handwritten digit database},\n author={LeCun, Yann and Cortes, Corinna and Burges, CJ},\n journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},\n volume={2},\n year={2010}\n}\n", "license": "mit", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/mnist", "version": "1.0.0", "distribution": [ diff --git a/datasets/0.8/movielens/metadata.json b/datasets/0.8/movielens/metadata.json index 727225624..a70b01dc3 100644 --- a/datasets/0.8/movielens/metadata.json +++ b/datasets/0.8/movielens/metadata.json @@ -39,7 +39,7 @@ "@type": "sc:Dataset", "name": "Movielens-25M", "description": "MovieLens 25M movie ratings. Stable benchmark dataset. 25 million ratings and one million tag applications applied to 62,000 movies by 162,000 users. Includes tag genome data with 15 million relevance scores across 1,129 tags. Released 12/2019", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://grouplens.org/datasets/movielens/25m/", "distribution": [ { diff --git a/datasets/0.8/pass-mini/metadata.json b/datasets/0.8/pass-mini/metadata.json index dd4edf784..f66a0f40d 100755 --- a/datasets/0.8/pass-mini/metadata.json +++ b/datasets/0.8/pass-mini/metadata.json @@ -41,7 +41,7 @@ "description": "Smaller downloadable version of PASS to be used in unit tests.", "citation": "None", "license": "None", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", "distribution": [ { diff --git a/datasets/0.8/pass/metadata.json b/datasets/0.8/pass/metadata.json index d5aa4a1c1..1feb13325 100755 --- a/datasets/0.8/pass/metadata.json +++ b/datasets/0.8/pass/metadata.json @@ -41,7 +41,7 @@ "description": "PASS is a large-scale image dataset that does not include any humans and which can be used for high-quality pretraining while significantly reducing privacy concerns.", "citation": "@Article{asano21pass, author = \"Yuki M. Asano and Christian Rupprecht and Andrew Zisserman and Andrea Vedaldi\", title = \"PASS: An ImageNet replacement for self-supervised pretraining without humans\", journal = \"NeurIPS Track on Datasets and Benchmarks\", year = \"2021\" }", "license": "cc-by-4.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.robots.ox.ac.uk/~vgg/data/pass/", "distribution": [ { diff --git a/datasets/0.8/recipes/compressed_archive.json b/datasets/0.8/recipes/compressed_archive.json index d890bee99..2015dcbb5 100644 --- a/datasets/0.8/recipes/compressed_archive.json +++ b/datasets/0.8/recipes/compressed_archive.json @@ -39,7 +39,7 @@ "@type": "sc:Dataset", "name": "compressed_archive_example", "description": "This is a fairly minimal example, showing a way to describe archive files.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/datasets/recipes/compressed_archive/about", "distribution": [ { diff --git a/datasets/0.8/recipes/enum.json b/datasets/0.8/recipes/enum.json index 436dca7ac..160c6ae5b 100644 --- a/datasets/0.8/recipes/enum.json +++ b/datasets/0.8/recipes/enum.json @@ -39,7 +39,7 @@ "@type": "sc:Dataset", "name": "enum_example", "description": "This is a fairly minimal example, showing a way to describe enumerations.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/datasets/enum/about", "distribution": [ { diff --git a/datasets/0.8/recipes/file_object_in_zip.json b/datasets/0.8/recipes/file_object_in_zip.json index 9d1a5fdfc..83b38c91e 100644 --- a/datasets/0.8/recipes/file_object_in_zip.json +++ b/datasets/0.8/recipes/file_object_in_zip.json @@ -39,7 +39,7 @@ "@type": "sc:Dataset", "name": "file_object_in_zip", "description": "Minimal example to read a FileObject contained in a zip.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/recipes/minimal.json b/datasets/0.8/recipes/minimal.json index ad9d3cab1..b1993bc41 100644 --- a/datasets/0.8/recipes/minimal.json +++ b/datasets/0.8/recipes/minimal.json @@ -39,6 +39,6 @@ "@type": "sc:Dataset", "name": "minimal_example", "description": "This is a very minimal example, with only the required fields.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/dataset/minimal/about" } diff --git a/datasets/0.8/recipes/minimal_recommended.json b/datasets/0.8/recipes/minimal_recommended.json index e493ab22c..f90e506b6 100644 --- a/datasets/0.8/recipes/minimal_recommended.json +++ b/datasets/0.8/recipes/minimal_recommended.json @@ -40,7 +40,7 @@ "name": "minimal_example_with_recommended_fields", "description": "This is a minimal example, including the required and the recommended fields.", "license": "https://creativecommons.org/licenses/by/4.0/", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/dataset/recipes/minimal-recommended", "distribution": [ { diff --git a/datasets/0.8/recipes/read_binary_file_by_line.json b/datasets/0.8/recipes/read_binary_file_by_line.json index eed64e7b0..27af1bde3 100644 --- a/datasets/0.8/recipes/read_binary_file_by_line.json +++ b/datasets/0.8/recipes/read_binary_file_by_line.json @@ -39,7 +39,7 @@ "@type": "sc:Dataset", "name": "read_binary_file_by_line", "description": "This is a recipe illustrating how to read files line by line.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/recipes/read_from_directory.json b/datasets/0.8/recipes/read_from_directory.json index 56b6dbfa1..38b08f30e 100644 --- a/datasets/0.8/recipes/read_from_directory.json +++ b/datasets/0.8/recipes/read_from_directory.json @@ -40,7 +40,7 @@ "@type": "sc:Dataset", "name": "read_from_directory", "description": "Minimal example showing how to read from local directories.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/mlcommons/croissant", "distribution": [ { diff --git a/datasets/0.8/recipes/read_from_tar.json b/datasets/0.8/recipes/read_from_tar.json index c7d77a841..89f306b96 100644 --- a/datasets/0.8/recipes/read_from_tar.json +++ b/datasets/0.8/recipes/read_from_tar.json @@ -39,7 +39,7 @@ "@type": "sc:Dataset", "name": "read_from_tar", "description": "Example dataset to read several FileSets from a tar.gz and join them.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/recipes/simple-split.json b/datasets/0.8/recipes/simple-split.json index 924a336ce..1e93d84db 100644 --- a/datasets/0.8/recipes/simple-split.json +++ b/datasets/0.8/recipes/simple-split.json @@ -41,7 +41,7 @@ "name": "simple-split", "description": "An artificial example dataset defining splits from a CSV column", "license": "https://creativecommons.org/licenses/by/4.0/", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/simple-join/metadata.json b/datasets/0.8/simple-join/metadata.json index d3d986be8..fc44a1e42 100644 --- a/datasets/0.8/simple-join/metadata.json +++ b/datasets/0.8/simple-join/metadata.json @@ -40,7 +40,7 @@ "name": "simple-join", "description": "Example to showcase the use of join.", "license": "https://creativecommons.org/licenses/by/4.0/", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/simple-parquet/metadata.json b/datasets/0.8/simple-parquet/metadata.json index 77f178efc..15ff43f8e 100644 --- a/datasets/0.8/simple-parquet/metadata.json +++ b/datasets/0.8/simple-parquet/metadata.json @@ -40,7 +40,7 @@ "name": "simple-parquet", "description": "Example to read Parquet files.", "license": "https://creativecommons.org/licenses/by/4.0/", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/0.8/titanic/metadata.json b/datasets/0.8/titanic/metadata.json index 6cb480dc2..3468faecd 100644 --- a/datasets/0.8/titanic/metadata.json +++ b/datasets/0.8/titanic/metadata.json @@ -42,7 +42,7 @@ "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citation": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "afl-3.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.openml.org/d/40945", "version": "1.0.0", "distribution": [ diff --git a/datasets/0.8/wiki-text/metadata.json b/datasets/0.8/wiki-text/metadata.json index d6710f9a3..aad22e37a 100644 --- a/datasets/0.8/wiki-text/metadata.json +++ b/datasets/0.8/wiki-text/metadata.json @@ -42,7 +42,7 @@ "description": "The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike License.\n\nCompared to the preprocessed version of Penn Treebank (PTB), WikiText-2 is over 2 times larger and WikiText-103 is over 110 times larger. The WikiText dataset also features a far larger vocabulary and retains the original case, punctuation and numbers - all of which are removed in PTB. As it is composed of full articles, the dataset is well suited for models that can take advantage of long term dependencies.", "citation": "@article{merity2016pointer, title={Pointer sentinel mixture models}, author={Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard}, journal={arXiv preprint arXiv:1609.07843}, year={2016} }", "license": "cc-by-sa-3.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/", "distribution": [ { diff --git a/datasets/0.8/world-happiness/metadata.json b/datasets/0.8/world-happiness/metadata.json index 431ae9912..08de0472f 100644 --- a/datasets/0.8/world-happiness/metadata.json +++ b/datasets/0.8/world-happiness/metadata.json @@ -41,7 +41,7 @@ "description": "Happiness scored according to economic production, social support, etc.", "citation": "None", "license": "cc0-1.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.kaggle.com/datasets/unsdsn/world-happiness", "distribution": [ { diff --git a/datasets/1.0/audio_test/metadata.json b/datasets/1.0/audio_test/metadata.json index 7aaa944d6..e2cbf9637 100644 --- a/datasets/1.0/audio_test/metadata.json +++ b/datasets/1.0/audio_test/metadata.json @@ -48,7 +48,7 @@ "name": "audio_test", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "This is the basic test case for audio files", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", "distribution": [ { diff --git a/datasets/1.0/bigcode-the-stack/metadata.json b/datasets/1.0/bigcode-the-stack/metadata.json index 5c51d4942..ece9b34bb 100644 --- a/datasets/1.0/bigcode-the-stack/metadata.json +++ b/datasets/1.0/bigcode-the-stack/metadata.json @@ -59,7 +59,7 @@ ], "citeAs": "@article{Kocetkov2022TheStack, title={The Stack: 3 TB of permissively licensed source code}, author={Kocetkov, Denis and Li, Raymond and Ben Allal, Loubna and Li, Jia and Mou,Chenghao and Mu\u00f1oz Ferrandis, Carlos and Jernite, Yacine and Mitchell, Margaret and Hughes, Sean and Wolf, Thomas and Bahdanau, Dzmitry and von Werra, Leandro and de Vries, Harm}, journal={Preprint}, year={2022} }", "license": "other", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/bigcode/the-stack", "distribution": [ { diff --git a/datasets/1.0/coco2014-mini/metadata.json b/datasets/1.0/coco2014-mini/metadata.json index 4fd34f22a..47c7a32e6 100644 --- a/datasets/1.0/coco2014-mini/metadata.json +++ b/datasets/1.0/coco2014-mini/metadata.json @@ -50,7 +50,7 @@ "description": "Smaller downloadable version of COCO to be used in unit tests.", "citeAs": "None", "license": "cc-by-4.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", "version": "1.0.0", "distribution": [ diff --git a/datasets/1.0/coco2014/metadata.json b/datasets/1.0/coco2014/metadata.json index 92d8ca07b..479674959 100644 --- a/datasets/1.0/coco2014/metadata.json +++ b/datasets/1.0/coco2014/metadata.json @@ -50,7 +50,7 @@ "description": "COCO is a large-scale object detection, segmentation, and captioning dataset. WARNING: `metadata.json` is incomplete and does not fully define the COCO2014 dataset. It lacks `recordSet` definitions that would enable automatic loading of all the annotations.", "citeAs": "@article{DBLP:journals/corr/LinMBHPRDZ14,\n author = {Tsung{-}Yi Lin and\n Michael Maire and\n Serge J. Belongie and\n Lubomir D. Bourdev and\n Ross B. Girshick and\n James Hays and\n Pietro Perona and\n Deva Ramanan and\n Piotr Doll{'{a}}r and\n C. Lawrence Zitnick},\n title = {Microsoft {COCO:} Common Objects in Context},\n journal = {CoRR},\n volume = {abs/1405.0312},\n year = {2014},\n url = {http://arxiv.org/abs/1405.0312},\n archivePrefix = {arXiv},\n eprint = {1405.0312},\n timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}", "license": "cc-by-4.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://cocodataset.org/", "distribution": [ { diff --git a/datasets/1.0/fashion-mnist/metadata.json b/datasets/1.0/fashion-mnist/metadata.json index ed09efdde..c327ad2b8 100644 --- a/datasets/1.0/fashion-mnist/metadata.json +++ b/datasets/1.0/fashion-mnist/metadata.json @@ -49,7 +49,7 @@ "description": "Fashion-MNIST is a dataset of Zalando's article images\u2014consisting of a training set of\n60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image,\nassociated with a label from 10 classes. We intend Fashion-MNIST to serve as a direct drop-in\nreplacement for the original MNIST dataset for benchmarking machine learning algorithms.\nIt shares the same image size and structure of training and testing splits.\n", "citeAs": "@article{DBLP:journals/corr/abs-1708-07747,\n author = {Han Xiao and\n Kashif Rasul and\n Roland Vollgraf},\n title = {Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning\n Algorithms},\n journal = {CoRR},\n volume = {abs/1708.07747},\n year = {2017},\n url = {http://arxiv.org/abs/1708.07747},\n archivePrefix = {arXiv},\n eprint = {1708.07747},\n timestamp = {Mon, 13 Aug 2018 16:47:27 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/abs-1708-07747},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n", "license": "mit", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/fashion_mnist", "distribution": [ { diff --git a/datasets/1.0/flores-200/metadata.json b/datasets/1.0/flores-200/metadata.json index 322baad14..64c6bde5e 100644 --- a/datasets/1.0/flores-200/metadata.json +++ b/datasets/1.0/flores-200/metadata.json @@ -53,7 +53,7 @@ "@inproceedings{twoeval, title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, author={Guzm\\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, journal={arXiv preprint arXiv:1902.01382}, year={2019}}" ], "license": "cc-by-sa-4.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/facebookresearch/flores", "version": "0.0.1", "distribution": [ diff --git a/datasets/1.0/gpt-3/metadata.json b/datasets/1.0/gpt-3/metadata.json index 51119c377..df03dc2f1 100644 --- a/datasets/1.0/gpt-3/metadata.json +++ b/datasets/1.0/gpt-3/metadata.json @@ -49,7 +49,7 @@ "description": "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions \u2013 something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.", "citeAs": "@article{brown2020language, title={Language Models are Few-Shot Learners}, author={Tom B. Brown and Benjamin Mann and Nick Ryder and Melanie Subbiah and Jared Kaplan and Prafulla Dhariwal and Arvind Neelakantan and Pranav Shyam and Girish Sastry and Amanda Askell and Sandhini Agarwal and Ariel Herbert-Voss and Gretchen Krueger and Tom Henighan and Rewon Child and Aditya Ramesh and Daniel M. Ziegler and Jeffrey Wu and Clemens Winter and Christopher Hesse and Mark Chen and Eric Sigler and Mateusz Litwin and Scott Gray and Benjamin Chess and Jack Clark and Christopher Berner and Sam McCandlish and Alec Radford and Ilya Sutskever and Dario Amodei}, year={2020}, eprint={2005.14165}, archivePrefix={arXiv}, primaryClass={cs.CL} }", "license": "unknown", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/openai/gpt-3", "distribution": [ { diff --git a/datasets/1.0/huggingface-c4/metadata.json b/datasets/1.0/huggingface-c4/metadata.json index 2810be904..e5e7ce420 100644 --- a/datasets/1.0/huggingface-c4/metadata.json +++ b/datasets/1.0/huggingface-c4/metadata.json @@ -49,7 +49,7 @@ "description": "A colossal, cleaned version of Common Crawl's web crawl corpus.\n\nBased on Common Crawl dataset: \"https://commoncrawl.org\".\n\nThis is the processed version of Google's C4 dataset by AllenAI.\n", "citeAs": "\n@article{2019t5,\n author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},\n title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},\n journal = {arXiv e-prints},\n year = {2019},\n archivePrefix = {arXiv},\n eprint = {1910.10683},\n}\n", "license": "odc-by", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/c4", "version": "0.0.0", "distribution": [ diff --git a/datasets/1.0/huggingface-mnist/metadata.json b/datasets/1.0/huggingface-mnist/metadata.json index b7531dbee..ce6a532c6 100644 --- a/datasets/1.0/huggingface-mnist/metadata.json +++ b/datasets/1.0/huggingface-mnist/metadata.json @@ -49,7 +49,7 @@ "description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.\n", "citeAs": "@article{lecun2010mnist,\n title={MNIST handwritten digit database},\n author={LeCun, Yann and Cortes, Corinna and Burges, CJ},\n journal={ATT Labs [Online]. Available: http://yann.lecun.com/exdb/mnist},\n volume={2},\n year={2010}\n}\n", "license": "mit", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://huggingface.co/datasets/mnist", "version": "1.0.0", "distribution": [ diff --git a/datasets/1.0/movielens/metadata.json b/datasets/1.0/movielens/metadata.json index 61dfda9c4..b9b9f2284 100644 --- a/datasets/1.0/movielens/metadata.json +++ b/datasets/1.0/movielens/metadata.json @@ -47,7 +47,7 @@ "name": "Movielens-25M", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "MovieLens 25M movie ratings. Stable benchmark dataset. 25 million ratings and one million tag applications applied to 62,000 movies by 162,000 users. Includes tag genome data with 15 million relevance scores across 1,129 tags. Released 12/2019", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://grouplens.org/datasets/movielens/25m/", "distribution": [ { diff --git a/datasets/1.0/pass-mini/metadata.json b/datasets/1.0/pass-mini/metadata.json index ea9a935cf..d90bd6642 100755 --- a/datasets/1.0/pass-mini/metadata.json +++ b/datasets/1.0/pass-mini/metadata.json @@ -49,7 +49,7 @@ "description": "Smaller downloadable version of PASS to be used in unit tests.", "citeAs": "None", "license": "None", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "None", "distribution": [ { diff --git a/datasets/1.0/pass/metadata.json b/datasets/1.0/pass/metadata.json index e3ff6b080..71c78fc4f 100755 --- a/datasets/1.0/pass/metadata.json +++ b/datasets/1.0/pass/metadata.json @@ -49,7 +49,7 @@ "description": "PASS is a large-scale image dataset that does not include any humans and which can be used for high-quality pretraining while significantly reducing privacy concerns.", "citeAs": "@Article{asano21pass, author = \"Yuki M. Asano and Christian Rupprecht and Andrew Zisserman and Andrea Vedaldi\", title = \"PASS: An ImageNet replacement for self-supervised pretraining without humans\", journal = \"NeurIPS Track on Datasets and Benchmarks\", year = \"2021\" }", "license": "cc-by-4.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.robots.ox.ac.uk/~vgg/data/pass/", "distribution": [ { diff --git a/datasets/1.0/recipes/compressed_archive.json b/datasets/1.0/recipes/compressed_archive.json index e365f2b32..93ab6d778 100644 --- a/datasets/1.0/recipes/compressed_archive.json +++ b/datasets/1.0/recipes/compressed_archive.json @@ -47,7 +47,7 @@ "name": "compressed_archive_example", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "This is a fairly minimal example, showing a way to describe archive files.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/datasets/recipes/compressed_archive/about", "distribution": [ { diff --git a/datasets/1.0/recipes/enum.json b/datasets/1.0/recipes/enum.json index e6ba7b1d2..10034c067 100644 --- a/datasets/1.0/recipes/enum.json +++ b/datasets/1.0/recipes/enum.json @@ -47,7 +47,7 @@ "name": "enum_example", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "This is a fairly minimal example, showing a way to describe enumerations.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/datasets/enum/about", "distribution": [ { diff --git a/datasets/1.0/recipes/file_object_in_zip.json b/datasets/1.0/recipes/file_object_in_zip.json index 981378436..de51e5b12 100644 --- a/datasets/1.0/recipes/file_object_in_zip.json +++ b/datasets/1.0/recipes/file_object_in_zip.json @@ -47,7 +47,7 @@ "name": "file_object_in_zip", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "Minimal example to read a FileObject contained in a zip.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/recipes/minimal.json b/datasets/1.0/recipes/minimal.json index e2a5710e8..968378c19 100644 --- a/datasets/1.0/recipes/minimal.json +++ b/datasets/1.0/recipes/minimal.json @@ -47,6 +47,6 @@ "name": "minimal_example", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "This is a very minimal example, with only the required fields.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/dataset/minimal/about" } diff --git a/datasets/1.0/recipes/minimal_recommended.json b/datasets/1.0/recipes/minimal_recommended.json index 7f3e8c107..3b6eabe7f 100644 --- a/datasets/1.0/recipes/minimal_recommended.json +++ b/datasets/1.0/recipes/minimal_recommended.json @@ -48,7 +48,7 @@ "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "This is a minimal example, including the required and the recommended fields.", "license": "https://creativecommons.org/licenses/by/4.0/", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://example.com/dataset/recipes/minimal-recommended", "distribution": [ { diff --git a/datasets/1.0/recipes/read_binary_file_by_line.json b/datasets/1.0/recipes/read_binary_file_by_line.json index b92819569..a201624f3 100644 --- a/datasets/1.0/recipes/read_binary_file_by_line.json +++ b/datasets/1.0/recipes/read_binary_file_by_line.json @@ -47,7 +47,7 @@ "name": "read_binary_file_by_line", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "This is a recipe illustrating how to read files line by line.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/recipes/read_from_directory.json b/datasets/1.0/recipes/read_from_directory.json index 7ca13db76..682885adb 100644 --- a/datasets/1.0/recipes/read_from_directory.json +++ b/datasets/1.0/recipes/read_from_directory.json @@ -48,7 +48,7 @@ "name": "read_from_directory", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "Minimal example showing how to read from local directories.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://github.com/mlcommons/croissant", "distribution": [ { diff --git a/datasets/1.0/recipes/read_from_tar.json b/datasets/1.0/recipes/read_from_tar.json index ebf50059a..233286502 100644 --- a/datasets/1.0/recipes/read_from_tar.json +++ b/datasets/1.0/recipes/read_from_tar.json @@ -47,7 +47,7 @@ "name": "read_from_tar", "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "Example dataset to read several FileSets from a tar.gz and join them.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/recipes/simple-split.json b/datasets/1.0/recipes/simple-split.json index 7bb4f4de3..8671ff06f 100644 --- a/datasets/1.0/recipes/simple-split.json +++ b/datasets/1.0/recipes/simple-split.json @@ -49,7 +49,7 @@ "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "An artificial example dataset defining splits from a CSV column", "license": "https://creativecommons.org/licenses/by/4.0/", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/simple-join/metadata.json b/datasets/1.0/simple-join/metadata.json index 3d1b02dbb..a83163f34 100644 --- a/datasets/1.0/simple-join/metadata.json +++ b/datasets/1.0/simple-join/metadata.json @@ -48,7 +48,7 @@ "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "Example to showcase the use of join.", "license": "https://creativecommons.org/licenses/by/4.0/", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/simple-parquet/metadata.json b/datasets/1.0/simple-parquet/metadata.json index 4e626dd4b..107203ee1 100644 --- a/datasets/1.0/simple-parquet/metadata.json +++ b/datasets/1.0/simple-parquet/metadata.json @@ -48,7 +48,7 @@ "conformsTo": "http://mlcommons.org/croissant/1.0", "description": "Example to read Parquet files.", "license": "https://creativecommons.org/licenses/by/4.0/", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://mlcommons.org", "distribution": [ { diff --git a/datasets/1.0/titanic/metadata.json b/datasets/1.0/titanic/metadata.json index ba80baf71..9b190192b 100644 --- a/datasets/1.0/titanic/metadata.json +++ b/datasets/1.0/titanic/metadata.json @@ -50,7 +50,7 @@ "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citeAs": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "afl-3.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.openml.org/d/40945", "version": "1.0.0", "distribution": [ diff --git a/datasets/1.0/wiki-text/metadata.json b/datasets/1.0/wiki-text/metadata.json index 5f4a70100..75cbaeb38 100644 --- a/datasets/1.0/wiki-text/metadata.json +++ b/datasets/1.0/wiki-text/metadata.json @@ -50,7 +50,7 @@ "description": "The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike License.\n\nCompared to the preprocessed version of Penn Treebank (PTB), WikiText-2 is over 2 times larger and WikiText-103 is over 110 times larger. The WikiText dataset also features a far larger vocabulary and retains the original case, punctuation and numbers - all of which are removed in PTB. As it is composed of full articles, the dataset is well suited for models that can take advantage of long term dependencies.", "citeAs": "@article{merity2016pointer, title={Pointer sentinel mixture models}, author={Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard}, journal={arXiv preprint arXiv:1609.07843}, year={2016} }", "license": "cc-by-sa-3.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/", "distribution": [ { diff --git a/datasets/1.0/world-happiness/metadata.json b/datasets/1.0/world-happiness/metadata.json index 2aea41f1d..693ba513a 100644 --- a/datasets/1.0/world-happiness/metadata.json +++ b/datasets/1.0/world-happiness/metadata.json @@ -49,7 +49,7 @@ "description": "Happiness scored according to economic production, social support, etc.", "citeAs": "None", "license": "cc0-1.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.kaggle.com/datasets/unsdsn/world-happiness", "distribution": [ { diff --git a/editor/cypress/fixtures/0.8/coco2014.json b/editor/cypress/fixtures/0.8/coco2014.json index 94ceb530c..2007832a7 100644 --- a/editor/cypress/fixtures/0.8/coco2014.json +++ b/editor/cypress/fixtures/0.8/coco2014.json @@ -42,7 +42,7 @@ "description": "COCO is a large-scale object detection, segmentation, and captioning dataset. WARNING: `metadata.json` is incomplete and does not fully define the COCO2014 dataset. It lacks `recordSet` definitions that would enable automatic loading of all the annotations.", "citation": "@article{DBLP:journals/corr/LinMBHPRDZ14,\n author = {Tsung{-}Yi Lin and\n Michael Maire and\n Serge J. Belongie and\n Lubomir D. Bourdev and\n Ross B. Girshick and\n James Hays and\n Pietro Perona and\n Deva Ramanan and\n Piotr Doll{'{a}}r and\n C. Lawrence Zitnick},\n title = {Microsoft {COCO:} Common Objects in Context},\n journal = {CoRR},\n volume = {abs/1405.0312},\n year = {2014},\n url = {http://arxiv.org/abs/1405.0312},\n archivePrefix = {arXiv},\n eprint = {1405.0312},\n timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}", "license": "cc-by-4.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://cocodataset.org/", "distribution": [ { diff --git a/editor/cypress/fixtures/0.8/titanic.json b/editor/cypress/fixtures/0.8/titanic.json index 6cb480dc2..3468faecd 100644 --- a/editor/cypress/fixtures/0.8/titanic.json +++ b/editor/cypress/fixtures/0.8/titanic.json @@ -42,7 +42,7 @@ "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citation": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "afl-3.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.openml.org/d/40945", "version": "1.0.0", "distribution": [ diff --git a/editor/cypress/fixtures/1.0/coco2014.json b/editor/cypress/fixtures/1.0/coco2014.json index 92d8ca07b..479674959 100644 --- a/editor/cypress/fixtures/1.0/coco2014.json +++ b/editor/cypress/fixtures/1.0/coco2014.json @@ -50,7 +50,7 @@ "description": "COCO is a large-scale object detection, segmentation, and captioning dataset. WARNING: `metadata.json` is incomplete and does not fully define the COCO2014 dataset. It lacks `recordSet` definitions that would enable automatic loading of all the annotations.", "citeAs": "@article{DBLP:journals/corr/LinMBHPRDZ14,\n author = {Tsung{-}Yi Lin and\n Michael Maire and\n Serge J. Belongie and\n Lubomir D. Bourdev and\n Ross B. Girshick and\n James Hays and\n Pietro Perona and\n Deva Ramanan and\n Piotr Doll{'{a}}r and\n C. Lawrence Zitnick},\n title = {Microsoft {COCO:} Common Objects in Context},\n journal = {CoRR},\n volume = {abs/1405.0312},\n year = {2014},\n url = {http://arxiv.org/abs/1405.0312},\n archivePrefix = {arXiv},\n eprint = {1405.0312},\n timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}", "license": "cc-by-4.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://cocodataset.org/", "distribution": [ { diff --git a/editor/cypress/fixtures/1.0/titanic.json b/editor/cypress/fixtures/1.0/titanic.json index ba80baf71..9b190192b 100644 --- a/editor/cypress/fixtures/1.0/titanic.json +++ b/editor/cypress/fixtures/1.0/titanic.json @@ -50,7 +50,7 @@ "description": "The original Titanic dataset, describing the status of individual passengers on the Titanic.\n\n The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. \n\n For more information about how this dataset was constructed: \nhttps://web.archive.org/web/20200802155940/http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt\n\nOther useful information (useful for prices description for example):\nhttp://campus.lakeforest.edu/frank/FILES/MLFfiles/Bio150/Titanic/TitanicMETA.pdf\n\n Also see the following article describing shortcomings of the dataset data:\nhttps://emma-stiefel.medium.com/plugging-holes-in-kaggles-titanic-dataset-an-introduction-to-combining-datasets-with-fuzzywuzzy-60a686699da7\n", "citeAs": "The principal source for data about Titanic passengers is the Encyclopedia Titanica (http://www.encyclopedia-titanica.org/). The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.\n\nThomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created.\n", "license": "afl-3.0", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.openml.org/d/40945", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_contained_in/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_contained_in/metadata.json index c19521fd9..2a0d998ae 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_contained_in/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_contained_in/metadata.json @@ -51,7 +51,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_type/metadata.json index e7dfda05a..948b0de7e 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_bad_type/metadata.json @@ -51,7 +51,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_name/metadata.json index 59129eabc..f969f6959 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_name/metadata.json @@ -51,7 +51,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_property_content_url/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_property_content_url/metadata.json index 46f1850c1..7b8a38dca 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_property_content_url/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/distribution_missing_property_content_url/metadata.json @@ -51,7 +51,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_bad_type/metadata.json index bcb22dc38..6ac1fc674 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_bad_type/metadata.json @@ -51,7 +51,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0" } diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_missing_property_name/metadata.json index a1101fae8..803aa6d42 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/metadata_missing_property_name/metadata.json @@ -50,7 +50,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0" } diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_source/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_source/metadata.json index a360a79a9..901a06504 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_source/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_source/metadata.json @@ -51,7 +51,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_type/metadata.json index feb519d9e..5b0abb3b4 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_bad_type/metadata.json @@ -51,7 +51,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_property_name/metadata.json index 7ef0467a9..2fe884eb3 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_property_name/metadata.json @@ -51,7 +51,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_source/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_source/metadata.json index 7ef83f18d..bdf821269 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_source/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/mlfield_missing_source/metadata.json @@ -50,7 +50,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_context_for_datatype/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_context_for_datatype/metadata.json index bc0f992cb..50981630c 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_context_for_datatype/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_context_for_datatype/metadata.json @@ -47,7 +47,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_property_name/metadata.json index f2b979ea5..495e02a13 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_missing_property_name/metadata.json @@ -51,7 +51,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_wrong_join/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_wrong_join/metadata.json index 43c60464f..07f48f2c4 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_wrong_join/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/0.8/recordset_wrong_join/metadata.json @@ -51,7 +51,7 @@ "citation": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_contained_in/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_contained_in/metadata.json index 02d6785af..5f4ed0f3b 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_contained_in/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_contained_in/metadata.json @@ -51,7 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_type/metadata.json index 6cef36bcd..75a5d0f67 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_bad_type/metadata.json @@ -51,7 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_name/metadata.json index d40cc500d..813daa3eb 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_name/metadata.json @@ -51,7 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_property_content_url/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_property_content_url/metadata.json index ecbdd0f50..d78dbda47 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_property_content_url/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/distribution_missing_property_content_url/metadata.json @@ -51,7 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_bad_type/metadata.json index be1206a33..113373e78 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_bad_type/metadata.json @@ -51,7 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0" } diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_missing_property_name/metadata.json index 13e4b6012..ceec6344d 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/metadata_missing_property_name/metadata.json @@ -50,7 +50,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0" } diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_source/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_source/metadata.json index 605462924..6c72c4696 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_source/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_source/metadata.json @@ -51,7 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_type/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_type/metadata.json index 0d5f5078d..a9bf7d7ca 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_type/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_bad_type/metadata.json @@ -51,7 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_property_name/metadata.json index bb876f63c..46f79474a 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_property_name/metadata.json @@ -51,7 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_source/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_source/metadata.json index fcd10bda6..ffd37ec7a 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_source/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/mlfield_missing_source/metadata.json @@ -50,7 +50,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_context_for_datatype/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_context_for_datatype/metadata.json index 05690683c..8b2dcfebc 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_context_for_datatype/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_context_for_datatype/metadata.json @@ -47,7 +47,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_property_name/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_property_name/metadata.json index 1826ae1f5..03ce19612 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_property_name/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_missing_property_name/metadata.json @@ -51,7 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_wrong_join/metadata.json b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_wrong_join/metadata.json index 2b60da216..f6c0a4ac0 100644 --- a/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_wrong_join/metadata.json +++ b/python/mlcroissant/mlcroissant/_src/tests/graphs/1.0/recordset_wrong_join/metadata.json @@ -51,7 +51,7 @@ "citeAs": "This is a citation.", "datePublished": "1990-02-01", "license": "This is a license.", - "sdLicense": "apache-2.0", + "sdLicense": "https://www.apache.org/licenses/LICENSE-2.0", "url": "https://www.google.com/dataset", "version": "1.0.0", "distribution": [ diff --git a/python/mlcroissant/mlcroissant/scripts/migrations/previous/202423021900.py b/python/mlcroissant/mlcroissant/scripts/migrations/previous/202423021900.py index b1c834c00..f75e7ca09 100644 --- a/python/mlcroissant/mlcroissant/scripts/migrations/previous/202423021900.py +++ b/python/mlcroissant/mlcroissant/scripts/migrations/previous/202423021900.py @@ -3,5 +3,5 @@ def up(json_ld): """Up migration to add sdLicense to all included datasets.""" - json_ld["sdLicense"] = "apache-2.0" + json_ld["sdLicense"] = "https://www.apache.org/licenses/LICENSE-2.0" return json_ld