Migrate Source to the new dataclass_transform paradigm. (#590)

mlcommons · Mar 11, 2024 · 889143e · 889143e
1 parent 7cb999e
commit 889143e
Show file tree

Hide file tree

Showing 37 changed files with 678 additions and 428 deletions.
diff --git a/datasets/0.8/flores-200/metadata.json b/datasets/0.8/flores-200/metadata.json
@@ -39,11 +39,7 @@
   "@type": "sc:Dataset",
   "name": "FLORES-200",
   "description": "FLORES-200 is an evaluation benchmark for low-resource and multilingual machine translation",
-  "citation": [
-    "@article{nllb2022, author={NLLB Team, Marta R. Costa-juss\u00e0, James Cross, Onur \u00c7elebi, Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi,  Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti, John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran, Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao, Vedanuj Goswami, Francisco Guzm\u00e1n, Philipp Koehn, Alexandre Mourachko, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Jeff Wang}, title={No Language Left Behind: Scaling Human-Centered Machine Translation}, year = {2022}}",
-    "@inproceedings{flores101, title={The FLORES-101  Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\\'{a}n, Francisco and Fan, Angela}, year={2021}}",
-    "@inproceedings{twoeval, title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, author={Guzm\\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, journal={arXiv preprint arXiv:1902.01382}, year={2019}}"
-  ],
+  "citation": "@inproceedings{flores101, title={The FLORES-101  Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\\'{a}n, Francisco and Fan, Angela}, year={2021}}",
   "license": "cc-by-sa-4.0",
   "url": "https://github.com/facebookresearch/flores",
   "version": "0.0.1",

diff --git a/datasets/0.8/movielens/metadata.json b/datasets/0.8/movielens/metadata.json
@@ -125,7 +125,7 @@
           "name": "genre",
           "description": "A sequence of genres to which the rated movie belongs.",
           "dataType": "sc:Text",
-          "repeated": "true",
+          "repeated": true,
           "source": {
             "distribution": "movies-table",
             "extract": {

diff --git a/datasets/0.8/wiki-text/metadata.json b/datasets/0.8/wiki-text/metadata.json
@@ -112,7 +112,7 @@
           "name": "word",
           "description": "A word.",
           "dataType": "sc:Text",
-          "repeated": "true",
+          "repeated": true,
           "source": {
             "distribution": "token-files",
             "extract": {

diff --git a/datasets/1.0/flores-200/metadata.json b/datasets/1.0/flores-200/metadata.json
@@ -47,11 +47,7 @@
   "name": "FLORES-200",
   "conformsTo": "http://mlcommons.org/croissant/1.0",
   "description": "FLORES-200 is an evaluation benchmark for low-resource and multilingual machine translation",
-  "citeAs": [
-    "@article{nllb2022, author={NLLB Team, Marta R. Costa-juss\u00e0, James Cross, Onur \u00c7elebi, Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi,  Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti, John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran, Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao, Vedanuj Goswami, Francisco Guzm\u00e1n, Philipp Koehn, Alexandre Mourachko, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Jeff Wang}, title={No Language Left Behind: Scaling Human-Centered Machine Translation}, year = {2022}}",
-    "@inproceedings{flores101, title={The FLORES-101  Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\\'{a}n, Francisco and Fan, Angela}, year={2021}}",
-    "@inproceedings{twoeval, title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, author={Guzm\\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, journal={arXiv preprint arXiv:1902.01382}, year={2019}}"
-  ],
+  "citeAs": "@inproceedings{flores101, title={The FLORES-101  Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\\'{a}n, Francisco and Fan, Angela}, year={2021}}",
   "license": "cc-by-sa-4.0",
   "url": "https://github.com/facebookresearch/flores",
   "version": "0.0.1",

diff --git a/datasets/1.0/movielens/metadata.json b/datasets/1.0/movielens/metadata.json
@@ -160,7 +160,7 @@
           "name": "genre",
           "description": "A sequence of genres to which the rated movie belongs.",
           "dataType": "sc:Text",
-          "repeated": "true",
+          "repeated": true,
           "source": {
             "fileObject": {
               "@id": "movies-table"

diff --git a/datasets/1.0/wiki-text/metadata.json b/datasets/1.0/wiki-text/metadata.json
@@ -134,7 +134,7 @@
           "name": "word",
           "description": "A word.",
           "dataType": "sc:Text",
-          "repeated": "true",
+          "repeated": true,
           "source": {
             "fileSet": {
               "@id": "token-files"

diff --git a/editor/Makefile b/editor/Makefile
@@ -1,7 +1,9 @@
+current_dir := $(dir $(abspath $(firstword $(MAKEFILE_LIST))))
+
 black:
-	black \
+	docker run --rm --volume $(current_dir):/src --workdir /src pyfound/black:24.2.0 black \
 		--line-length 88 \
-		--preview \
+		--exclude '.*\/node_modules\/' \
 	.
 
 isort:

diff --git a/editor/core/record_sets.py b/editor/core/record_sets.py
@@ -18,8 +18,7 @@ def infer_record_sets(file: FileObject | FileSet, names: set[str]) -> list[Recor
     fields = []
     for column, value in file.df.dtypes.items():
         source = mlc.Source(
-            id=file.id,
-            node_type="distribution",
+            distribution=file.id,
             extract=mlc.Extract(column=column),
         )
         field = Field(

diff --git a/editor/core/state.py b/editor/core/state.py
@@ -206,6 +206,8 @@ def rename_distribution(self, old_name: str, new_name: str):
         """Renames a resource by changing all the references to this resource."""
         # Update other resources:
         for i, resource in enumerate(self.distribution):
+            if resource.id == old_name:
+                self.distribution[i].id = new_name
             contained_in = resource.contained_in
             if contained_in and old_name in contained_in:
                 self.distribution[i].contained_in = [
@@ -217,55 +219,69 @@ def rename_distribution(self, old_name: str, new_name: str):
     def rename_record_set(self, old_name: str, new_name: str):
         """Renames a RecordSet by changing all the references to this RecordSet."""
         for i, record_set in enumerate(self.record_sets):
+            if record_set.id == old_name:
+                self.record_sets[i].id = new_name
             for j, field in enumerate(record_set.fields):
                 possible_uuid = f"{old_name}/"
                 # Update source
                 source = field.source
-                if (
-                    source
-                    and source.id
-                    and (source.id.startswith(possible_uuid) or source.id == old_name)
-                ):
-                    new_uuid = source.id.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].source.id = new_uuid
+                if source and source.field and source.field.startswith(possible_uuid):
+                    new_uuid = source.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].source.field = new_uuid
+                if source and source.file_object and source.file_object == old_name:
+                    self.record_sets[i].fields[j].source.file_object = new_name
+                if source and source.file_set and source.file_set == old_name:
+                    self.record_sets[i].fields[j].source.file_set = new_name
+                if source and source.distribution and source.distribution == old_name:
+                    self.record_sets[i].fields[j].source.distribution = new_name
                 # Update references
                 references = field.references
                 if (
                     references
-                    and references.id
-                    and (
-                        references.id.startswith(possible_uuid)
-                        or references.id == old_name
-                    )
+                    and references.field
+                    and references.field.startswith(possible_uuid)
                 ):
-                    new_uuid = references.id.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].references.id = new_uuid
+                    new_uuid = references.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].references.field = new_uuid
+                if (
+                    references
+                    and references.file_object
+                    and references.file_object == old_name
+                ):
+                    self.record_sets[i].fields[j].references.file_object = new_name
+                if (
+                    references
+                    and references.file_set
+                    and references.file_set == old_name
+                ):
+                    self.record_sets[i].fields[j].references.file_set = new_name
+                if (
+                    references
+                    and references.distribution
+                    and references.distribution == old_name
+                ):
+                    self.record_sets[i].fields[j].references.distribution = new_name
 
     def rename_field(self, old_name: str, new_name: str):
         """Renames a field by changing all the references to this field."""
         for i, record_set in enumerate(self.record_sets):
             for j, field in enumerate(record_set.fields):
+                possible_uuid = f"/{old_name}"
                 # Update source
                 source = field.source
                 # The difference with RecordSet is the `.endswith` here:
-                if (
-                    source
-                    and source.id
-                    and "/" in source.id
-                    and source.id.endswith(old_name)
-                ):
-                    new_uuid = source.id.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].source.id = new_uuid
+                if source and source.field and source.field.endswith(possible_uuid):
+                    new_uuid = source.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].source.field = new_uuid
                 # Update references
                 references = field.references
                 if (
                     references
-                    and references.id
-                    and "/" in references.id
-                    and references.id.endswith(old_name)
+                    and references.field
+                    and references.field.endswith(possible_uuid)
                 ):
-                    new_uuid = references.id.replace(old_name, new_name, 1)
-                    self.record_sets[i].fields[j].references.id = new_uuid
+                    new_uuid = references.field.replace(old_name, new_name, 1)
+                    self.record_sets[i].fields[j].references.field = new_uuid
 
     def rename_id(self, old_id: str, new_id: str):
         for resource in self.distribution:
@@ -281,10 +297,11 @@ def rename_id(self, old_id: str, new_id: str):
             for field in record_set.fields:
                 if field.id == old_id:
                     field.id = new_id
-                if field.source and field.source.id == old_id:
-                    field.source.id = new_id
-                if field.references and field.references.id == old_id:
-                    field.references.id = new_id
+                for p in ["distribution", "field", "file_object", "file_set"]:
+                    if field.source and getattr(field.source, p) == old_id:
+                        setattr(field.source, p, new_id)
+                    if field.references and getattr(field.references, p) == old_id:
+                        setattr(field.references, p, new_id)
 
     def add_distribution(self, distribution: FileSet | FileObject) -> None:
         self.distribution.append(distribution)

diff --git a/editor/core/state_test.py b/editor/core/state_test.py
@@ -0,0 +1,32 @@
+"""Tests for state."""
+
+from etils import epath
+
+import mlcroissant as mlc
+
+from .state import Metadata
+
+
+def test_rename_record_set():
+    ctx = mlc.Context()
+    path = epath.Path(__file__).parent.parent / "cypress/fixtures/1.0/titanic.json"
+    canonical_metadata = mlc.Metadata.from_file(ctx, path)
+    metadata = Metadata.from_canonical(canonical_metadata)
+
+    # Rename RecordSet:
+    assert metadata.record_sets[0].id == "genders"
+    assert metadata.record_sets[2].fields[1].id == "passengers/gender"
+    assert metadata.record_sets[2].fields[1].references.field == "genders/label"
+    metadata.rename_record_set("genders", "NEW_GENDERS")
+    assert metadata.record_sets[0].id == "NEW_GENDERS"
+    assert metadata.record_sets[2].fields[1].references.field == "NEW_GENDERS/label"
+
+    # Rename Field:
+    metadata.rename_field("label", "NEW_LABEL")
+    assert metadata.record_sets[2].fields[1].references.field == "NEW_GENDERS/NEW_LABEL"
+
+    # Rename Distribution:
+    assert metadata.record_sets[2].fields[0].id == "passengers/name"
+    assert metadata.record_sets[2].fields[0].source.file_object == "passengers.csv"
+    metadata.rename_distribution("passengers.csv", "NEW_PASSENGERS.CSV")
+    assert metadata.record_sets[2].fields[0].source.file_object == "NEW_PASSENGERS.CSV"
diff --git a/editor/cypress/e2e/displayErrors.cy.js b/editor/cypress/e2e/displayErrors.cy.js
@@ -16,7 +16,7 @@ VERSIONS.forEach((version) => {
           fileName: fixture,
           mimeType: "text/json",
         };
-        cy.get("[data-testid='stFileUploadDropzone']").attachFile(file, {
+        cy.get("[data-testid='stFileUploader']").attachFile(file, {
           force: true,
           subjectType: "drag-n-drop",
           events: ["dragenter", "drop"],

diff --git a/editor/cypress/e2e/loadCroissant.cy.js b/editor/cypress/e2e/loadCroissant.cy.js
@@ -18,7 +18,7 @@ VERSIONS.forEach((version) => {
           fileName: fixture,
           mimeType: "text/json",
         };
-        cy.get("[data-testid='stFileUploadDropzone']").attachFile(file, {
+        cy.get("[data-testid='stFileUploader']").attachFile(file, {
           force: true,
           subjectType: "drag-n-drop",
           events: ["dragenter", "drop"],
@@ -39,7 +39,7 @@ VERSIONS.forEach((version) => {
           fileName: fixture,
           mimeType: "text/json",
         };
-        cy.get("[data-testid='stFileUploadDropzone']").attachFile(file, {
+        cy.get("[data-testid='stFileUploader']").attachFile(file, {
           force: true,
           subjectType: "drag-n-drop",
           events: ["dragenter", "drop"],

diff --git a/editor/cypress/e2e/renameDistribution.cy.js b/editor/cypress/e2e/renameDistribution.cy.js
@@ -17,7 +17,7 @@ VERSIONS.forEach((version) => {
           fileName: "titanic.json",
           mimeType: "text/json",
         };
-        cy.get("[data-testid='stFileUploadDropzone']").attachFile(file, {
+        cy.get("[data-testid='stFileUploader']").attachFile(file, {
           force: true,
           subjectType: "drag-n-drop",
           events: ["dragenter", "drop"],

diff --git a/editor/cypress/e2e/uploadCsv.cy.js b/editor/cypress/e2e/uploadCsv.cy.js
@@ -26,14 +26,14 @@ describe('Editor loads a local CSV as a resource', () => {
         fileName: 'base.csv', mimeType: 'text/csv',
       }
       cy.get(
-        "[data-testid='stFileUploadDropzone']",
+        "[data-testid='stFileUploader']",
       ).attachFile(file, {
         force: true,
         subjectType: "drag-n-drop",
         events: ["dragenter", "drop"],
       })
     })
-    cy.get('.uploadedFileData').contains('base.csv')
+    cy.get('.stFileUploaderFileData').contains('base.csv')
     cy.get('button').contains('Upload').click()
     // The file is uploaded, so we can click on it to see the details.
     // Waiting a few seconds to wait for the resource to download.

diff --git a/editor/events/fields.py b/editor/events/fields.py
@@ -98,8 +98,9 @@ def handle_field_change(
     elif change == FieldEvent.DATA_TYPE:
         field.data_types = [str_to_mlc_data_type(value)]
     elif change == FieldEvent.SOURCE:
-        node_type = "field" if "/" in value else "distribution"
-        source = mlc.Source(uid=value, node_type=node_type)
+        source = (
+            mlc.Source(field=value) if "/" in value else mlc.Source(file_object=value)
+        )
         field.source = source
     elif change == FieldEvent.SOURCE_EXTRACT:
         source = field.source
@@ -138,8 +139,9 @@ def handle_field_change(
         if number is not None and number < len(field.source.transforms):
             field.source.transforms[number] = mlc.Transform(separator=value)
     elif change == FieldEvent.REFERENCE:
-        node_type = "field" if "/" in value else "distribution"
-        source = mlc.Source(uid=value, node_type=node_type)
+        source = (
+            mlc.Source(field=value) if "/" in value else mlc.Source(file_object=value)
+        )
         field.references = source
     elif change == FieldEvent.REFERENCE_EXTRACT:
         source = field.references

diff --git a/editor/views/jsonld.py b/editor/views/jsonld.py
@@ -33,8 +33,7 @@ def render_jsonld():
                         description=field["description"],
                         data_types=field["data_type"],
                         source=mlc.Source(
-                            uid=file.name,
-                            node_type="distribution",
+                            distribution=file.name,
                             extract=mlc.Extract(column=field["name"]),
                         ),
                     )

diff --git a/editor/views/record_sets_test.py b/editor/views/record_sets_test.py
@@ -8,15 +8,17 @@ def test_find_joins():
         Field(
             id="field1",
             name="field1",
-            source=mlc.Source(id="some_csv", extract=mlc.Extract(column="some_column")),
-            references=mlc.Source(id="some_record_set/some_field"),
+            source=mlc.Source(
+                file_object="some_csv", extract=mlc.Extract(column="some_column")
+            ),
+            references=mlc.Source(field="some_record_set/some_field"),
         ),
-        Field(id="field2", name="field2", source=mlc.Source(id="foo/bar")),
+        Field(id="field2", name="field2", source=mlc.Source(field="foo/bar")),
         Field(
             id="field3",
             name="field3",
-            source=mlc.Source(id="some_record_set/some_field"),
-            references=mlc.Source(id="some_other_record_set/some_other_field"),
+            source=mlc.Source(field="some_record_set/some_field"),
+            references=mlc.Source(field="some_other_record_set/some_other_field"),
         ),
     ]
     assert _find_joins(fields) == set(

diff --git a/editor/views/source.py b/editor/views/source.py
@@ -140,7 +140,7 @@ def render_source(
         on_change=handle_field_change,
         args=(FieldEvent.SOURCE, field, key),
     )
-    if source.node_type == "distribution":
+    if source.distribution or source.file_object or source.file_set:
         extract = col2.selectbox(
             needed_field("Extract"),
             index=_get_extract_index(source),
@@ -310,7 +310,7 @@ def render_references(
             on_change=handle_field_change,
             args=(FieldEvent.REFERENCE, field, key),
         )
-        if references.node_type == "distribution":
+        if references.distribution or references.file_object or references.file_set:
             key = f"{key}-extract-references"
             extract = col2.selectbox(
                 needed_field("Extract the reference"),