Skip to content

Commit

Permalink
Migrate Source to the new dataclass_transform paradigm. (#590)
Browse files Browse the repository at this point in the history
  • Loading branch information
marcenacp authored Mar 11, 2024
1 parent 7cb999e commit 889143e
Show file tree
Hide file tree
Showing 37 changed files with 678 additions and 428 deletions.
6 changes: 1 addition & 5 deletions datasets/0.8/flores-200/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,7 @@
"@type": "sc:Dataset",
"name": "FLORES-200",
"description": "FLORES-200 is an evaluation benchmark for low-resource and multilingual machine translation",
"citation": [
"@article{nllb2022, author={NLLB Team, Marta R. Costa-juss\u00e0, James Cross, Onur \u00c7elebi, Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti, John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran, Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao, Vedanuj Goswami, Francisco Guzm\u00e1n, Philipp Koehn, Alexandre Mourachko, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Jeff Wang}, title={No Language Left Behind: Scaling Human-Centered Machine Translation}, year = {2022}}",
"@inproceedings{flores101, title={The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\\'{a}n, Francisco and Fan, Angela}, year={2021}}",
"@inproceedings{twoeval, title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, author={Guzm\\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, journal={arXiv preprint arXiv:1902.01382}, year={2019}}"
],
"citation": "@inproceedings{flores101, title={The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\\'{a}n, Francisco and Fan, Angela}, year={2021}}",
"license": "cc-by-sa-4.0",
"url": "https://github.com/facebookresearch/flores",
"version": "0.0.1",
Expand Down
2 changes: 1 addition & 1 deletion datasets/0.8/movielens/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@
"name": "genre",
"description": "A sequence of genres to which the rated movie belongs.",
"dataType": "sc:Text",
"repeated": "true",
"repeated": true,
"source": {
"distribution": "movies-table",
"extract": {
Expand Down
2 changes: 1 addition & 1 deletion datasets/0.8/wiki-text/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@
"name": "word",
"description": "A word.",
"dataType": "sc:Text",
"repeated": "true",
"repeated": true,
"source": {
"distribution": "token-files",
"extract": {
Expand Down
6 changes: 1 addition & 5 deletions datasets/1.0/flores-200/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,7 @@
"name": "FLORES-200",
"conformsTo": "http://mlcommons.org/croissant/1.0",
"description": "FLORES-200 is an evaluation benchmark for low-resource and multilingual machine translation",
"citeAs": [
"@article{nllb2022, author={NLLB Team, Marta R. Costa-juss\u00e0, James Cross, Onur \u00c7elebi, Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti, John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran, Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao, Vedanuj Goswami, Francisco Guzm\u00e1n, Philipp Koehn, Alexandre Mourachko, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Jeff Wang}, title={No Language Left Behind: Scaling Human-Centered Machine Translation}, year = {2022}}",
"@inproceedings{flores101, title={The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\\'{a}n, Francisco and Fan, Angela}, year={2021}}",
"@inproceedings{twoeval, title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, author={Guzm\\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, journal={arXiv preprint arXiv:1902.01382}, year={2019}}"
],
"citeAs": "@inproceedings{flores101, title={The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\\'{a}n, Francisco and Fan, Angela}, year={2021}}",
"license": "cc-by-sa-4.0",
"url": "https://github.com/facebookresearch/flores",
"version": "0.0.1",
Expand Down
2 changes: 1 addition & 1 deletion datasets/1.0/movielens/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@
"name": "genre",
"description": "A sequence of genres to which the rated movie belongs.",
"dataType": "sc:Text",
"repeated": "true",
"repeated": true,
"source": {
"fileObject": {
"@id": "movies-table"
Expand Down
2 changes: 1 addition & 1 deletion datasets/1.0/wiki-text/metadata.json
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@
"name": "word",
"description": "A word.",
"dataType": "sc:Text",
"repeated": "true",
"repeated": true,
"source": {
"fileSet": {
"@id": "token-files"
Expand Down
6 changes: 4 additions & 2 deletions editor/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
current_dir := $(dir $(abspath $(firstword $(MAKEFILE_LIST))))

black:
black \
docker run --rm --volume $(current_dir):/src --workdir /src pyfound/black:24.2.0 black \
--line-length 88 \
--preview \
--exclude '.*\/node_modules\/' \
.

isort:
Expand Down
3 changes: 1 addition & 2 deletions editor/core/record_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ def infer_record_sets(file: FileObject | FileSet, names: set[str]) -> list[Recor
fields = []
for column, value in file.df.dtypes.items():
source = mlc.Source(
id=file.id,
node_type="distribution",
distribution=file.id,
extract=mlc.Extract(column=column),
)
field = Field(
Expand Down
79 changes: 48 additions & 31 deletions editor/core/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,8 @@ def rename_distribution(self, old_name: str, new_name: str):
"""Renames a resource by changing all the references to this resource."""
# Update other resources:
for i, resource in enumerate(self.distribution):
if resource.id == old_name:
self.distribution[i].id = new_name
contained_in = resource.contained_in
if contained_in and old_name in contained_in:
self.distribution[i].contained_in = [
Expand All @@ -217,55 +219,69 @@ def rename_distribution(self, old_name: str, new_name: str):
def rename_record_set(self, old_name: str, new_name: str):
"""Renames a RecordSet by changing all the references to this RecordSet."""
for i, record_set in enumerate(self.record_sets):
if record_set.id == old_name:
self.record_sets[i].id = new_name
for j, field in enumerate(record_set.fields):
possible_uuid = f"{old_name}/"
# Update source
source = field.source
if (
source
and source.id
and (source.id.startswith(possible_uuid) or source.id == old_name)
):
new_uuid = source.id.replace(old_name, new_name, 1)
self.record_sets[i].fields[j].source.id = new_uuid
if source and source.field and source.field.startswith(possible_uuid):
new_uuid = source.field.replace(old_name, new_name, 1)
self.record_sets[i].fields[j].source.field = new_uuid
if source and source.file_object and source.file_object == old_name:
self.record_sets[i].fields[j].source.file_object = new_name
if source and source.file_set and source.file_set == old_name:
self.record_sets[i].fields[j].source.file_set = new_name
if source and source.distribution and source.distribution == old_name:
self.record_sets[i].fields[j].source.distribution = new_name
# Update references
references = field.references
if (
references
and references.id
and (
references.id.startswith(possible_uuid)
or references.id == old_name
)
and references.field
and references.field.startswith(possible_uuid)
):
new_uuid = references.id.replace(old_name, new_name, 1)
self.record_sets[i].fields[j].references.id = new_uuid
new_uuid = references.field.replace(old_name, new_name, 1)
self.record_sets[i].fields[j].references.field = new_uuid
if (
references
and references.file_object
and references.file_object == old_name
):
self.record_sets[i].fields[j].references.file_object = new_name
if (
references
and references.file_set
and references.file_set == old_name
):
self.record_sets[i].fields[j].references.file_set = new_name
if (
references
and references.distribution
and references.distribution == old_name
):
self.record_sets[i].fields[j].references.distribution = new_name

def rename_field(self, old_name: str, new_name: str):
"""Renames a field by changing all the references to this field."""
for i, record_set in enumerate(self.record_sets):
for j, field in enumerate(record_set.fields):
possible_uuid = f"/{old_name}"
# Update source
source = field.source
# The difference with RecordSet is the `.endswith` here:
if (
source
and source.id
and "/" in source.id
and source.id.endswith(old_name)
):
new_uuid = source.id.replace(old_name, new_name, 1)
self.record_sets[i].fields[j].source.id = new_uuid
if source and source.field and source.field.endswith(possible_uuid):
new_uuid = source.field.replace(old_name, new_name, 1)
self.record_sets[i].fields[j].source.field = new_uuid
# Update references
references = field.references
if (
references
and references.id
and "/" in references.id
and references.id.endswith(old_name)
and references.field
and references.field.endswith(possible_uuid)
):
new_uuid = references.id.replace(old_name, new_name, 1)
self.record_sets[i].fields[j].references.id = new_uuid
new_uuid = references.field.replace(old_name, new_name, 1)
self.record_sets[i].fields[j].references.field = new_uuid

def rename_id(self, old_id: str, new_id: str):
for resource in self.distribution:
Expand All @@ -281,10 +297,11 @@ def rename_id(self, old_id: str, new_id: str):
for field in record_set.fields:
if field.id == old_id:
field.id = new_id
if field.source and field.source.id == old_id:
field.source.id = new_id
if field.references and field.references.id == old_id:
field.references.id = new_id
for p in ["distribution", "field", "file_object", "file_set"]:
if field.source and getattr(field.source, p) == old_id:
setattr(field.source, p, new_id)
if field.references and getattr(field.references, p) == old_id:
setattr(field.references, p, new_id)

def add_distribution(self, distribution: FileSet | FileObject) -> None:
self.distribution.append(distribution)
Expand Down
32 changes: 32 additions & 0 deletions editor/core/state_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Tests for state."""

from etils import epath

import mlcroissant as mlc

from .state import Metadata


def test_rename_record_set():
    """Check that renames are propagated to the fields pointing at the renamed node.

    Loads the Titanic 1.0 fixture and then renames, in sequence, a RecordSet,
    a field and a distribution. The steps are order-dependent: every assertion
    expects the names produced by the renames that ran before it.
    """
    context = mlc.Context()
    fixture = epath.Path(__file__).parent.parent / "cypress/fixtures/1.0/titanic.json"
    canonical = mlc.Metadata.from_file(context, fixture)
    metadata = Metadata.from_canonical(canonical)

    # 1. RecordSet rename: the fixture state is checked first, then the rename
    #    must update both the RecordSet id and the reference that targets it.
    assert metadata.record_sets[0].id == "genders"
    assert metadata.record_sets[2].fields[1].id == "passengers/gender"
    assert metadata.record_sets[2].fields[1].references.field == "genders/label"
    metadata.rename_record_set("genders", "NEW_GENDERS")
    assert metadata.record_sets[0].id == "NEW_GENDERS"
    assert metadata.record_sets[2].fields[1].references.field == "NEW_GENDERS/label"

    # 2. Field rename: builds on the RecordSet name set in step 1.
    metadata.rename_field("label", "NEW_LABEL")
    assert (
        metadata.record_sets[2].fields[1].references.field
        == "NEW_GENDERS/NEW_LABEL"
    )

    # 3. Distribution rename: the source of a field extracted from the CSV must
    #    follow the new distribution name.
    assert metadata.record_sets[2].fields[0].id == "passengers/name"
    assert metadata.record_sets[2].fields[0].source.file_object == "passengers.csv"
    metadata.rename_distribution("passengers.csv", "NEW_PASSENGERS.CSV")
    assert (
        metadata.record_sets[2].fields[0].source.file_object == "NEW_PASSENGERS.CSV"
    )
2 changes: 1 addition & 1 deletion editor/cypress/e2e/displayErrors.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ VERSIONS.forEach((version) => {
fileName: fixture,
mimeType: "text/json",
};
cy.get("[data-testid='stFileUploadDropzone']").attachFile(file, {
cy.get("[data-testid='stFileUploader']").attachFile(file, {
force: true,
subjectType: "drag-n-drop",
events: ["dragenter", "drop"],
Expand Down
4 changes: 2 additions & 2 deletions editor/cypress/e2e/loadCroissant.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ VERSIONS.forEach((version) => {
fileName: fixture,
mimeType: "text/json",
};
cy.get("[data-testid='stFileUploadDropzone']").attachFile(file, {
cy.get("[data-testid='stFileUploader']").attachFile(file, {
force: true,
subjectType: "drag-n-drop",
events: ["dragenter", "drop"],
Expand All @@ -39,7 +39,7 @@ VERSIONS.forEach((version) => {
fileName: fixture,
mimeType: "text/json",
};
cy.get("[data-testid='stFileUploadDropzone']").attachFile(file, {
cy.get("[data-testid='stFileUploader']").attachFile(file, {
force: true,
subjectType: "drag-n-drop",
events: ["dragenter", "drop"],
Expand Down
2 changes: 1 addition & 1 deletion editor/cypress/e2e/renameDistribution.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ VERSIONS.forEach((version) => {
fileName: "titanic.json",
mimeType: "text/json",
};
cy.get("[data-testid='stFileUploadDropzone']").attachFile(file, {
cy.get("[data-testid='stFileUploader']").attachFile(file, {
force: true,
subjectType: "drag-n-drop",
events: ["dragenter", "drop"],
Expand Down
4 changes: 2 additions & 2 deletions editor/cypress/e2e/uploadCsv.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ describe('Editor loads a local CSV as a resource', () => {
fileName: 'base.csv', mimeType: 'text/csv',
}
cy.get(
"[data-testid='stFileUploadDropzone']",
"[data-testid='stFileUploader']",
).attachFile(file, {
force: true,
subjectType: "drag-n-drop",
events: ["dragenter", "drop"],
})
})
cy.get('.uploadedFileData').contains('base.csv')
cy.get('.stFileUploaderFileData').contains('base.csv')
cy.get('button').contains('Upload').click()
// The file is uploaded, so we can click on it to see the details.
// Waiting a few seconds to wait for the resource to download.
Expand Down
10 changes: 6 additions & 4 deletions editor/events/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,9 @@ def handle_field_change(
elif change == FieldEvent.DATA_TYPE:
field.data_types = [str_to_mlc_data_type(value)]
elif change == FieldEvent.SOURCE:
node_type = "field" if "/" in value else "distribution"
source = mlc.Source(uid=value, node_type=node_type)
source = (
mlc.Source(field=value) if "/" in value else mlc.Source(file_object=value)
)
field.source = source
elif change == FieldEvent.SOURCE_EXTRACT:
source = field.source
Expand Down Expand Up @@ -138,8 +139,9 @@ def handle_field_change(
if number is not None and number < len(field.source.transforms):
field.source.transforms[number] = mlc.Transform(separator=value)
elif change == FieldEvent.REFERENCE:
node_type = "field" if "/" in value else "distribution"
source = mlc.Source(uid=value, node_type=node_type)
source = (
mlc.Source(field=value) if "/" in value else mlc.Source(file_object=value)
)
field.references = source
elif change == FieldEvent.REFERENCE_EXTRACT:
source = field.references
Expand Down
3 changes: 1 addition & 2 deletions editor/views/jsonld.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ def render_jsonld():
description=field["description"],
data_types=field["data_type"],
source=mlc.Source(
uid=file.name,
node_type="distribution",
distribution=file.name,
extract=mlc.Extract(column=field["name"]),
),
)
Expand Down
12 changes: 7 additions & 5 deletions editor/views/record_sets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,17 @@ def test_find_joins():
Field(
id="field1",
name="field1",
source=mlc.Source(id="some_csv", extract=mlc.Extract(column="some_column")),
references=mlc.Source(id="some_record_set/some_field"),
source=mlc.Source(
file_object="some_csv", extract=mlc.Extract(column="some_column")
),
references=mlc.Source(field="some_record_set/some_field"),
),
Field(id="field2", name="field2", source=mlc.Source(id="foo/bar")),
Field(id="field2", name="field2", source=mlc.Source(field="foo/bar")),
Field(
id="field3",
name="field3",
source=mlc.Source(id="some_record_set/some_field"),
references=mlc.Source(id="some_other_record_set/some_other_field"),
source=mlc.Source(field="some_record_set/some_field"),
references=mlc.Source(field="some_other_record_set/some_other_field"),
),
]
assert _find_joins(fields) == set(
Expand Down
4 changes: 2 additions & 2 deletions editor/views/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def render_source(
on_change=handle_field_change,
args=(FieldEvent.SOURCE, field, key),
)
if source.node_type == "distribution":
if source.distribution or source.file_object or source.file_set:
extract = col2.selectbox(
needed_field("Extract"),
index=_get_extract_index(source),
Expand Down Expand Up @@ -310,7 +310,7 @@ def render_references(
on_change=handle_field_change,
args=(FieldEvent.REFERENCE, field, key),
)
if references.node_type == "distribution":
if references.distribution or references.file_object or references.file_set:
key = f"{key}-extract-references"
extract = col2.selectbox(
needed_field("Extract the reference"),
Expand Down
Loading

0 comments on commit 889143e

Please sign in to comment.