Skip to content

Commit

Permalink
Update names (for 0.8) and IDs (for 1.0) in the editor. (#570)
Browse files Browse the repository at this point in the history
  • Loading branch information
marcenacp authored Feb 27, 2024
1 parent 57f57db commit 452f7a5
Show file tree
Hide file tree
Showing 15 changed files with 249 additions and 132 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -262,3 +262,38 @@ jobs:

- name: PyTest
run: make pytest

editor-e2e-test:
name: Editor End-to-End Tests
runs-on: ubuntu-latest
defaults:
run:
working-directory: ./editor
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.11'

- name: Install library
run: pip install -r requirements.txt

- name: Install mlcroissant
run: sudo apt-get install -y libgraphviz-dev && pip install .[dev]
working-directory: ./python/mlcroissant

- uses: cypress-io/github-action@v6
with:
start: streamlit run app.py
wait-on: 'http://localhost:8501'
working-directory: ./editor

- name: Upload screenshots
uses: actions/upload-artifact@v3
if: failure()
with:
name: cypress-screenshots
path: ./editor/cypress/screenshots
retention-days: 1
14 changes: 10 additions & 4 deletions editor/core/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,8 +183,10 @@ def file_from_url(url: str, names: set[str], folder: epath.Path) -> FileObject:
sha256 = _sha256(file.read())
file_type = guess_file_type(file_path)
df = get_dataframe(file_type, file_path)
name = find_unique_name(names, url.split("/")[-1])
return FileObject(
name=find_unique_name(names, url.split("/")[-1]),
id=name,
name=name,
description="",
content_url=url,
encoding_format=file_type.encoding_format,
Expand All @@ -206,8 +208,10 @@ def file_from_upload(
f.write(value)
file_type = guess_file_type(file_path)
df = get_dataframe(file_type, file)
name = find_unique_name(names, file.name)
return FileObject(
name=find_unique_name(names, file.name),
id=name,
name=name,
description="",
content_url=content_url,
encoding_format=file_type.encoding_format,
Expand All @@ -222,9 +226,11 @@ def file_from_form(
) -> FileObject | FileSet:
"""Creates a file based on manually added fields."""
if type == FILE_OBJECT:
return FileObject(name=find_unique_name(names, "file_object"), folder=folder)
name = find_unique_name(names, "file_object")
return FileObject(id=name, name=name, folder=folder)
elif type == FILE_SET:
return FileSet(name=find_unique_name(names, "file_set"))
name = find_unique_name(names, "file_set")
return FileSet(id=name, name=name)
else:
raise ValueError("type has to be one of FILE_OBJECT, FILE_SET")

Expand Down
5 changes: 4 additions & 1 deletion editor/core/record_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,19 @@ def infer_record_sets(file: FileObject | FileSet, names: set[str]) -> list[Recor
extract=mlc.Extract(column=column),
)
field = Field(
id=column,
name=column,
data_types=[convert_dtype(value)],
source=source,
references=mlc.Source(),
)
fields.append(field)
name = find_unique_name(names, file.name + "_record_set")
return [
RecordSet(
id=name,
fields=fields,
name=find_unique_name(names, file.name + "_record_set"),
name=name,
description="",
)
]
74 changes: 48 additions & 26 deletions editor/core/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import dataclasses
import datetime
from typing import Any
import uuid

from etils import epath
import pandas as pd
Expand All @@ -33,9 +34,6 @@ def create_class(mlc_class: type, instance: Any, **kwargs) -> Any:
name = field.name
if hasattr(instance, name) and name not in kwargs:
params[name] = getattr(instance, name)
if "uuid" in params and params.get("uuid") is None:
# Let mlcroissant handle the default value
del params["uuid"]
return mlc_class(**params, **kwargs)


Expand Down Expand Up @@ -127,11 +125,22 @@ class SelectedRecordSet:


@dataclasses.dataclass
class FileObject:
"""FileObject analogue for editor"""

class Node:
    """Base attributes shared by the editor's node analogues.

    Subclasses inherit the mlcroissant context together with the optional
    `id` and `name` identifiers.
    """

    ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
    id: str | None = None
    name: str | None = None

    def get_name_or_id(self):
        """Return `name` when the context is v0, otherwise `id`."""
        return self.name if self.ctx.is_v0() else self.id


@dataclasses.dataclass
class FileObject(Node):
"""FileObject analogue for editor"""

description: str | None = None
contained_in: list[str] | None = dataclasses.field(default_factory=list)
content_size: str | None = None
Expand All @@ -140,65 +149,51 @@ class FileObject:
sha256: str | None = None
df: pd.DataFrame | None = None
folder: epath.PathLike | None = None
id: str | None = None


@dataclasses.dataclass
class FileSet:
class FileSet(Node):
"""FileSet analogue for editor"""

ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
contained_in: list[str] = dataclasses.field(default_factory=list)
description: str | None = None
encoding_format: str | None = ""
includes: str | None = ""
name: str = ""
id: str | None = None


@dataclasses.dataclass
class Field:
class Field(Node):
"""Field analogue for editor"""

ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
name: str | None = None
description: str | None = None
data_types: str | list[str] | None = None
source: mlc.Source | None = None
references: mlc.Source | None = None
id: str | None = None


@dataclasses.dataclass
class RecordSet:
class RecordSet(Node):
"""Record Set analogue for editor"""

ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
name: str = ""
data: list[Any] | None = None
description: str | None = None
is_enumeration: bool | None = None
key: str | list[str] | None = None
fields: list[Field] = dataclasses.field(default_factory=list)
id: str | None = None


@dataclasses.dataclass
class Metadata:
class Metadata(Node):
"""main croissant data object, helper functions exist to load and unload this into the mlcroissant version"""

name: str = ""
description: str | None = None
cite_as: str | None = None
context: dict = dataclasses.field(default_factory=dict)
creators: list[mlc.PersonOrOrganization] = dataclasses.field(default_factory=list)
ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
data_biases: str | None = None
data_collection: str | None = None
date_published: datetime.datetime | None = None
license: str | None = ""
personal_sensitive_information: str | None = None
id: str | None = None
url: str = ""
distribution: list[FileObject | FileSet] = dataclasses.field(default_factory=list)
record_sets: list[RecordSet] = dataclasses.field(default_factory=list)
Expand Down Expand Up @@ -272,6 +267,25 @@ def rename_field(self, old_name: str, new_name: str):
new_uuid = references.id.replace(old_name, new_name, 1)
self.record_sets[i].fields[j].references.id = new_uuid

def rename_id(self, old_id: str, new_id: str) -> None:
    """Replace every exact occurrence of `old_id` with `new_id`, in place.

    The following locations are rewritten:
      - the `id` of each resource in `self.distribution`, and any matching
        entry of its `contained_in` list;
      - the `id` of each record set in `self.record_sets`;
      - the `id` of each field of those record sets, and the `id` of the
        field's `source` and `references` when they are set.

    Args:
        old_id: The identifier to look for.
        new_id: The identifier that replaces it.
    """
    for resource in self.distribution:
        if resource.id == old_id:
            resource.id = new_id
        if resource.contained_in and old_id in resource.contained_in:
            # The loop variable is not called `uuid` so that it does not
            # shadow the `uuid` module imported by this file.
            resource.contained_in = [
                new_id if parent_id == old_id else parent_id
                for parent_id in resource.contained_in
            ]
    for record_set in self.record_sets:
        if record_set.id == old_id:
            record_set.id = new_id
        for field in record_set.fields:
            if field.id == old_id:
                field.id = new_id
            if field.source and field.source.id == old_id:
                field.source.id = new_id
            if field.references and field.references.id == old_id:
                field.references.id = new_id

def add_distribution(self, distribution: FileSet | FileObject) -> None:
self.distribution.append(distribution)

Expand Down Expand Up @@ -352,8 +366,16 @@ def from_canonical(cls, canonical_metadata: mlc.Metadata) -> Metadata:
)

def names(self) -> set[str]:
    """Return the identifiers already taken by the nodes of this metadata.

    Collects the value of `get_name_or_id()` for every resource in
    `self.distribution`, every record set in `self.record_sets`, and every
    field of those record sets.

    Returns:
        The set of collected identifiers (empty when there are no nodes).
    """
    taken: set[str] = set()
    for resource in self.distribution:
        taken.add(resource.get_name_or_id())
    for record_set in self.record_sets:
        taken.add(record_set.get_name_or_id())
        # Fields are included as well, so they take part in uniqueness checks.
        taken.update(field.get_name_or_id() for field in record_set.fields)
    return taken


class OpenTab:
Expand Down
71 changes: 38 additions & 33 deletions editor/cypress/e2e/createManually.cy.js
Original file line number Diff line number Diff line change
@@ -1,42 +1,47 @@
/// <reference types="cypress" />

import 'cypress-file-upload';
import 'cypress-iframe';
import "cypress-file-upload";
import "cypress-iframe";

describe("Create a resource manually", () => {
it("should allow adding a FileObject resource", () => {
cy.visit("http://localhost:8501");
cy.get("button").contains("Create").click();
cy.get('input[aria-label="Name:red[*]"]').type("MyDataset{enter}");
cy.contains("Croissant files are composed of three layers:");
cy.enter('[title="components.tabs.tabs_component"]').then((getBody) => {
getBody().contains("Metadata").click();
});
cy.get('input[aria-label="URL"]').type("https://mydataset.com{enter}", {
force: true,
});

describe('Create a resource manually', () => {
it('should allow adding a FileObject resource', () => {
// Streamlit starts on :8501.
cy.visit('http://localhost:8501')
cy.get('button').contains('Create').click()
cy.get('input[aria-label="Name:red[*]"]').type('MyDataset{enter}')
cy.contains("Croissant files are composed of three layers:")
cy.enter('[title="components.tabs.tabs_component"]').then(getBody => {
getBody().contains('Metadata').click()
})
cy.get('input[aria-label="URL"]').type('https://mydataset.com{enter}', {force: true})

// Create a resource manually.
cy.enter('[title="components.tabs.tabs_component"]').then(getBody => {
getBody().contains('Resources').click()
})
cy.get('[data-testid="stMarkdownContainer"]').contains('Add manually').click()
cy.get('button').contains('Upload').click()

cy.enter('[title="components.tabs.tabs_component"]').then((getBody) => {
getBody().contains("Resources").click();
});
cy.get('[data-testid="stMarkdownContainer"]')
.contains("Add manually")
.click();
cy.get("button").contains("Upload").click();

// The file is created, so we can click on it to see the details.
cy.enter('[title="components.tree.tree_component"]').then(getBody => {
getBody().contains('file_object').click()
})
cy.enter('[title="components.tree.tree_component"]').then((getBody) => {
getBody().contains("file_object").click();
});
// We can edit it
cy.get('input[aria-label="Name:red[*]"]').type('{selectall}{backspace}test.csv{enter}')
cy.wait(1000)
cy.enter('[title="components.tree.tree_component"]').then(getBody => {
getBody().contains('test.csv').click()
})
cy.get('input[aria-label="SHA256:red[*]"]').type('abcdefgh1234567{enter}')
cy.get('input[aria-label="ID:red[*]"]').type(
"{selectall}{backspace}test.csv{enter}"
);
cy.wait(1000);
cy.enter('[title="components.tree.tree_component"]').then((getBody) => {
getBody().contains("test.csv").click();
});
cy.get('input[aria-label="SHA256:red[*]"]').type("abcdefgh1234567{enter}");

cy.get('input[aria-label="SHA256:red[*]"]')
.should('have.value', 'abcdefgh1234567')
})
})
cy.get('input[aria-label="SHA256:red[*]"]').should(
"have.value",
"abcdefgh1234567"
);
});
});
3 changes: 2 additions & 1 deletion editor/cypress/e2e/displayErrors.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ VERSIONS.forEach((version) => {
cy.contains("annotations (4 fields)");
cy.contains("split_enums (2 fields)").click();
cy.contains("Generating the dataset...").should("not.exist");
cy.get('input[aria-label="Name:red[*]"][value="split_enums"]')
const input = version == "0.8" ? "Name" : "ID"
cy.get(`input[aria-label="${input}:red[*]"][value="split_enums"]`)
.should("be.visible")
.type("{selectall}{backspace}{enter}");
cy.wait(2000);
Expand Down
Loading

0 comments on commit 452f7a5

Please sign in to comment.