Replicate dataclass logic for FileSet and RecordSet. (#577)

mlcommons · Mar 4, 2024 · 93374aa
1 parent 39eb1dc
commit 93374aa
Show file tree

Hide file tree

Showing 7 changed files with 218 additions and 184 deletions.
diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/base_node.py b/python/mlcroissant/mlcroissant/_src/structure_graph/base_node.py
@@ -9,9 +9,15 @@
 
 from mlcroissant._src.core import constants
 from mlcroissant._src.core.context import Context
+from mlcroissant._src.core.data_types import check_expected_type
+from mlcroissant._src.core.dataclasses import jsonld_fields
 from mlcroissant._src.core.issues import Issues
+from mlcroissant._src.core.json_ld import box_singleton_list
+from mlcroissant._src.core.json_ld import remove_empty_values
+from mlcroissant._src.core.json_ld import unbox_singleton_list
 from mlcroissant._src.core.types import Json
 from mlcroissant._src.core.uuid import generate_uuid
+from mlcroissant._src.core.uuid import uuid_from_jsonld
 
 NAME_REGEX = "[a-zA-Z0-9\\-_\\.]+"
 _MAX_NAME_LENGTH = 255
@@ -270,3 +276,50 @@ def __deepcopy__(self, memo):
         copy = self.__class__(**kwargs)  # pytype: disable=not-instantiable
         memo[id(self)] = copy
         return copy
+
+
+class NodeV2(Node):
+    """Extends Node. When the migration is complete, merge `Node` and `NodeV2`."""
+
+    def to_json(self) -> Json:
+        """Converts the Python class to JSON."""
+        cls = self.__class__
+        jsonld = {
+            "@type": self.ctx.rdf.shorten_value(cls._JSONLD_TYPE(self.ctx)),
+            "@id": None if self.ctx.is_v0() else self.id,
+        }
+        for field in jsonld_fields(self):
+            url = field.call_url(self.ctx)
+            key = url.split("/")[-1]
+            value = getattr(self, field.name)
+            value = field.call_to_jsonld(self.ctx, value)
+            if field.cardinality == "MANY" and field.name != "fields":
+                value = unbox_singleton_list(value)
+            jsonld[key] = value
+        return remove_empty_values(jsonld)
+
+    @classmethod
+    def from_jsonld(cls, ctx: Context, jsonld: Json):
+        """Creates a Python class from JSON-LD."""
+        check_expected_type(ctx.issues, jsonld, cls._JSONLD_TYPE(ctx))
+        kwargs = {}
+        for field in jsonld_fields(cls):
+            url = field.call_url(ctx)
+            value = jsonld.get(url)
+            value = field.call_from_jsonld(ctx, value)
+            if field.cardinality == "MANY":
+                value = box_singleton_list(value)
+            if value:
+                kwargs[field.name] = value
+        # Normalize name to be at least an empty str:
+        kwargs["name"] = kwargs.get("name", "")
+        return cls(
+            ctx=ctx,
+            id=uuid_from_jsonld(jsonld),
+            **kwargs,
+        )
+
+    @classmethod
+    def _JSONLD_TYPE(cls, ctx: Context):
+        del ctx
+        raise NotImplementedError("Output the right JSON-LD type.")
diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/field.py
@@ -150,6 +150,8 @@ def to_json(self) -> Json:
     @classmethod
     def from_jsonld(cls, ctx: Context, field: Json) -> Field:
         """Creates a `Field` from JSON-LD."""
+        if isinstance(field, list):
+            return [cls.from_jsonld(ctx, f) for f in field]
         check_expected_type(
             ctx.issues,
             field,

diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_object.py
@@ -9,42 +9,36 @@
 from mlcroissant._src.core import constants
 from mlcroissant._src.core.constants import ML_COMMONS
 from mlcroissant._src.core.context import Context
-from mlcroissant._src.core.data_types import check_expected_type
 from mlcroissant._src.core.dataclasses import jsonld_field
-from mlcroissant._src.core.dataclasses import jsonld_fields
 from mlcroissant._src.core.dataclasses import JsonldField
-from mlcroissant._src.core.json_ld import box_singleton_list
-from mlcroissant._src.core.json_ld import remove_empty_values
-from mlcroissant._src.core.json_ld import unbox_singleton_list
-from mlcroissant._src.core.types import Json
 from mlcroissant._src.core.uuid import formatted_uuid_to_json
 from mlcroissant._src.core.uuid import uuid_from_jsonld
-from mlcroissant._src.structure_graph.base_node import Node
+from mlcroissant._src.structure_graph.base_node import NodeV2
 from mlcroissant._src.structure_graph.nodes.source import Source
 
 OriginalField = dataclasses.Field
 dataclasses.Field = JsonldField  # type: ignore
 
 
 @dataclasses.dataclass(eq=False, repr=False)
-class FileObject(Node):
+class FileObject(NodeV2):
     """Nodes to describe a dataset FileObject (distribution)."""
 
     # pytype: disable=annotation-type-mismatch
     content_url: str | None = jsonld_field(
         default=None,
         description=(
-            "Actual bytes of the media object, for example the image file or"
-            " video file."
+            "Actual bytes of the media object, for example the image file or video"
+            " file."
         ),
         input_types=[SDO.URL],
         url=SDO.contentUrl,
     )
     content_size: str | None = jsonld_field(
         default=None,
         description=(
-            "File size in (mega/kilo/…)bytes. Defaults to bytes if a unit is"
-            " not specified."
+            "File size in (mega/kilo/...)bytes. Defaults to bytes if a unit is not"
+            " specified."
         ),
         input_types=[SDO.Text],
         url=SDO.contentSize,
@@ -53,10 +47,10 @@ class FileObject(Node):
         cardinality="MANY",
         default_factory=list,
         description=(
-            "Another FileObject or FileSet that this one is contained in, e.g.,"
-            " in the case of a file extracted from an archive. When this"
-            " property is present, the contentUrl is evaluated as a relative"
-            " path within the container object"
+            "Another FileObject or FileSet that this one is contained in, e.g., in the"
+            " case of a file extracted from an archive. When this property is present,"
+            " the contentUrl is evaluated as a relative path within the container"
+            " object"
         ),
         from_jsonld=lambda ctx, contained_in: uuid_from_jsonld(contained_in),
         input_types=[SDO.Text],
@@ -83,11 +77,11 @@ class FileObject(Node):
         url=lambda ctx: ML_COMMONS(ctx).md5,
     )
     name: str = jsonld_field(
-        default=None,
+        default="",
         description=(
-            "The name of the file.  As much as possible, the name should"
-            " reflect the name of the file as downloaded, including the file"
-            " extension. e.g. “images.zip”."
+            "The name of the file.  As much as possible, the name should reflect the"
+            " name of the file as downloaded, including the file extension. e.g."
+            ' "images.zip".'
         ),
         input_types=[SDO.Text],
         required=True,
@@ -97,8 +91,8 @@ class FileObject(Node):
         cardinality="MANY",
         default_factory=list,
         description=(
-            "URL (or local name) of a FileObject with the same content, but in"
-            " a different format."
+            "URL (or local name) of a FileObject with the same content, but in a"
+            " different format."
         ),
         input_types=[SDO.URL],
         url=SDO.sameAs,
@@ -133,45 +127,5 @@ def _JSONLD_TYPE(cls, ctx: Context):
         """Gets the class' JSON-LD @type."""
         return constants.SCHEMA_ORG_FILE_OBJECT(ctx)
 
-    # [Proposal] This method would move to `Node`, as it's now generic.
-    def to_json(self) -> Json:
-        """Converts the `FileObject` to JSON."""
-        cls = self.__class__
-        jsonld = {
-            "@type": self.ctx.rdf.shorten_value(cls._JSONLD_TYPE(self.ctx)),
-            "@id": None if self.ctx.is_v0() else self.id,
-        }
-        for field in jsonld_fields(self):
-            url = field.call_url(self.ctx)
-            key = url.split("/")[-1]
-            value = getattr(self, field.name)
-            value = field.call_to_jsonld(self.ctx, value)
-            if field.cardinality == "MANY":
-                value = unbox_singleton_list(value)
-            jsonld[key] = value
-        return remove_empty_values(jsonld)
-
-    # [Proposal] This method would move to `Node`, as it's now generic.
-    @classmethod
-    def from_jsonld(cls, ctx: Context, jsonld: Json) -> FileObject:
-        """Creates a `FileObject` from JSON-LD."""
-        check_expected_type(ctx.issues, jsonld, cls._JSONLD_TYPE(ctx))
-        kwargs = {}
-        for field in jsonld_fields(cls):
-            url = field.call_url(ctx)
-            value = jsonld.get(url)
-            value = field.call_from_jsonld(ctx, value)
-            if field.cardinality == "MANY":
-                value = box_singleton_list(value)
-            if value:
-                kwargs[field.name] = value
-        # Normalize name to be at least an empty str:
-        kwargs["name"] = kwargs.get("name", "")
-        return cls(
-            ctx=ctx,
-            id=uuid_from_jsonld(jsonld),
-            **kwargs,
-        )
-
 
 dataclasses.Field = OriginalField  # type: ignore
diff --git a/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py b/python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py
@@ -3,80 +3,90 @@
 from __future__ import annotations
 
 import dataclasses
-from typing import Any
+
+from rdflib.namespace import SDO
 
 from mlcroissant._src.core import constants
 from mlcroissant._src.core.context import Context
-from mlcroissant._src.core.data_types import check_expected_type
-from mlcroissant._src.core.json_ld import box_singleton_list
-from mlcroissant._src.core.json_ld import remove_empty_values
-from mlcroissant._src.core.json_ld import unbox_singleton_list
-from mlcroissant._src.core.types import Json
+from mlcroissant._src.core.dataclasses import jsonld_field
+from mlcroissant._src.core.dataclasses import JsonldField
+from mlcroissant._src.core.uuid import formatted_uuid_to_json
 from mlcroissant._src.core.uuid import uuid_from_jsonld
-from mlcroissant._src.structure_graph.base_node import Node
+from mlcroissant._src.structure_graph.base_node import NodeV2
+
+OriginalField = dataclasses.Field
+dataclasses.Field = JsonldField  # type: ignore
 
 
 @dataclasses.dataclass(eq=False, repr=False)
-class FileSet(Node):
+class FileSet(NodeV2):
     """Nodes to describe a dataset FileSet (distribution)."""
 
-    contained_in: list[str] | None = None
-    description: str | None = None
-    encoding_format: str | None = None
-    excludes: list[str] | None = None
-    includes: list[str] | None = None
-    name: str = ""
+    # pytype: disable=annotation-type-mismatch
+    contained_in: list[str] | None = jsonld_field(
+        cardinality="MANY",
+        default_factory=list,
+        description=(
+            "Another FileObject or FileSet that this one is contained in, e.g., in the"
+            " case of a file extracted from an archive. When this property is present,"
+            " the contentUrl is evaluated as a relative path within the container"
+            " object"
+        ),
+        from_jsonld=lambda ctx, contained_in: uuid_from_jsonld(contained_in),
+        input_types=[SDO.Text],
+        to_jsonld=lambda ctx, contained_in: [
+            formatted_uuid_to_json(ctx, uuid) for uuid in contained_in
+        ],
+        url=SDO.containedIn,
+    )
+    description: str | None = jsonld_field(
+        default=None,
+        input_types=[SDO.Text],
+        url=SDO.description,
+    )
+    encoding_format: str | None = jsonld_field(
+        default=None,
+        description="The format of the file, given as a mime type.",
+        input_types=[SDO.Text],
+        required=True,
+        url=SDO.encodingFormat,
+    )
+    excludes: list[str] | None = jsonld_field(
+        cardinality="MANY",
+        default=None,
+        description="A glob pattern that specifies the files to exclude.",
+        input_types=[SDO.Text],
+        url=lambda ctx: constants.ML_COMMONS_EXCLUDES(ctx),
+    )
+    includes: list[str] | None = jsonld_field(
+        cardinality="MANY",
+        default=None,
+        description="A glob pattern that specifies the files to include.",
+        input_types=[SDO.Text],
+        url=lambda ctx: constants.ML_COMMONS_INCLUDES(ctx),
+    )
+    name: str = jsonld_field(
+        default="",
+        description=(
+            "The name of the file.  As much as possible, the name should reflect the"
+            " name of the file as downloaded, including the file extension. e.g."
+            ' "images.zip".'
+        ),
+        input_types=[SDO.Text],
+        url=SDO.name,
+    )
+    # pytype: enable=annotation-type-mismatch
 
     def __post_init__(self):
         """Checks arguments of the node."""
         uuid_field = "name" if self.ctx.is_v0() else "id"
         self.validate_name()
         self.assert_has_mandatory_properties("includes", "encoding_format", uuid_field)
 
-    def to_json(self) -> Json:
-        """Converts the `FileSet` to JSON."""
-        contained_in: Any = self.contained_in
-        if not self.ctx.is_v0() and contained_in:
-            contained_in = [{"@id": uuid} for uuid in contained_in]
-        contained_in = unbox_singleton_list(contained_in)
-
-        return remove_empty_values({
-            "@type": "sc:FileSet" if self.ctx.is_v0() else "cr:FileSet",
-            "@id": None if self.ctx.is_v0() else self.uuid,
-            "name": self.name,
-            "description": self.description,
-            "containedIn": contained_in,
-            "encodingFormat": self.encoding_format,
-            "excludes": unbox_singleton_list(self.excludes),
-            "includes": unbox_singleton_list(self.includes),
-        })
-
     @classmethod
-    def from_jsonld(
-        cls,
-        ctx: Context,
-        file_set: Json,
-    ) -> FileSet:
-        """Creates a `FileSet` from JSON-LD."""
-        check_expected_type(ctx.issues, file_set, constants.SCHEMA_ORG_FILE_SET(ctx))
+    def _JSONLD_TYPE(cls, ctx: Context):
+        """Gets the class' JSON-LD @type."""
+        return constants.SCHEMA_ORG_FILE_SET(ctx)
 
-        contained_in = box_singleton_list(
-            file_set.get(constants.SCHEMA_ORG_CONTAINED_IN)
-        )
-        if contained_in is not None and not ctx.is_v0():
-            contained_in = [uuid_from_jsonld(source) for source in contained_in]
 
-        return cls(
-            ctx=ctx,
-            contained_in=contained_in,
-            description=file_set.get(constants.SCHEMA_ORG_DESCRIPTION),
-            encoding_format=file_set.get(constants.SCHEMA_ORG_ENCODING_FORMAT),
-            excludes=box_singleton_list(
-                file_set.get(constants.ML_COMMONS_EXCLUDES(ctx))
-            ),
-            includes=box_singleton_list(
-                file_set.get(constants.ML_COMMONS_INCLUDES(ctx))
-            ),
-            name=file_set.get(constants.SCHEMA_ORG_NAME, ""),
-            id=uuid_from_jsonld(file_set),
-        )
+dataclasses.Field = OriginalField  # type: ignore