Skip to content

Commit

Permalink
Replicate dataclass logic for FileSet and RecordSet. (#577)
Browse files Browse the repository at this point in the history
  • Loading branch information
marcenacp authored Mar 4, 2024
1 parent 39eb1dc commit 93374aa
Show file tree
Hide file tree
Showing 7 changed files with 218 additions and 184 deletions.
53 changes: 53 additions & 0 deletions python/mlcroissant/mlcroissant/_src/structure_graph/base_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,15 @@

from mlcroissant._src.core import constants
from mlcroissant._src.core.context import Context
from mlcroissant._src.core.data_types import check_expected_type
from mlcroissant._src.core.dataclasses import jsonld_fields
from mlcroissant._src.core.issues import Issues
from mlcroissant._src.core.json_ld import box_singleton_list
from mlcroissant._src.core.json_ld import remove_empty_values
from mlcroissant._src.core.json_ld import unbox_singleton_list
from mlcroissant._src.core.types import Json
from mlcroissant._src.core.uuid import generate_uuid
from mlcroissant._src.core.uuid import uuid_from_jsonld

NAME_REGEX = "[a-zA-Z0-9\\-_\\.]+"
_MAX_NAME_LENGTH = 255
Expand Down Expand Up @@ -270,3 +276,50 @@ def __deepcopy__(self, memo):
copy = self.__class__(**kwargs) # pytype: disable=not-instantiable
memo[id(self)] = copy
return copy


class NodeV2(Node):
    """Generic JSON-LD (de)serialization shared by dataclass-based nodes.

    Extends `Node`. When the migration is complete, merge `Node` and `NodeV2`.
    Subclasses are expected to override `_JSONLD_TYPE`; the base version raises.
    """

    def to_json(self) -> Json:
        """Serializes this node to a JSON dict.

        The output starts with `@type` (the shortened value of the class'
        `_JSONLD_TYPE`) and `@id` (`None` when `ctx.is_v0()`, else `self.id`),
        followed by one entry per JSON-LD field of the node.
        """
        ctx = self.ctx
        serialized = {
            "@type": ctx.rdf.shorten_value(self.__class__._JSONLD_TYPE(ctx)),
            "@id": None if ctx.is_v0() else self.id,
        }
        for spec in jsonld_fields(self):
            # The JSON key is the last path segment of the field's URL.
            json_key = spec.call_url(ctx).split("/")[-1]
            converted = spec.call_to_jsonld(ctx, getattr(self, spec.name))
            # Fields literally named "fields" are excluded from unboxing.
            unboxable = spec.cardinality == "MANY" and spec.name != "fields"
            if unboxable:
                converted = unbox_singleton_list(converted)
            serialized[json_key] = converted
        return remove_empty_values(serialized)

    @classmethod
    def from_jsonld(cls, ctx: Context, jsonld: Json):
        """Builds an instance of `cls` from a JSON-LD dict.

        Args:
            ctx: Context whose `issues` are passed to the `@type` check and
                which is forwarded to every field converter.
            jsonld: JSON-LD dict, looked up by each field's URL.

        Returns:
            `cls(ctx=ctx, id=uuid_from_jsonld(jsonld), **fields)`, where
            `fields` holds only the truthy converted values plus `name`.
        """
        check_expected_type(ctx.issues, jsonld, cls._JSONLD_TYPE(ctx))
        parsed = {}
        for spec in jsonld_fields(cls):
            raw = jsonld.get(spec.call_url(ctx))
            converted = spec.call_from_jsonld(ctx, raw)
            if spec.cardinality == "MANY":
                converted = box_singleton_list(converted)
            if not converted:
                # Falsy values are left out of the constructor arguments.
                continue
            parsed[spec.name] = converted
        # Normalize name to be at least an empty str:
        parsed.setdefault("name", "")
        return cls(ctx=ctx, id=uuid_from_jsonld(jsonld), **parsed)

    @classmethod
    def _JSONLD_TYPE(cls, ctx: Context):
        """Gets the class' JSON-LD @type; this base version always raises."""
        del ctx  # Unused here.
        raise NotImplementedError("Output the right JSON-LD type.")
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ def to_json(self) -> Json:
@classmethod
def from_jsonld(cls, ctx: Context, field: Json) -> Field:
"""Creates a `Field` from JSON-LD."""
if isinstance(field, list):
return [cls.from_jsonld(ctx, f) for f in field]
check_expected_type(
ctx.issues,
field,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,42 +9,36 @@
from mlcroissant._src.core import constants
from mlcroissant._src.core.constants import ML_COMMONS
from mlcroissant._src.core.context import Context
from mlcroissant._src.core.data_types import check_expected_type
from mlcroissant._src.core.dataclasses import jsonld_field
from mlcroissant._src.core.dataclasses import jsonld_fields
from mlcroissant._src.core.dataclasses import JsonldField
from mlcroissant._src.core.json_ld import box_singleton_list
from mlcroissant._src.core.json_ld import remove_empty_values
from mlcroissant._src.core.json_ld import unbox_singleton_list
from mlcroissant._src.core.types import Json
from mlcroissant._src.core.uuid import formatted_uuid_to_json
from mlcroissant._src.core.uuid import uuid_from_jsonld
from mlcroissant._src.structure_graph.base_node import Node
from mlcroissant._src.structure_graph.base_node import NodeV2
from mlcroissant._src.structure_graph.nodes.source import Source

OriginalField = dataclasses.Field
dataclasses.Field = JsonldField # type: ignore


@dataclasses.dataclass(eq=False, repr=False)
class FileObject(Node):
class FileObject(NodeV2):
"""Nodes to describe a dataset FileObject (distribution)."""

# pytype: disable=annotation-type-mismatch
content_url: str | None = jsonld_field(
default=None,
description=(
"Actual bytes of the media object, for example the image file or"
" video file."
"Actual bytes of the media object, for example the image file or video"
" file."
),
input_types=[SDO.URL],
url=SDO.contentUrl,
)
content_size: str | None = jsonld_field(
default=None,
description=(
"File size in (mega/kilo/)bytes. Defaults to bytes if a unit is"
" not specified."
"File size in (mega/kilo/...)bytes. Defaults to bytes if a unit is not"
" specified."
),
input_types=[SDO.Text],
url=SDO.contentSize,
Expand All @@ -53,10 +47,10 @@ class FileObject(Node):
cardinality="MANY",
default_factory=list,
description=(
"Another FileObject or FileSet that this one is contained in, e.g.,"
" in the case of a file extracted from an archive. When this"
" property is present, the contentUrl is evaluated as a relative"
" path within the container object"
"Another FileObject or FileSet that this one is contained in, e.g., in the"
" case of a file extracted from an archive. When this property is present,"
" the contentUrl is evaluated as a relative path within the container"
" object"
),
from_jsonld=lambda ctx, contained_in: uuid_from_jsonld(contained_in),
input_types=[SDO.Text],
Expand All @@ -83,11 +77,11 @@ class FileObject(Node):
url=lambda ctx: ML_COMMONS(ctx).md5,
)
name: str = jsonld_field(
default=None,
default="",
description=(
"The name of the file. As much as possible, the name should"
" reflect the name of the file as downloaded, including the file"
" extension. e.g. “images.zip”."
"The name of the file. As much as possible, the name should reflect the"
" name of the file as downloaded, including the file extension. e.g."
' "images.zip".'
),
input_types=[SDO.Text],
required=True,
Expand All @@ -97,8 +91,8 @@ class FileObject(Node):
cardinality="MANY",
default_factory=list,
description=(
"URL (or local name) of a FileObject with the same content, but in"
" a different format."
"URL (or local name) of a FileObject with the same content, but in a"
" different format."
),
input_types=[SDO.URL],
url=SDO.sameAs,
Expand Down Expand Up @@ -133,45 +127,5 @@ def _JSONLD_TYPE(cls, ctx: Context):
"""Gets the class' JSON-LD @type."""
return constants.SCHEMA_ORG_FILE_OBJECT(ctx)

# [Proposal] This method would move to `Node`, as it's now generic.
def to_json(self) -> Json:
"""Converts the `FileObject` to JSON."""
cls = self.__class__
jsonld = {
"@type": self.ctx.rdf.shorten_value(cls._JSONLD_TYPE(self.ctx)),
"@id": None if self.ctx.is_v0() else self.id,
}
for field in jsonld_fields(self):
url = field.call_url(self.ctx)
key = url.split("/")[-1]
value = getattr(self, field.name)
value = field.call_to_jsonld(self.ctx, value)
if field.cardinality == "MANY":
value = unbox_singleton_list(value)
jsonld[key] = value
return remove_empty_values(jsonld)

# [Proposal] This method would move to `Node`, as it's now generic.
@classmethod
def from_jsonld(cls, ctx: Context, jsonld: Json) -> FileObject:
"""Creates a `FileObject` from JSON-LD."""
check_expected_type(ctx.issues, jsonld, cls._JSONLD_TYPE(ctx))
kwargs = {}
for field in jsonld_fields(cls):
url = field.call_url(ctx)
value = jsonld.get(url)
value = field.call_from_jsonld(ctx, value)
if field.cardinality == "MANY":
value = box_singleton_list(value)
if value:
kwargs[field.name] = value
# Normalize name to be at least an empty str:
kwargs["name"] = kwargs.get("name", "")
return cls(
ctx=ctx,
id=uuid_from_jsonld(jsonld),
**kwargs,
)


dataclasses.Field = OriginalField # type: ignore
126 changes: 68 additions & 58 deletions python/mlcroissant/mlcroissant/_src/structure_graph/nodes/file_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,80 +3,90 @@
from __future__ import annotations

import dataclasses
from typing import Any

from rdflib.namespace import SDO

from mlcroissant._src.core import constants
from mlcroissant._src.core.context import Context
from mlcroissant._src.core.data_types import check_expected_type
from mlcroissant._src.core.json_ld import box_singleton_list
from mlcroissant._src.core.json_ld import remove_empty_values
from mlcroissant._src.core.json_ld import unbox_singleton_list
from mlcroissant._src.core.types import Json
from mlcroissant._src.core.dataclasses import jsonld_field
from mlcroissant._src.core.dataclasses import JsonldField
from mlcroissant._src.core.uuid import formatted_uuid_to_json
from mlcroissant._src.core.uuid import uuid_from_jsonld
from mlcroissant._src.structure_graph.base_node import Node
from mlcroissant._src.structure_graph.base_node import NodeV2

OriginalField = dataclasses.Field
dataclasses.Field = JsonldField # type: ignore


@dataclasses.dataclass(eq=False, repr=False)
class FileSet(Node):
class FileSet(NodeV2):
"""Nodes to describe a dataset FileSet (distribution)."""

contained_in: list[str] | None = None
description: str | None = None
encoding_format: str | None = None
excludes: list[str] | None = None
includes: list[str] | None = None
name: str = ""
# pytype: disable=annotation-type-mismatch
contained_in: list[str] | None = jsonld_field(
cardinality="MANY",
default_factory=list,
description=(
"Another FileObject or FileSet that this one is contained in, e.g., in the"
" case of a file extracted from an archive. When this property is present,"
" the contentUrl is evaluated as a relative path within the container"
" object"
),
from_jsonld=lambda ctx, contained_in: uuid_from_jsonld(contained_in),
input_types=[SDO.Text],
to_jsonld=lambda ctx, contained_in: [
formatted_uuid_to_json(ctx, uuid) for uuid in contained_in
],
url=SDO.containedIn,
)
description: str | None = jsonld_field(
default=None,
input_types=[SDO.Text],
url=SDO.description,
)
encoding_format: str | None = jsonld_field(
default=None,
description="The format of the file, given as a mime type.",
input_types=[SDO.Text],
required=True,
url=SDO.encodingFormat,
)
excludes: list[str] | None = jsonld_field(
cardinality="MANY",
default=None,
description="A glob pattern that specifies the files to exclude.",
input_types=[SDO.Text],
url=lambda ctx: constants.ML_COMMONS_EXCLUDES(ctx),
)
includes: list[str] | None = jsonld_field(
cardinality="MANY",
default=None,
description="A glob pattern that specifies the files to include.",
input_types=[SDO.Text],
url=lambda ctx: constants.ML_COMMONS_INCLUDES(ctx),
)
name: str = jsonld_field(
default="",
description=(
"The name of the file. As much as possible, the name should reflect the"
" name of the file as downloaded, including the file extension. e.g."
' "images.zip".'
),
input_types=[SDO.Text],
url=SDO.name,
)
# pytype: enable=annotation-type-mismatch

def __post_init__(self):
"""Checks arguments of the node."""
uuid_field = "name" if self.ctx.is_v0() else "id"
self.validate_name()
self.assert_has_mandatory_properties("includes", "encoding_format", uuid_field)

def to_json(self) -> Json:
"""Converts the `FileSet` to JSON."""
contained_in: Any = self.contained_in
if not self.ctx.is_v0() and contained_in:
contained_in = [{"@id": uuid} for uuid in contained_in]
contained_in = unbox_singleton_list(contained_in)

return remove_empty_values({
"@type": "sc:FileSet" if self.ctx.is_v0() else "cr:FileSet",
"@id": None if self.ctx.is_v0() else self.uuid,
"name": self.name,
"description": self.description,
"containedIn": contained_in,
"encodingFormat": self.encoding_format,
"excludes": unbox_singleton_list(self.excludes),
"includes": unbox_singleton_list(self.includes),
})

@classmethod
def from_jsonld(
cls,
ctx: Context,
file_set: Json,
) -> FileSet:
"""Creates a `FileSet` from JSON-LD."""
check_expected_type(ctx.issues, file_set, constants.SCHEMA_ORG_FILE_SET(ctx))
def _JSONLD_TYPE(cls, ctx: Context):
"""Gets the class' JSON-LD @type."""
return constants.SCHEMA_ORG_FILE_SET(ctx)

contained_in = box_singleton_list(
file_set.get(constants.SCHEMA_ORG_CONTAINED_IN)
)
if contained_in is not None and not ctx.is_v0():
contained_in = [uuid_from_jsonld(source) for source in contained_in]

return cls(
ctx=ctx,
contained_in=contained_in,
description=file_set.get(constants.SCHEMA_ORG_DESCRIPTION),
encoding_format=file_set.get(constants.SCHEMA_ORG_ENCODING_FORMAT),
excludes=box_singleton_list(
file_set.get(constants.ML_COMMONS_EXCLUDES(ctx))
),
includes=box_singleton_list(
file_set.get(constants.ML_COMMONS_INCLUDES(ctx))
),
name=file_set.get(constants.SCHEMA_ORG_NAME, ""),
id=uuid_from_jsonld(file_set),
)
dataclasses.Field = OriginalField # type: ignore
Loading

0 comments on commit 93374aa

Please sign in to comment.