Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into benjelloun-patch-2
Browse files Browse the repository at this point in the history
  • Loading branch information
benjelloun authored Mar 6, 2024
2 parents bda9207 + e0d07f7 commit 271a205
Show file tree
Hide file tree
Showing 35 changed files with 1,022 additions and 662 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,6 @@ The Task Force is open to anyone (as is the parent [Datasets working group](http
The Task Force is co-chaired by [Omar Benjelloun](mailto:[email protected]) and [Elena Simperl](mailto:[email protected]).

## Contributors
Albert Villanova (Hugging Face), Andrew Zaldivar (Google), Baishan Guo (Meta), Carole Jean-Wu (Meta), Ce Zhang (ETH Zurich), Costanza Conforti (Google), D. Sculley (Kaggle), Dan Brickley (Schema.Org), Eduardo Arino de la Rubia (Meta), Edward Lockhart (Deepmind), Elena Simperl (King's College London), Goeff Thomas (Kaggle), Joaquin Vanschoren (TU/Eindhoven, OpenML), Jos van der Velde (TU/Eindhoven, OpenML), Julien Chaumond (Hugging Face), Kurt Bollacker (MLCommons), Lora Aroyo (Google), Luis Oala (Dotphoton), Meg Risdal (Kaggle), Natasha Noy (Google), Newsha Ardalani (Meta), Omar Benjelloun (Google), Peter Mattson (MLCommons), Pierre Marcenac (Google), Pierre Ruyssen (Google), Pieter Gijsbers (TU/Eindhoven, OpenML), Prabhant Singh (TU/Eindhoven, OpenML), Quentin Lhoest (Hugging Face), Steffen Vogler (Bayer), Taniya Das (TU/Eindhoven, OpenML)
Albert Villanova (Hugging Face), Andrew Zaldivar (Google), Baishan Guo (Meta), Carole Jean-Wu (Meta), Ce Zhang (ETH Zurich), Costanza Conforti (Google), D. Sculley (Kaggle), Dan Brickley (Schema.Org), Eduardo Arino de la Rubia (Meta), Edward Lockhart (Deepmind), Elena Simperl (King's College London), Goeff Thomas (Kaggle), Joan Giner-Miguelez (UOC), Joaquin Vanschoren (TU/Eindhoven, OpenML), Jos van der Velde (TU/Eindhoven, OpenML), Julien Chaumond (Hugging Face), Kurt Bollacker (MLCommons), Lora Aroyo (Google), Luis Oala (Dotphoton), Meg Risdal (Kaggle), Natasha Noy (Google), Newsha Ardalani (Meta), Omar Benjelloun (Google), Peter Mattson (MLCommons), Pierre Marcenac (Google), Pierre Ruyssen (Google), Pieter Gijsbers (TU/Eindhoven, OpenML), Prabhant Singh (TU/Eindhoven, OpenML), Quentin Lhoest (Hugging Face), Steffen Vogler (Bayer), Taniya Das (TU/Eindhoven, OpenML)

Thank you for supporting Croissant! 🙂
9 changes: 6 additions & 3 deletions datasets/1.0/recipes/enum.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"dataBiases": "cr:dataBiases",
"dataCollection": "cr:dataCollection",
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
Expand All @@ -31,7 +30,6 @@
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"personalSensitiveInformation": "cr:personalSensitiveInformation",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
Expand All @@ -48,6 +46,11 @@
"conformsTo": "http://mlcommons.org/croissant/1.0",
"description": "This is a fairly minimal example, showing a way to describe enumerations.",
"url": "https://example.com/datasets/enum/about",
"rai:dataCollection": "This is how data is collected",
"rai:dataBiases": [
"Bias 1 in data",
"Bias 2 in data"
],
"distribution": [
{
"@type": "cr:FileObject",
Expand Down
26 changes: 26 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,32 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
ML_COMMONS_SUB_FIELD_TYPE = lambda ctx: ML_COMMONS(ctx).SubField
ML_COMMONS_TRANSFORM = lambda ctx: ML_COMMONS(ctx).transform

# Croissant RAI extension
# V1.0 namespace
RAI = rdflib.Namespace("http://mlcommons.org/croissant/RAI/")
# Attributes of the Responsible AI (RAI) vocabulary. Each constant resolves to
# a term under http://mlcommons.org/croissant/RAI/ and is used as a JSON-LD
# property URI, so every local name must match the RAI specification exactly.
ML_COMMONS_RAI_DATA_COLLECTION = RAI.dataCollection
ML_COMMONS_RAI_DATA_COLLECTION_TYPE = RAI.dataCollectionType
ML_COMMONS_RAI_DATA_COLLECTION_TYPE_OTHERS = RAI.dataCollectionTypeOthers
ML_COMMONS_RAI_DATA_COLLECTION_MISSING = RAI.dataCollectionMissing
ML_COMMONS_RAI_DATA_COLLECTION_RAW = RAI.dataCollectionRaw
ML_COMMONS_RAI_DATA_COLLECTION_TIMEFRAME_START = RAI.dataCollectionTimeFrameStart
ML_COMMONS_RAI_DATA_COLLECTION_TIMEFRAME_END = RAI.dataCollectionTimeFrameEnd
ML_COMMONS_RAI_DATA_PREPROCESSING_IMPUTATION = RAI.dataPreprocessingImputation
# Fixed typo: was `RAI.dataPeprocessingProtocol`, which would emit a URI that
# does not exist in the RAI vocabulary (siblings all use `dataPreprocessing*`).
ML_COMMONS_RAI_DATA_PREPROCESSING_PROTOCOL = RAI.dataPreprocessingProtocol
ML_COMMONS_RAI_DATA_PREPROCESSING_MANIPULATION = RAI.dataPreprocessingManipulation
ML_COMMONS_RAI_DATA_ANNOTATION_PROTOCOL = RAI.dataAnnotationProtocol
ML_COMMONS_RAI_DATA_ANNOTATION_PLATFORM = RAI.dataAnnotationPlatform
ML_COMMONS_RAI_DATA_ANNOTATION_ANALYSIS = RAI.dataAnnotationAnalysis
ML_COMMONS_RAI_DATA_ANNOTATION_PER_ITEM = RAI.dataAnnotationPerItem
ML_COMMONS_RAI_DATA_ANNOTATION_DEMOGRAPHICS = RAI.dataAnnotationDemographics
ML_COMMONS_RAI_DATA_ANNOTATION_TOOLS = RAI.dataAnnotationTools
ML_COMMONS_RAI_DATA_USE_CASES = RAI.dataUseCases
ML_COMMONS_RAI_DATA_BIASES = RAI.dataBiases
ML_COMMONS_RAI_DATA_LIMITATION = RAI.dataLimitation
ML_COMMONS_RAI_DATA_SOCIAL_IMPACT = RAI.dataSocialImpact
ML_COMMONS_RAI_DATA_SENSITIVE = RAI.dataSensitive
ML_COMMONS_RAI_DATA_MAINTENANCE = RAI.dataMaintenance

# RDF standard URIs.
# For "@type" key:
Expand Down
113 changes: 113 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/dataclasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Utils to overload Python built-in dataclasses."""

from __future__ import annotations

import dataclasses
from typing import Any, Callable, Literal

from rdflib import term

from mlcroissant._src.core.context import Context
from mlcroissant._src.core.types import Json


class JsonldField(dataclasses.Field):
    """A dataclasses.Field carrying JSON-LD mapping metadata.

    On top of the standard field machinery, each instance records how the
    attribute maps to and from JSON-LD: its cardinality ("ONE"/"MANY"), a
    human-readable description, optional (de)serialization hooks, the accepted
    input types, whether the property is required, and the property URL
    (either a fixed URIRef or a callable resolving one from a Context).
    """

    def __init__(
        self,
        *args,
        cardinality: Literal["ONE", "MANY"],
        description: str,
        from_jsonld: Callable[[Context, Json], Any] | None,
        input_types: list[Any],
        to_jsonld: Callable[[Context, Json], Any] | None,
        required: bool,
        url: term.URIRef | Callable[[Context], term.URIRef],
    ):
        """Forwards *args to dataclasses.Field and stores the JSON-LD extras."""
        super().__init__(*args)
        # JSON-LD-specific settings live directly on the instance.
        self.cardinality = cardinality
        self.description = description
        self.from_jsonld = from_jsonld
        self.input_types = input_types
        self.to_jsonld = to_jsonld
        self.required = required
        self.url = url

    def call_from_jsonld(self, ctx: Context, value: Any):
        """Deserializes `value` via `from_jsonld` when both are truthy."""
        converter = self.from_jsonld
        return converter(ctx, value) if value and converter else value

    def call_to_jsonld(self, ctx: Context, value: Any):
        """Serializes `value` via `to_jsonld` when both are truthy."""
        converter = self.to_jsonld
        return converter(ctx, value) if value and converter else value

    def call_url(self, ctx: Context) -> term.URIRef:
        """Returns the property URL, resolving it with `ctx` when callable."""
        if isinstance(self.url, term.URIRef):
            return self.url
        return self.url(ctx)


def jsonld_field(
    default=dataclasses.MISSING,
    default_factory=dataclasses.MISSING,
    init=True,
    repr=True,
    hash=None,
    compare=True,
    metadata=None,
    kw_only=dataclasses.MISSING,
    cardinality="ONE",
    description="",
    from_jsonld=None,
    input_types=None,
    to_jsonld=None,
    required=False,
    url=None,
):
    """Overloads dataclasses.field with specific attributes.

    The first eight parameters mirror `dataclasses.field`; the remaining ones
    are the JSON-LD extras forwarded to `JsonldField`.

    Raises:
        ValueError: if both `default` and `default_factory` are given, if
            `input_types` is not a non-empty list, or if `url` is falsy.
    """
    missing = dataclasses.MISSING
    # Same mutual-exclusion rule as dataclasses.field.
    if default is not missing and default_factory is not missing:
        raise ValueError("cannot specify both default and default_factory")
    if not (input_types and isinstance(input_types, list)):
        raise ValueError(f"input type should be a non-empty list. Got: {input_types}")
    if not url:
        raise ValueError(f"Provide a url. Got: {url}")
    return JsonldField(
        default,
        default_factory,
        init,
        repr,
        hash,
        compare,
        metadata,
        kw_only,
        cardinality=cardinality,
        description=description,
        from_jsonld=from_jsonld,
        input_types=input_types,
        to_jsonld=to_jsonld,
        required=required,
        url=url,
    )


def jsonld_fields(cls_or_instance) -> list[JsonldField]:
    """Returns only the JSON-LD-aware fields declared on a dataclass.

    Plain `dataclasses.field` entries are filtered out; only `JsonldField`
    instances are kept.
    """
    declared = dataclasses.fields(cls_or_instance)
    return list(filter(lambda f: isinstance(f, JsonldField), declared))
4 changes: 1 addition & 3 deletions python/mlcroissant/mlcroissant/_src/core/json_ld_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,8 @@ def test_make_context():
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
"data": {"@id": "cr:data", "@type": "@json"},
"dataBiases": "cr:dataBiases",
"dataCollection": "cr:dataCollection",
"dataType": {"@id": "cr:dataType", "@type": "@vocab"},
"dct": "http://purl.org/dc/terms/",
"extract": "cr:extract",
Expand All @@ -56,7 +55,6 @@ def test_make_context():
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"personalSensitiveInformation": "cr:personalSensitiveInformation",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
Expand Down
4 changes: 1 addition & 3 deletions python/mlcroissant/mlcroissant/_src/core/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,8 @@ def make_context(ctx=None, **kwargs):
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
"data": {"@id": "cr:data", "@type": "@json"},
"dataBiases": "cr:dataBiases",
"dataCollection": "cr:dataCollection",
"dataType": {"@id": "cr:dataType", "@type": "@vocab"},
"dct": "http://purl.org/dc/terms/",
"extract": "cr:extract",
Expand All @@ -44,7 +43,6 @@ def make_context(ctx=None, **kwargs):
"md5": "sc:md5" if ctx is not None and ctx.is_v0() else "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"personalSensitiveInformation": "cr:personalSensitiveInformation",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
Expand Down
2 changes: 2 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/uuid.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def uuid_from_jsonld(jsonld: Json | None) -> str:
if isinstance(jsonld, dict):
uuid = jsonld.get("@id")
return uuid_from_jsonld(uuid)
elif isinstance(jsonld, list):
return [uuid_from_jsonld(uuid) for uuid in jsonld]
elif isinstance(jsonld, str):
return uuid_to_jsonld(jsonld)
return generate_uuid()
Expand Down
23 changes: 18 additions & 5 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,17 @@ def get_error_msg(folder: epath.Path):
# Distribution.
"distribution_bad_contained_in",
"distribution_bad_type",
# When the name is missing, the context should still appear without the name.
"distribution_missing_name",
"distribution_missing_encoding_format",
"distribution_missing_property_content_url",
# Metadata.
"metadata_bad_type",
"metadata_missing_property_name",
# ML field.
"mlfield_bad_source",
"mlfield_bad_type",
"mlfield_missing_property_name",
"mlfield_missing_source",
# Record set.
"recordset_bad_type",
"recordset_missing_context_for_datatype",
"recordset_missing_property_name",
"recordset_wrong_join",
],
)
Expand All @@ -51,6 +47,23 @@ def test_static_analysis(version, folder):
assert str(error_info.value) == get_error_msg(base_path / folder)


# These tests refer to properties which were mandatory for Croissant 0.8, but not 1.0.
@pytest.mark.parametrize(
    "folder",
    [
        # Each folder contains a metadata.json missing a property that 0.8 requires.
        "distribution_missing_name",
        "metadata_missing_property_name",
        "mlfield_missing_property_name",
        "recordset_missing_property_name",
    ],
)
def test_static_analysis_0_8(folder):
    """Checks that 0.8 graphs missing a mandatory property fail validation."""
    base_path = epath.Path(__file__).parent / "tests/graphs" / "0.8"
    # Loading should raise, and the message must match the golden error
    # recorded alongside the test graph (see get_error_msg).
    with pytest.raises(ValidationError) as error_info:
        datasets.Dataset(base_path / f"{folder}/metadata.json")
    assert str(error_info.value) == get_error_msg(base_path / folder)


def load_records_and_test_equality(
version: str, dataset_name: str, record_set_name: str, num_records: int
):
Expand Down
72 changes: 65 additions & 7 deletions python/mlcroissant/mlcroissant/_src/structure_graph/base_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@

from mlcroissant._src.core import constants
from mlcroissant._src.core.context import Context
from mlcroissant._src.core.data_types import check_expected_type
from mlcroissant._src.core.dataclasses import jsonld_fields
from mlcroissant._src.core.issues import Issues
from mlcroissant._src.core.json_ld import box_singleton_list
from mlcroissant._src.core.json_ld import remove_empty_values
from mlcroissant._src.core.json_ld import unbox_singleton_list
from mlcroissant._src.core.types import Json
from mlcroissant._src.core.uuid import generate_uuid
from mlcroissant._src.core.uuid import uuid_from_jsonld

ID_REGEX = "[a-zA-Z0-9\\-_\\.]+"
_MAX_ID_LENGTH = 255
NAME_REGEX = "[a-zA-Z0-9\\-_\\.]+"
_MAX_NAME_LENGTH = 255


@dataclasses.dataclass(eq=False, repr=False)
Expand Down Expand Up @@ -44,7 +50,8 @@ class Node(abc.ABC):

def __post_init__(self):
"""Checks for common properties between all nodes."""
self.assert_has_mandatory_properties("name", "id")
uuid_field = "name" if self.ctx.is_v0() else "id"
self.assert_has_mandatory_properties(uuid_field)

def assert_has_mandatory_properties(self, *mandatory_properties: str):
"""Checks a node in the graph for existing properties with constraints.
Expand Down Expand Up @@ -222,13 +229,17 @@ def validate_name(self):
return
if not name:
# This case is already checked for in every node's __post_init__ as `name`
# is a mandatory parameter.
# is a mandatory parameter for Croissant 0.8
return
if len(name) > _MAX_ID_LENGTH:
# For Croissant >= 1.0 compliant datasets, we don't enforce any more constraints
# on names.
if not self.ctx.is_v0():
return
if len(name) > _MAX_NAME_LENGTH:
self.add_error(
f'The name "{name}" is too long (>{_MAX_ID_LENGTH} characters).'
f'The name "{name}" is too long (>{_MAX_NAME_LENGTH} characters).'
)
regex = re.compile(rf"^{ID_REGEX}$")
regex = re.compile(rf"^{NAME_REGEX}$")
if not regex.match(name):
self.add_error(f'The name "{name}" contains forbidden characters.')

Expand Down Expand Up @@ -265,3 +276,50 @@ def __deepcopy__(self, memo):
copy = self.__class__(**kwargs) # pytype: disable=not-instantiable
memo[id(self)] = copy
return copy


class NodeV2(Node):
    """Extends Node. When the migration is complete, merge `Node` and `NodeV2`."""

    def to_json(self) -> Json:
        """Converts the Python class to JSON."""
        cls = self.__class__
        jsonld = {
            # Shorten the full JSON-LD type URI using the context's RDF prefixes.
            "@type": self.ctx.rdf.shorten_value(cls._JSONLD_TYPE(self.ctx)),
            # Croissant 0.8 has no @id, so it is omitted for v0 datasets.
            "@id": None if self.ctx.is_v0() else self.id,
        }
        for field in jsonld_fields(self):
            url = field.call_url(self.ctx)
            # The JSON key is the last path segment of the property URL.
            key = url.split("/")[-1]
            value = getattr(self, field.name)
            value = field.call_to_jsonld(self.ctx, value)
            # MANY-cardinality values serialize as a bare scalar when they hold
            # a single element — except `fields`, which always stays a list.
            if field.cardinality == "MANY" and field.name != "fields":
                value = unbox_singleton_list(value)
            jsonld[key] = value
        # Drop empty values so the emitted JSON-LD stays compact.
        return remove_empty_values(jsonld)

    @classmethod
    def from_jsonld(cls, ctx: Context, jsonld: Json):
        """Creates a Python class from JSON-LD.

        Args:
            ctx: Context carrying issues and version-dependent conventions.
            jsonld: The parsed JSON-LD node — or list of nodes — to convert.

        Returns:
            An instance of `cls`, or a list of instances when `jsonld` is a list.
        """
        if isinstance(jsonld, list):
            # Recurse element-wise over JSON-LD arrays.
            return [cls.from_jsonld(ctx, el) for el in jsonld]
        check_expected_type(ctx.issues, jsonld, cls._JSONLD_TYPE(ctx))
        kwargs = {}
        for field in jsonld_fields(cls):
            url = field.call_url(ctx)
            value = jsonld.get(url)
            value = field.call_from_jsonld(ctx, value)
            # MANY-cardinality properties are normalized to lists.
            if field.cardinality == "MANY":
                value = box_singleton_list(value)
            # NOTE(review): falsy values (0, "", []) are skipped here, so the
            # dataclass default applies to them — confirm this is intended.
            if value:
                kwargs[field.name] = value
        return cls(
            ctx=ctx,
            id=uuid_from_jsonld(jsonld),
            **kwargs,
        )

    @classmethod
    def _JSONLD_TYPE(cls, ctx: Context):
        """Returns the JSON-LD `@type` URI for this node; subclasses must override."""
        del ctx
        raise NotImplementedError("Output the right JSON-LD type.")
Loading

0 comments on commit 271a205

Please sign in to comment.