Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into benjelloun-patch-2
Browse files Browse the repository at this point in the history
  • Loading branch information
benjelloun authored Mar 6, 2024
2 parents bda9207 + e0d07f7 commit 271a205
Show file tree
Hide file tree
Showing 35 changed files with 1,022 additions and 662 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,6 @@ The Task Force is open to anyone (as is the parent [Datasets working group](http
The Task Force is co-chaired by [Omar Benjelloun](mailto:[email protected]) and [Elena Simperl](mailto:[email protected]).

## Contributors
Albert Villanova (Hugging Face), Andrew Zaldivar (Google), Baishan Guo (Meta), Carole Jean-Wu (Meta), Ce Zhang (ETH Zurich), Costanza Conforti (Google), D. Sculley (Kaggle), Dan Brickley (Schema.Org), Eduardo Arino de la Rubia (Meta), Edward Lockhart (Deepmind), Elena Simperl (King's College London), Goeff Thomas (Kaggle), Joaquin Vanschoren (TU/Eindhoven, OpenML), Jos van der Velde (TU/Eindhoven, OpenML), Julien Chaumond (Hugging Face), Kurt Bollacker (MLCommons), Lora Aroyo (Google), Luis Oala (Dotphoton), Meg Risdal (Kaggle), Natasha Noy (Google), Newsha Ardalani (Meta), Omar Benjelloun (Google), Peter Mattson (MLCommons), Pierre Marcenac (Google), Pierre Ruyssen (Google), Pieter Gijsbers (TU/Eindhoven, OpenML), Prabhant Singh (TU/Eindhoven, OpenML), Quentin Lhoest (Hugging Face), Steffen Vogler (Bayer), Taniya Das (TU/Eindhoven, OpenML)
Albert Villanova (Hugging Face), Andrew Zaldivar (Google), Baishan Guo (Meta), Carole Jean-Wu (Meta), Ce Zhang (ETH Zurich), Costanza Conforti (Google), D. Sculley (Kaggle), Dan Brickley (Schema.Org), Eduardo Arino de la Rubia (Meta), Edward Lockhart (Deepmind), Elena Simperl (King's College London), Goeff Thomas (Kaggle), Joan Giner-Miguelez (UOC), Joaquin Vanschoren (TU/Eindhoven, OpenML), Jos van der Velde (TU/Eindhoven, OpenML), Julien Chaumond (Hugging Face), Kurt Bollacker (MLCommons), Lora Aroyo (Google), Luis Oala (Dotphoton), Meg Risdal (Kaggle), Natasha Noy (Google), Newsha Ardalani (Meta), Omar Benjelloun (Google), Peter Mattson (MLCommons), Pierre Marcenac (Google), Pierre Ruyssen (Google), Pieter Gijsbers (TU/Eindhoven, OpenML), Prabhant Singh (TU/Eindhoven, OpenML), Quentin Lhoest (Hugging Face), Steffen Vogler (Bayer), Taniya Das (TU/Eindhoven, OpenML)

Thank you for supporting Croissant! 🙂
9 changes: 6 additions & 3 deletions datasets/1.0/recipes/enum.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"dataBiases": "cr:dataBiases",
"dataCollection": "cr:dataCollection",
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
Expand All @@ -31,7 +30,6 @@
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"personalSensitiveInformation": "cr:personalSensitiveInformation",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
Expand All @@ -48,6 +46,11 @@
"conformsTo": "http://mlcommons.org/croissant/1.0",
"description": "This is a fairly minimal example, showing a way to describe enumerations.",
"url": "https://example.com/datasets/enum/about",
"rai:dataCollection": "This is how data is collected",
"rai:dataBiases": [
"Bias 1 in data",
"Bias 2 in data"
],
"distribution": [
{
"@type": "cr:FileObject",
Expand Down
26 changes: 26 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,32 @@ def ML_COMMONS(ctx) -> rdflib.Namespace:
ML_COMMONS_SUB_FIELD_TYPE = lambda ctx: ML_COMMONS(ctx).SubField
ML_COMMONS_TRANSFORM = lambda ctx: ML_COMMONS(ctx).transform

# Croissant RAI extension
# V1.0 namespace
RAI = rdflib.Namespace("http://mlcommons.org/croissant/RAI/")
# Attributes of the Responsible AI (RAI) vocabulary. Each constant resolves to
# a term under http://mlcommons.org/croissant/RAI/ and is used as a JSON-LD
# property URI, so every local name must match the RAI specification exactly.
ML_COMMONS_RAI_DATA_COLLECTION = RAI.dataCollection
ML_COMMONS_RAI_DATA_COLLECTION_TYPE = RAI.dataCollectionType
ML_COMMONS_RAI_DATA_COLLECTION_TYPE_OTHERS = RAI.dataCollectionTypeOthers
ML_COMMONS_RAI_DATA_COLLECTION_MISSING = RAI.dataCollectionMissing
ML_COMMONS_RAI_DATA_COLLECTION_RAW = RAI.dataCollectionRaw
ML_COMMONS_RAI_DATA_COLLECTION_TIMEFRAME_START = RAI.dataCollectionTimeFrameStart
ML_COMMONS_RAI_DATA_COLLECTION_TIMEFRAME_END = RAI.dataCollectionTimeFrameEnd
ML_COMMONS_RAI_DATA_PREPROCESSING_IMPUTATION = RAI.dataPreprocessingImputation
# Fixed typo: was `RAI.dataPeprocessingProtocol`, which would emit a URI that
# does not exist in the RAI vocabulary (siblings all use `dataPreprocessing*`).
ML_COMMONS_RAI_DATA_PREPROCESSING_PROTOCOL = RAI.dataPreprocessingProtocol
ML_COMMONS_RAI_DATA_PREPROCESSING_MANIPULATION = RAI.dataPreprocessingManipulation
ML_COMMONS_RAI_DATA_ANNOTATION_PROTOCOL = RAI.dataAnnotationProtocol
ML_COMMONS_RAI_DATA_ANNOTATION_PLATFORM = RAI.dataAnnotationPlatform
ML_COMMONS_RAI_DATA_ANNOTATION_ANALYSIS = RAI.dataAnnotationAnalysis
ML_COMMONS_RAI_DATA_ANNOTATION_PER_ITEM = RAI.dataAnnotationPerItem
ML_COMMONS_RAI_DATA_ANNOTATION_DEMOGRAPHICS = RAI.dataAnnotationDemographics
ML_COMMONS_RAI_DATA_ANNOTATION_TOOLS = RAI.dataAnnotationTools
ML_COMMONS_RAI_DATA_USE_CASES = RAI.dataUseCases
ML_COMMONS_RAI_DATA_BIASES = RAI.dataBiases
ML_COMMONS_RAI_DATA_LIMITATION = RAI.dataLimitation
ML_COMMONS_RAI_DATA_SOCIAL_IMPACT = RAI.dataSocialImpact
ML_COMMONS_RAI_DATA_SENSITIVE = RAI.dataSensitive
ML_COMMONS_RAI_DATA_MAINTENANCE = RAI.dataMaintenance

# RDF standard URIs.
# For "@type" key:
Expand Down
113 changes: 113 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/dataclasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Utils to overload Python built-in dataclasses."""

from __future__ import annotations

import dataclasses
from typing import Any, Callable, Literal

from rdflib import term

from mlcroissant._src.core.context import Context
from mlcroissant._src.core.types import Json


class JsonldField(dataclasses.Field):
    """A dataclasses.Field carrying JSON-LD mapping metadata.

    On top of the standard field machinery, each instance records how the
    attribute maps to and from JSON-LD: its cardinality ("ONE"/"MANY"), a
    human-readable description, optional (de)serialization hooks, the accepted
    input types, whether the property is required, and the property URL
    (either a fixed URIRef or a callable resolving one from a Context).
    """

    def __init__(
        self,
        *args,
        cardinality: Literal["ONE", "MANY"],
        description: str,
        from_jsonld: Callable[[Context, Json], Any] | None,
        input_types: list[Any],
        to_jsonld: Callable[[Context, Json], Any] | None,
        required: bool,
        url: term.URIRef | Callable[[Context], term.URIRef],
    ):
        """Forwards *args to dataclasses.Field and stores the JSON-LD extras."""
        super().__init__(*args)
        # JSON-LD-specific settings live directly on the instance.
        self.cardinality = cardinality
        self.description = description
        self.from_jsonld = from_jsonld
        self.input_types = input_types
        self.to_jsonld = to_jsonld
        self.required = required
        self.url = url

    def call_from_jsonld(self, ctx: Context, value: Any):
        """Deserializes `value` via `from_jsonld` when both are truthy."""
        converter = self.from_jsonld
        return converter(ctx, value) if value and converter else value

    def call_to_jsonld(self, ctx: Context, value: Any):
        """Serializes `value` via `to_jsonld` when both are truthy."""
        converter = self.to_jsonld
        return converter(ctx, value) if value and converter else value

    def call_url(self, ctx: Context) -> term.URIRef:
        """Returns the property URL, resolving it with `ctx` when callable."""
        if isinstance(self.url, term.URIRef):
            return self.url
        return self.url(ctx)


def jsonld_field(
    default=dataclasses.MISSING,
    default_factory=dataclasses.MISSING,
    init=True,
    repr=True,
    hash=None,
    compare=True,
    metadata=None,
    kw_only=dataclasses.MISSING,
    cardinality="ONE",
    description="",
    from_jsonld=None,
    input_types=None,
    to_jsonld=None,
    required=False,
    url=None,
):
    """Overloads dataclasses.field with specific attributes.

    The first eight parameters mirror `dataclasses.field`; the remaining ones
    are the JSON-LD extras forwarded to `JsonldField`.

    Raises:
        ValueError: if both `default` and `default_factory` are given, if
            `input_types` is not a non-empty list, or if `url` is falsy.
    """
    missing = dataclasses.MISSING
    # Same mutual-exclusion rule as dataclasses.field.
    if default is not missing and default_factory is not missing:
        raise ValueError("cannot specify both default and default_factory")
    if not (input_types and isinstance(input_types, list)):
        raise ValueError(f"input type should be a non-empty list. Got: {input_types}")
    if not url:
        raise ValueError(f"Provide a url. Got: {url}")
    return JsonldField(
        default,
        default_factory,
        init,
        repr,
        hash,
        compare,
        metadata,
        kw_only,
        cardinality=cardinality,
        description=description,
        from_jsonld=from_jsonld,
        input_types=input_types,
        to_jsonld=to_jsonld,
        required=required,
        url=url,
    )


def jsonld_fields(cls_or_instance) -> list[JsonldField]:
    """Returns only the JSON-LD-aware fields declared on a dataclass.

    Plain `dataclasses.field` entries are filtered out; only `JsonldField`
    instances are kept.
    """
    declared = dataclasses.fields(cls_or_instance)
    return list(filter(lambda f: isinstance(f, JsonldField), declared))
4 changes: 1 addition & 3 deletions python/mlcroissant/mlcroissant/_src/core/json_ld_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,8 @@ def test_make_context():
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
"data": {"@id": "cr:data", "@type": "@json"},
"dataBiases": "cr:dataBiases",
"dataCollection": "cr:dataCollection",
"dataType": {"@id": "cr:dataType", "@type": "@vocab"},
"dct": "http://purl.org/dc/terms/",
"extract": "cr:extract",
Expand All @@ -56,7 +55,6 @@ def test_make_context():
"md5": "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"personalSensitiveInformation": "cr:personalSensitiveInformation",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
Expand Down
4 changes: 1 addition & 3 deletions python/mlcroissant/mlcroissant/_src/core/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,8 @@ def make_context(ctx=None, **kwargs):
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
"data": {"@id": "cr:data", "@type": "@json"},
"dataBiases": "cr:dataBiases",
"dataCollection": "cr:dataCollection",
"dataType": {"@id": "cr:dataType", "@type": "@vocab"},
"dct": "http://purl.org/dc/terms/",
"extract": "cr:extract",
Expand All @@ -44,7 +43,6 @@ def make_context(ctx=None, **kwargs):
"md5": "sc:md5" if ctx is not None and ctx.is_v0() else "cr:md5",
"parentField": "cr:parentField",
"path": "cr:path",
"personalSensitiveInformation": "cr:personalSensitiveInformation",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
Expand Down
2 changes: 2 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/uuid.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def uuid_from_jsonld(jsonld: Json | None) -> str:
if isinstance(jsonld, dict):
uuid = jsonld.get("@id")
return uuid_from_jsonld(uuid)
elif isinstance(jsonld, list):
return [uuid_from_jsonld(uuid) for uuid in jsonld]
elif isinstance(jsonld, str):
return uuid_to_jsonld(jsonld)
return generate_uuid()
Expand Down
23 changes: 18 additions & 5 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,17 @@ def get_error_msg(folder: epath.Path):
# Distribution.
"distribution_bad_contained_in",
"distribution_bad_type",
# When the name is missing, the context should still appear without the name.
"distribution_missing_name",
"distribution_missing_encoding_format",
"distribution_missing_property_content_url",
# Metadata.
"metadata_bad_type",
"metadata_missing_property_name",
# ML field.
"mlfield_bad_source",
"mlfield_bad_type",
"mlfield_missing_property_name",
"mlfield_missing_source",
# Record set.
"recordset_bad_type",
"recordset_missing_context_for_datatype",
"recordset_missing_property_name",
"recordset_wrong_join",
],
)
Expand All @@ -51,6 +47,23 @@ def test_static_analysis(version, folder):
assert str(error_info.value) == get_error_msg(base_path / folder)


# These tests refer to properties which were mandatory for Croissant 0.8, but not 1.0.
@pytest.mark.parametrize(
    "folder",
    [
        # Each folder contains a metadata.json missing a property that 0.8 requires.
        "distribution_missing_name",
        "metadata_missing_property_name",
        "mlfield_missing_property_name",
        "recordset_missing_property_name",
    ],
)
def test_static_analysis_0_8(folder):
    """Checks that 0.8 graphs missing a mandatory property fail validation."""
    base_path = epath.Path(__file__).parent / "tests/graphs" / "0.8"
    # Loading should raise, and the message must match the golden error
    # recorded alongside the test graph (see get_error_msg).
    with pytest.raises(ValidationError) as error_info:
        datasets.Dataset(base_path / f"{folder}/metadata.json")
    assert str(error_info.value) == get_error_msg(base_path / folder)


def load_records_and_test_equality(
version: str, dataset_name: str, record_set_name: str, num_records: int
):
Expand Down
72 changes: 65 additions & 7 deletions python/mlcroissant/mlcroissant/_src/structure_graph/base_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,18 @@

from mlcroissant._src.core import constants
from mlcroissant._src.core.context import Context
from mlcroissant._src.core.data_types import check_expected_type
from mlcroissant._src.core.dataclasses import jsonld_fields
from mlcroissant._src.core.issues import Issues
from mlcroissant._src.core.json_ld import box_singleton_list
from mlcroissant._src.core.json_ld import remove_empty_values
from mlcroissant._src.core.json_ld import unbox_singleton_list
from mlcroissant._src.core.types import Json
from mlcroissant._src.core.uuid import generate_uuid
from mlcroissant._src.core.uuid import uuid_from_jsonld

ID_REGEX = "[a-zA-Z0-9\\-_\\.]+"
_MAX_ID_LENGTH = 255
NAME_REGEX = "[a-zA-Z0-9\\-_\\.]+"
_MAX_NAME_LENGTH = 255


@dataclasses.dataclass(eq=False, repr=False)
Expand Down Expand Up @@ -44,7 +50,8 @@ class Node(abc.ABC):

def __post_init__(self):
"""Checks for common properties between all nodes."""
self.assert_has_mandatory_properties("name", "id")
uuid_field = "name" if self.ctx.is_v0() else "id"
self.assert_has_mandatory_properties(uuid_field)

def assert_has_mandatory_properties(self, *mandatory_properties: str):
"""Checks a node in the graph for existing properties with constraints.
Expand Down Expand Up @@ -222,13 +229,17 @@ def validate_name(self):
return
if not name:
# This case is already checked for in every node's __post_init__ as `name`
# is a mandatory parameter.
# is a mandatory parameter for Croissant 0.8
return
if len(name) > _MAX_ID_LENGTH:
# For Croissant >= 1.0 compliant datasets, we don't enforce any more constraints
# on names.
if not self.ctx.is_v0():
return
if len(name) > _MAX_NAME_LENGTH:
self.add_error(
f'The name "{name}" is too long (>{_MAX_ID_LENGTH} characters).'
f'The name "{name}" is too long (>{_MAX_NAME_LENGTH} characters).'
)
regex = re.compile(rf"^{ID_REGEX}$")
regex = re.compile(rf"^{NAME_REGEX}$")
if not regex.match(name):
self.add_error(f'The name "{name}" contains forbidden characters.')

Expand Down Expand Up @@ -265,3 +276,50 @@ def __deepcopy__(self, memo):
copy = self.__class__(**kwargs) # pytype: disable=not-instantiable
memo[id(self)] = copy
return copy


class NodeV2(Node):
    """Extends Node. When the migration is complete, merge `Node` and `NodeV2`."""

    def to_json(self) -> Json:
        """Converts the Python class to JSON."""
        cls = self.__class__
        jsonld = {
            # Shorten the full JSON-LD type URI using the context's RDF prefixes.
            "@type": self.ctx.rdf.shorten_value(cls._JSONLD_TYPE(self.ctx)),
            # Croissant 0.8 has no @id, so it is omitted for v0 datasets.
            "@id": None if self.ctx.is_v0() else self.id,
        }
        for field in jsonld_fields(self):
            url = field.call_url(self.ctx)
            # The JSON key is the last path segment of the property URL.
            key = url.split("/")[-1]
            value = getattr(self, field.name)
            value = field.call_to_jsonld(self.ctx, value)
            # MANY-cardinality values serialize as a bare scalar when they hold
            # a single element — except `fields`, which always stays a list.
            if field.cardinality == "MANY" and field.name != "fields":
                value = unbox_singleton_list(value)
            jsonld[key] = value
        # Drop empty values so the emitted JSON-LD stays compact.
        return remove_empty_values(jsonld)

    @classmethod
    def from_jsonld(cls, ctx: Context, jsonld: Json):
        """Creates a Python class from JSON-LD.

        Args:
            ctx: Context carrying issues and version-dependent conventions.
            jsonld: The parsed JSON-LD node — or list of nodes — to convert.

        Returns:
            An instance of `cls`, or a list of instances when `jsonld` is a list.
        """
        if isinstance(jsonld, list):
            # Recurse element-wise over JSON-LD arrays.
            return [cls.from_jsonld(ctx, el) for el in jsonld]
        check_expected_type(ctx.issues, jsonld, cls._JSONLD_TYPE(ctx))
        kwargs = {}
        for field in jsonld_fields(cls):
            url = field.call_url(ctx)
            value = jsonld.get(url)
            value = field.call_from_jsonld(ctx, value)
            # MANY-cardinality properties are normalized to lists.
            if field.cardinality == "MANY":
                value = box_singleton_list(value)
            # NOTE(review): falsy values (0, "", []) are skipped here, so the
            # dataclass default applies to them — confirm this is intended.
            if value:
                kwargs[field.name] = value
        return cls(
            ctx=ctx,
            id=uuid_from_jsonld(jsonld),
            **kwargs,
        )

    @classmethod
    def _JSONLD_TYPE(cls, ctx: Context):
        """Returns the JSON-LD `@type` URI for this node; subclasses must override."""
        del ctx
        raise NotImplementedError("Output the right JSON-LD type.")
Loading

0 comments on commit 271a205

Please sign in to comment.