From c5eb1d630d3be73f540d2ff6c2a8b229f747a551 Mon Sep 17 00:00:00 2001 From: Ivan Longin Date: Wed, 13 Nov 2024 15:46:20 +0100 Subject: [PATCH] Adding `uuid` to dataset version (#587) * added uuid to dataset version * returning all tests * fixing pull and having same uuid locally --- src/datachain/catalog/catalog.py | 5 +++++ src/datachain/data_storage/metastore.py | 4 ++++ src/datachain/dataset.py | 5 +++++ src/datachain/lib/dataset_info.py | 3 +++ tests/func/test_datasets.py | 4 ++++ tests/func/test_pull.py | 4 ++++ 6 files changed, 25 insertions(+) diff --git a/src/datachain/catalog/catalog.py b/src/datachain/catalog/catalog.py index f48e0da1e..1a6620c9a 100644 --- a/src/datachain/catalog/catalog.py +++ b/src/datachain/catalog/catalog.py @@ -769,6 +769,7 @@ def create_dataset( create_rows: Optional[bool] = True, validate_version: Optional[bool] = True, listing: Optional[bool] = False, + uuid: Optional[str] = None, ) -> "DatasetRecord": """ Creates new dataset of a specific version. @@ -816,6 +817,7 @@ def create_dataset( query_script=query_script, create_rows_table=create_rows, columns=columns, + uuid=uuid, ) def create_new_dataset_version( @@ -832,6 +834,7 @@ def create_new_dataset_version( script_output="", create_rows_table=True, job_id: Optional[str] = None, + uuid: Optional[str] = None, ) -> DatasetRecord: """ Creates dataset version if it doesn't exist. @@ -855,6 +858,7 @@ def create_new_dataset_version( schema=schema, job_id=job_id, ignore_if_exists=True, + uuid=uuid, ) if create_rows_table: @@ -1400,6 +1404,7 @@ def _instantiate_dataset(): columns=columns, feature_schema=remote_dataset_version.feature_schema, validate_version=False, + uuid=remote_dataset_version.uuid, ) # asking remote to export dataset rows table to s3 and to return signed diff --git a/src/datachain/data_storage/metastore.py b/src/datachain/data_storage/metastore.py index 8ca56892e..7afbb3e72 100644 --- a/src/datachain/data_storage/metastore.py +++ b/src/datachain/data_storage/metastore.py @@ -138,6 +138,7 @@ def create_dataset_version( # noqa: PLR0913 size: Optional[int] = None, preview: Optional[list[dict]] = None, job_id: Optional[str] = None, + uuid: Optional[str] = None, ) -> DatasetRecord: """Creates new dataset version.""" @@ -352,6 +353,7 @@ def _datasets_versions_columns(cls) -> list["SchemaItem"]: """Datasets versions table columns.""" return [ Column("id", Integer, primary_key=True), + Column("uuid", Text, nullable=False, default=uuid4()), Column( "dataset_id", Integer, @@ -545,6 +547,7 @@ def create_dataset_version( # noqa: PLR0913 size: Optional[int] = None, preview: Optional[list[dict]] = None, job_id: Optional[str] = None, + uuid: Optional[str] = None, conn=None, ) -> DatasetRecord: """Creates new dataset version.""" @@ -555,6 +558,7 @@ def create_dataset_version( # noqa: PLR0913 query = self._datasets_versions_insert().values( dataset_id=dataset.id, + uuid=uuid or str(uuid4()), version=version, status=status, feature_schema=json.dumps(feature_schema or {}), diff --git a/src/datachain/dataset.py b/src/datachain/dataset.py index ea3cbb89a..a7d28be1f 100644 --- a/src/datachain/dataset.py +++ b/src/datachain/dataset.py @@ -163,6 +163,7 @@ class DatasetStatus: @dataclass class DatasetVersion: id: int + uuid: str dataset_id: int version: int status: int @@ -184,6 +185,7 @@ class DatasetVersion: def parse( # noqa: PLR0913 cls: type[V], id: int, + uuid: str, dataset_id: int, version: int, status: int, @@ -203,6 +205,7 @@ def parse( # noqa: PLR0913 ): return cls( id, + uuid, dataset_id, version, status, @@ -306,6 +309,7 @@ def parse( # noqa: PLR0913 query_script: str, schema: str, version_id: int, + version_uuid: str, version_dataset_id: int, version: int, version_status: int, @@ -331,6 +335,7 @@ def parse( # noqa: PLR0913 dataset_version = DatasetVersion.parse( version_id, + version_uuid, version_dataset_id, version, version_status, diff --git a/src/datachain/lib/dataset_info.py b/src/datachain/lib/dataset_info.py index fad8673db..ece6d742a 100644 --- a/src/datachain/lib/dataset_info.py +++ b/src/datachain/lib/dataset_info.py @@ -1,6 +1,7 @@ import json from datetime import datetime from typing import TYPE_CHECKING, Any, Optional, Union +from uuid import uuid4 from pydantic import Field, field_validator @@ -15,6 +16,7 @@ class DatasetInfo(DataModel): name: str + uuid: str = Field(default=str(uuid4())) version: int = Field(default=1) status: int = Field(default=DatasetStatus.CREATED) created_at: datetime = Field(default=TIME_ZERO) @@ -60,6 +62,7 @@ def from_models( job: Optional[Job], ) -> "Self": return cls( + uuid=version.uuid, name=dataset.name, version=version.version, status=version.status, diff --git a/tests/func/test_datasets.py b/tests/func/test_datasets.py index d637fea58..03b90ecc5 100644 --- a/tests/func/test_datasets.py +++ b/tests/func/test_datasets.py @@ -56,6 +56,7 @@ def test_create_dataset_no_version_specified(cloud_test_catalog, create_rows): assert dataset.schema["similarity"] == Float32 assert dataset_version.schema["similarity"] == Float32 assert dataset_version.status == DatasetStatus.PENDING + assert dataset_version.uuid assert dataset.status == DatasetStatus.CREATED # dataset status is deprecated if create_rows: assert dataset_version.num_objects == 0 @@ -85,6 +86,7 @@ def test_create_dataset_with_explicit_version(cloud_test_catalog, create_rows): assert dataset.schema["similarity"] == Float32 assert dataset_version.schema["similarity"] == Float32 assert dataset_version.status == DatasetStatus.PENDING + assert dataset_version.uuid assert dataset.status == DatasetStatus.CREATED if create_rows: assert dataset_version.num_objects == 0 @@ -178,6 +180,7 @@ def test_create_dataset_from_sources(listed_bucket, cloud_test_catalog): assert dataset_version.error_stack == "" assert dataset_version.script_output == "" assert dataset_version.sources == f"{src_uri}/dogs/*" + assert dataset_version.uuid dr = catalog.warehouse.schema.dataset_row_cls sys_schema = {c.name: type(c.type) for c in dr.sys_columns()} @@ -214,6 +217,7 @@ def test_create_dataset_from_sources_dataset(cloud_test_catalog, dogs_dataset): assert dataset_version.error_stack == "" assert dataset_version.script_output == "" assert dataset_version.sources == f"ds://{dogs_dataset.name}" + assert dataset_version.uuid dr = catalog.warehouse.schema.dataset_row_cls sys_schema = {c.name: type(c.type) for c in dr.sys_columns()} diff --git a/tests/func/test_pull.py b/tests/func/test_pull.py index bb720fbf5..2be9936ad 100644 --- a/tests/func/test_pull.py +++ b/tests/func/test_pull.py @@ -13,6 +13,8 @@ from tests.data import ENTRIES from tests.utils import assert_row_names, skip_if_not_sqlite +DATASET_UUID = "20f5a2f1-fc9a-4e36-8b91-5a530f289451" + @pytest.fixture(autouse=True) def studio_config(): @@ -90,6 +92,7 @@ def schema(): def remote_dataset_version(schema, dataset_rows): return { "id": 1, + "uuid": DATASET_UUID, "dataset_id": 1, "version": 1, "status": 4, @@ -179,6 +182,7 @@ def test_pull_dataset_success( assert dataset_version.schema assert dataset_version.num_objects == 4 assert dataset_version.size == 15 + assert dataset_version.uuid == DATASET_UUID assert_row_names( catalog,