iterative · ilongin · Jan 8, 2025 · Dec 17, 2024 · Dec 17, 2024 · Dec 18, 2024
diff --git a/src/datachain/lib/diff.py b/src/datachain/lib/diff.py
@@ -1,6 +1,7 @@
 import random
 import string
 from collections.abc import Sequence
+from enum import Enum
 from typing import TYPE_CHECKING, Optional, Union
 
 import sqlalchemy as sa
@@ -16,6 +17,21 @@
 C = Column
 
 
+def get_status_col_name() -> str:
+    """Return a random status column name: "diff_" plus 10 ASCII letters."""
+    return "diff_" + "".join(
+        random.choice(string.ascii_letters)  # noqa: S311
+        for _ in range(10)
+    )
+
+
+class CompareStatus(str, Enum):
+    ADDED = "A"  # row exists only in the left chain
+    DELETED = "D"  # row exists only in the right chain
+    MODIFIED = "M"  # row matched on `on`, but compared columns differ
+    UNCHANGED = "U"  # row matched on `on` and compared columns are equal
+
+
 def compare(  # noqa: PLR0912, PLR0915, C901
     left: "DataChain",
     right: "DataChain",
@@ -72,13 +88,10 @@ def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
             "At least one of added, deleted, modified, unchanged flags must be set"
         )
 
-    # we still need status column for internal implementation even if not
-    # needed in output
     need_status_col = bool(status_col)
-    status_col = status_col or "diff_" + "".join(
-        random.choice(string.ascii_letters)  # noqa: S311
-        for _ in range(10)
-    )
+    # we still need status column for internal implementation even if not
+    # needed in the output
+    status_col = status_col or get_status_col_name()
 
     # calculate on and compare column names
     right_on = right_on or on
@@ -112,25 +125,27 @@ def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
                 for c in [f"{_rprefix(c, rc)}{rc}" for c, rc in zip(on, right_on)]
             ]
         )
-        diff_cond.append((added_cond, "A"))
+        diff_cond.append((added_cond, CompareStatus.ADDED))
     if modified and compare:
         modified_cond = sa.or_(
             *[
                 C(c) != C(f"{_rprefix(c, rc)}{rc}")
                 for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
             ]
         )
-        diff_cond.append((modified_cond, "M"))
+        diff_cond.append((modified_cond, CompareStatus.MODIFIED))
     if unchanged and compare:
         unchanged_cond = sa.and_(
             *[
                 C(c) == C(f"{_rprefix(c, rc)}{rc}")
                 for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
             ]
         )
-        diff_cond.append((unchanged_cond, "U"))
+        diff_cond.append((unchanged_cond, CompareStatus.UNCHANGED))
 
-    diff = sa.case(*diff_cond, else_=None if compare else "M").label(status_col)
+    diff = sa.case(*diff_cond, else_=None if compare else CompareStatus.MODIFIED).label(
+        status_col
+    )
     diff.type = String()
 
     left_right_merge = left.merge(
@@ -145,7 +160,7 @@ def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
         )
     )
 
-    diff_col = sa.literal("D").label(status_col)
+    diff_col = sa.literal(CompareStatus.DELETED).label(status_col)
     diff_col.type = String()
 
     right_left_merge = right.merge(

diff --git a/src/datachain/toolkit/__init__.py b/src/datachain/toolkit/__init__.py
@@ -1,3 +1,4 @@
+from .diff import compare
 from .split import train_test_split
 
-__all__ = ["train_test_split"]
+__all__ = ["compare", "train_test_split"]
diff --git a/src/datachain/toolkit/diff.py b/src/datachain/toolkit/diff.py
@@ -0,0 +1,111 @@
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Optional, Union
+
+from datachain.lib.diff import CompareStatus, get_status_col_name
+from datachain.lib.diff import compare as chain_compare
+from datachain.query.schema import Column
+
+if TYPE_CHECKING:
+    from datachain.lib.dc import DataChain
+
+
+C = Column
+
+
+def compare(
+    left: "DataChain",
+    right: "DataChain",
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]] = None,
+    compare: Optional[Union[str, Sequence[str]]] = None,
+    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    added: bool = True,
+    deleted: bool = True,
+    modified: bool = True,
+    unchanged: bool = False,
+) -> dict[str, "DataChain"]:
+    """Compare two chains and return one chain per requested status.
+
+    Rows are classified as added, deleted, modified or unchanged by calling
+    `datachain.lib.diff.compare` with a randomly named, temporary status column.
+    That result is then filtered once per requested status, and the status
+    column is removed from every returned chain.
+
+    The result is a dictionary keyed by the `CompareStatus` values `A`, `D`, `M`
+    and `U` (added, deleted, modified and unchanged respectively). Only statuses
+    whose flag is set to `True` are present as keys.
+
+    Unlike `datachain.lib.diff.compare`, whose result carries the status column,
+    the chains returned here do not contain a status column.
+
+    Parameters:
+        left: Chain to calculate diff on.
+        right: Chain to calculate diff from.
+        on: Column or list of columns to match on. If both chains have the
+            same columns then this column is enough for the match. Otherwise,
+            `right_on` parameter has to specify the columns for the other chain.
+            This value is used to find corresponding row in other dataset. If not
+            found there, row is considered as added (or removed if vice versa), and
+            if found then row can be either modified or unchanged.
+        right_on: Optional column or list of columns
+            for the `right` chain to match.
+        compare: Column or list of columns to compare on. If both chains have
+            the same columns then this column is enough for the compare. Otherwise,
+            `right_compare` parameter has to specify the columns for the other
+            chain. This value is used to see if row is modified or unchanged. If
+            not set, all columns will be used for comparison
+        right_compare: Optional column or list of columns
+                for the `right` chain to compare to.
+        added (bool): Whether to return chain containing only added rows.
+        deleted (bool): Whether to return chain containing only deleted rows.
+        modified (bool): Whether to return chain containing only modified rows.
+        unchanged (bool): Whether to return chain containing only unchanged rows.
+
+    Example:
+        ```py
+        chains = compare(
+            persons,
+            new_persons,
+            on=["id"],
+            right_on=["other_id"],
+            compare=["name"],
+            added=True,
+            deleted=True,
+            modified=True,
+            unchanged=True,
+        )
+        ```
+    """
+    status_col = get_status_col_name()
+
+    res = chain_compare(
+        left,
+        right,
+        on,
+        right_on=right_on,
+        compare=compare,
+        right_compare=right_compare,
+        added=added,
+        deleted=deleted,
+        modified=modified,
+        unchanged=unchanged,
+        status_col=status_col,
+    )
+
+    chains = {}
+
+    def filter_by_status(compare_status) -> "DataChain":
+        return res.filter(C(status_col) == compare_status).select_except(status_col)
+
+    if added:
+        chains[CompareStatus.ADDED.value] = filter_by_status(CompareStatus.ADDED)
+    if deleted:
+        chains[CompareStatus.DELETED.value] = filter_by_status(CompareStatus.DELETED)
+    if modified:
+        chains[CompareStatus.MODIFIED.value] = filter_by_status(CompareStatus.MODIFIED)
+    if unchanged:
+        chains[CompareStatus.UNCHANGED.value] = filter_by_status(
+            CompareStatus.UNCHANGED
+        )
+
+    return chains
diff --git a/tests/func/test_toolkit.py b/tests/func/test_toolkit.py
@@ -1,6 +1,7 @@
 import pytest
 
-from datachain.toolkit import train_test_split
+from datachain.lib.dc import DataChain
+from datachain.toolkit import compare, train_test_split
 
 
 @pytest.mark.parametrize(
@@ -49,3 +50,67 @@ def test_train_test_split_errors(not_random_ds):
         train_test_split(not_random_ds, [0.5])
     with pytest.raises(ValueError, match="Weights should be non-negative"):
         train_test_split(not_random_ds, [-1, 1])
+
+
+@pytest.mark.parametrize("added", (True, False))
+@pytest.mark.parametrize("deleted", (True, False))
+@pytest.mark.parametrize("modified", (True, False))
+@pytest.mark.parametrize("unchanged", (True, False))
+def test_compare(test_session, added, deleted, modified, unchanged):
+    ds1 = DataChain.from_values(
+        id=[1, 2, 4],
+        name=["John1", "Doe", "Andy"],
+        session=test_session,
+    ).save("ds1")
+
+    ds2 = DataChain.from_values(
+        id=[1, 3, 4],
+        name=["John", "Mark", "Andy"],
+        session=test_session,
+    ).save("ds2")
+
+    if not any([added, deleted, modified, unchanged]):
+        with pytest.raises(ValueError) as exc_info:
+            compare(
+                ds1,
+                ds2,
+                added=added,
+                deleted=deleted,
+                modified=modified,
+                unchanged=unchanged,
+                on=["id"],
+            )
+        assert str(exc_info.value) == (
+            "At least one of added, deleted, modified, unchanged flags must be set"
+        )
+        return
+
+    chains = compare(
+        ds1,
+        ds2,
+        added=added,
+        deleted=deleted,
+        modified=modified,
+        unchanged=unchanged,
+        on=["id"],
+    )
+    # Status col is named "diff_<random>", so the checks below match by substring.
+    collect_fields = ["id", "name"]
+    if added:
+        assert not any("diff" in c for c in chains["A"].signals_schema.db_signals())
+        assert list(chains["A"].order_by("id").collect(*collect_fields)) == [(2, "Doe")]
+    if deleted:
+        assert not any("diff" in c for c in chains["D"].signals_schema.db_signals())
+        assert list(chains["D"].order_by("id").collect(*collect_fields)) == [
+            (3, "Mark")
+        ]
+    if modified:
+        assert not any("diff" in c for c in chains["M"].signals_schema.db_signals())
+        assert list(chains["M"].order_by("id").collect(*collect_fields)) == [
+            (1, "John1")
+        ]
+    if unchanged:
+        assert not any("diff" in c for c in chains["U"].signals_schema.db_signals())
+        assert list(chains["U"].order_by("id").collect(*collect_fields)) == [
+            (4, "Andy")
+        ]