docs: clarify distinct in difference(), add examples

ibis-project · Jan 24, 2025 · 57d50ec · 57d50ec
1 parent 9b44750
commit 57d50ec
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 25 deletions.
diff --git a/ibis/expr/api.py b/ibis/expr/api.py
@@ -2162,7 +2162,11 @@ def difference(table: ir.Table, *rest: ir.Table, distinct: bool = True) -> ir.Ta
     *rest
         Additional table expressions
     distinct
-        Only diff distinct rows not occurring in the calling table
+        Use set difference (True) or multiset difference (False). See examples.
+
+    See Also
+    --------
+    [`Table.difference`](./expression-tables.qmd#ibis.expr.types.relations.Table.difference)
 
     Returns
     -------
@@ -2173,35 +2177,50 @@ def difference(table: ir.Table, *rest: ir.Table, distinct: bool = True) -> ir.Ta
     --------
     >>> import ibis
     >>> ibis.options.interactive = True
-    >>> t1 = ibis.memtable({"a": [1, 2]})
-    >>> t1
+    >>> t1 = ibis.memtable({"a": [7, 8, 8, 9, 9, 9]})
+    >>> t2 = ibis.memtable({"a": [8, 9]})
+
+    With `distinct=True`, if a row ever appears in any of `*rest`,
+    it will not appear in the result.
+    So here, all appearances of 8 and 9 are removed:
+
+    >>> t1.difference(t2)
     ┏━━━━━━━┓
     ┃ a     ┃
     ┡━━━━━━━┩
     │ int64 │
     ├───────┤
-    │     1 │
-    │     2 │
+    │     7 │
     └───────┘
-    >>> t2 = ibis.memtable({"a": [2, 3]})
-    >>> t2
+
+    With `distinct=False`, the algorithm is more of a multiset/bag difference.
+    This means that, since 8 and 9 each appear once in `t2`,
+    the result will be the input with a single instance of each removed:
+
+    >>> t1.difference(t2, distinct=False).order_by("a")
     ┏━━━━━━━┓
     ┃ a     ┃
     ┡━━━━━━━┩
     │ int64 │
     ├───────┤
-    │     2 │
-    │     3 │
+    │     7 │
+    │     8 │
+    │     9 │
+    │     9 │
     └───────┘
-    >>> ibis.difference(t1, t2)
+
+    With multiple tables in `*rest`, we apply the operation consecutively.
+    Here, we remove two 8s and two 9s:
+
+    >>> t1.difference(t2, t2, distinct=False).order_by("a")
     ┏━━━━━━━┓
     ┃ a     ┃
     ┡━━━━━━━┩
     │ int64 │
     ├───────┤
-    │     1 │
+    │     7 │
+    │     9 │
     └───────┘
-
     """
     return table.difference(*rest, distinct=distinct) if rest else table
 

diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py
@@ -1778,12 +1778,12 @@ def difference(self, table: Table, *rest: Table, distinct: bool = True) -> Table
 
         Parameters
         ----------
-        table:
+        table
             A table expression
-        *rest:
+        *rest
             Additional table expressions
         distinct
-            Only diff distinct rows not occurring in the calling table
+            Use set difference (True) or multiset difference (False). See examples.
 
         See Also
         --------
@@ -1798,33 +1798,49 @@ def difference(self, table: Table, *rest: Table, distinct: bool = True) -> Table
         --------
         >>> import ibis
         >>> ibis.options.interactive = True
-        >>> t1 = ibis.memtable({"a": [1, 2]})
-        >>> t1
+        >>> t1 = ibis.memtable({"a": [7, 8, 8, 9, 9, 9]})
+        >>> t2 = ibis.memtable({"a": [8, 9]})
+
+        With `distinct=True`, if a row ever appears in any of `*rest`,
+        it will not appear in the result.
+        So here, all appearances of 8 and 9 are removed:
+
+        >>> t1.difference(t2)
         ┏━━━━━━━┓
         ┃ a     ┃
         ┡━━━━━━━┩
         │ int64 │
         ├───────┤
-        │     1 │
-        │     2 │
+        │     7 │
         └───────┘
-        >>> t2 = ibis.memtable({"a": [2, 3]})
-        >>> t2
+
+        With `distinct=False`, the algorithm is more of a multiset/bag difference.
+        This means that, since 8 and 9 each appear once in `t2`,
+        the result will be the input with a single instance of each removed:
+
+        >>> t1.difference(t2, distinct=False).order_by("a")
         ┏━━━━━━━┓
         ┃ a     ┃
         ┡━━━━━━━┩
         │ int64 │
         ├───────┤
-        │     2 │
-        │     3 │
+        │     7 │
+        │     8 │
+        │     9 │
+        │     9 │
         └───────┘
-        >>> t1.difference(t2)
+
+        With multiple tables in `*rest`, we apply the operation consecutively.
+        Here, we remove two 8s and two 9s:
+
+        >>> t1.difference(t2, t2, distinct=False).order_by("a")
         ┏━━━━━━━┓
         ┃ a     ┃
         ┡━━━━━━━┩
         │ int64 │
         ├───────┤
-        │     1 │
+        │     7 │
+        │     9 │
         └───────┘
         """
         node = ops.Difference(self, table, distinct=distinct)