implementation of kmp_censor_stream

louisabraham · Sep 25, 2024 · 309b97c · 309b97c
1 parent d8610fc
commit 309b97c
Show file tree

Hide file tree

Showing 5 changed files with 91 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -49,7 +49,7 @@ A nicer interface to reuse computations lazily is provided in WonderString but c
 - `longest_previous_factor(string, suffix_array=None, lcp=None)`: longest previous factor array (used in the Lempel-Ziv factorization)
 - `lempel_ziv_factorization(lpf, complexity: bool = False)`: Lempel-Ziv factorization
 - `lempel_ziv_complexity(string, suffix_array=None, lcp=None)`: Lempel-Ziv complexity
-
+- `kmp_censor_stream(censor, stream)`: censor a stream (e.g. a generator of strings) using the KMP algorithm
 
 ### Example usage
 

diff --git a/pydivsufsort/__init__.py b/pydivsufsort/__init__.py
@@ -1,18 +1,18 @@
-from .divsufsort import divsufsort, bw_transform, inverse_bw_transform, sa_search
+from .divsufsort import bw_transform, divsufsort, inverse_bw_transform, sa_search
 from .stringalg import (
     kasai,
-    lcp_segtree,
+    kmp_censor_stream,
     lcp_query,
+    lcp_segtree,
+    lempel_ziv_complexity,
+    lempel_ziv_factorization,
     levenshtein,
-    most_frequent_substrings,
-    min_rotation,
     longest_previous_factor,
-    lempel_ziv_factorization,
-    lempel_ziv_complexity,
+    min_rotation,
+    most_frequent_substrings,
 )
 from .wonderstring import WonderString, common_substrings
 
-
 __all__ = [
     "divsufsort",
     "bw_transform",
@@ -29,4 +29,5 @@
     "longest_previous_factor",
     "lempel_ziv_factorization",
     "lempel_ziv_complexity",
+    "kmp_censor_stream",
 ]
diff --git a/pydivsufsort/stringalg.pyx b/pydivsufsort/stringalg.pyx
@@ -510,4 +510,58 @@ def lempel_ziv_complexity(s, sa=None, lcp=None):
     if isinstance(s, bytes):
         # tofix
         s = bytearray(s)
-    return _lempel_ziv_complexity(s, sa, lcp)
+    return _lempel_ziv_complexity(s, sa, lcp)
+
+
+cdef vector[ull] _prefix_function(const unsigned char[::1] s):
+    """
+    Compute the KMP prefix function (failure table) of `s`.
+
+    Entry `k` of the returned vector is the length of the longest proper
+    prefix of `s[:k + 1]` that is also a suffix of it. The vector has one
+    entry per byte of `s`.
+    """
+    cdef ull size = len(s)
+    cdef vector[ull] table = vector[ull](size)
+    cdef ull pos
+    cdef ull border = 0
+    for pos in range(1, size):
+        # Fall back through shorter borders until one can be extended by
+        # s[pos], or until no border is left.
+        while border > 0 and s[pos] != s[border]:
+            border = table[border - 1]
+        if s[pos] == s[border]:
+            border += 1
+        table[pos] = border
+    return table
+
+from libcpp.deque cimport deque
+
+def kmp_censor_stream(censor, stream):
+    """
+    Uses KMP algorithm to censor text from a stream of str.
+
+    Every occurrence of `censor` is removed from the text formed by the
+    successive items of `stream`. Matching is done on the UTF-8 bytes and
+    the match state is kept between items, so an occurrence split across
+    several items is removed as well. Overlapping occurrences are all
+    removed.
+
+    censor: str to remove. An empty `censor` removes nothing and the
+        non-empty items of `stream` are yielded unchanged.
+    stream: iterable of str (a plain str is processed one character at a
+        time).
+
+    Yields str pieces of the censored text: at most one per item of
+    `stream`, then one last piece for the bytes still held back when the
+    stream ends. Empty pieces are never yielded.
+    """
+    censor = censor.encode("utf-8")
+    cdef vector[ull] pi
+    cdef deque[unsigned char] buffer
+    cdef bytearray out
+    cdef ull n = len(censor)
+    cdef ull j = 0
+    cdef unsigned char c
+    cdef bytes bytes_s
+
+    if n == 0:
+        # Nothing to censor. Without this guard, `censor[j]` below raises
+        # IndexError on the first byte of the stream.
+        for s in stream:
+            if s:
+                yield s
+        return
+
+    pi = _prefix_function(censor)
+
+    for s in stream:
+        bytes_s = s.encode("utf-8")
+        out = bytearray()
+
+        for c in bytes_s:
+            buffer.push_back(c)
+            # Standard KMP step: j is the length of the longest prefix of
+            # `censor` that is a suffix of the bytes read so far.
+            while j > 0 and c != censor[j]:
+                j = pi[j - 1]
+            j += c == censor[j]
+
+            if j == n:
+                # Full match: drop everything held back, but keep the
+                # border so that overlapping occurrences are found too.
+                buffer.clear()
+                j = pi[j - 1]
+
+            # Only the last j bytes can still belong to a future match;
+            # anything older in the buffer is safe to emit. The count is
+            # negative (empty range) when part of the current partial
+            # match was already dropped by a previous full match.
+            for _ in range(int(buffer.size()) - j):
+                out.append(buffer.front())
+                buffer.pop_front()
+
+        if out:
+            yield out.decode("utf-8")
+
+    # End of stream: the bytes held back never completed a match.
+    out = bytearray()
+    while not buffer.empty():
+        out.append(buffer.front())
+        buffer.pop_front()
+    if out:
+        yield out.decode("utf-8")
diff --git a/setup.py b/setup.py
@@ -84,7 +84,7 @@ def read(fname):
 
 setup(
     name="pydivsufsort",
-    version="0.0.17",
+    version="0.0.18",
     author="Louis Abraham",
     license="MIT",
     author_email="[email protected]",

diff --git a/tests/test_correct.py b/tests/test_correct.py
@@ -3,34 +3,29 @@
 
 import numpy as np
 import pytest
+from reference import BWT, all_common_substrings, iBWT, longest_common_prefix
+from reference import min_rotation as min_rotation_ref
+from reference import suffix_array
 
 from pydivsufsort import (
+    bw_transform,
+    common_substrings,
     divsufsort,
+    inverse_bw_transform,
     kasai,
-    lcp_segtree,
+    kmp_censor_stream,
     lcp_query,
-    bw_transform,
-    inverse_bw_transform,
-    sa_search,
+    lcp_segtree,
+    lempel_ziv_complexity,
+    lempel_ziv_factorization,
     levenshtein,
-    most_frequent_substrings,
-    common_substrings,
-    min_rotation,
     longest_previous_factor,
-    lempel_ziv_factorization,
-    lempel_ziv_complexity,
+    min_rotation,
+    most_frequent_substrings,
+    sa_search,
 )
 from pydivsufsort.divsufsort import _SUPPORTED_DTYPES, _minimize_dtype
 
-from reference import (
-    suffix_array,
-    longest_common_prefix,
-    BWT,
-    iBWT,
-    all_common_substrings,
-    min_rotation as min_rotation_ref,
-)
-
 
 def cast_to_array(inp):
     out = array.array("B")
@@ -260,3 +255,16 @@ def test_lz():
     assert lempel_ziv_complexity("") == 0
     assert lempel_ziv_complexity("0001") == 2
     assert lempel_ziv_complexity("010") == 3
+
+
+def test_kmp_censor_stream():
+    assert list(kmp_censor_stream("an", "banana")) == ["b", "a"]
+    assert list(kmp_censor_stream("an", ["ba", "na", "na"])) == ["b", "a"]
+    assert list(kmp_censor_stream("nan", "banana")) == ["b", "a", "a"]
+    assert list(kmp_censor_stream("nan", "bananana")) == ["b", "a", "a"]
+
+    out = list(kmp_censor_stream("a", ["ba", "naba", "na"]))
+    assert len(out) == 3
+    assert "".join(out) == "bnbn"
+
+    out = list(kmp_censor_stream("a", "bonne journée"))