Skip to content

Commit

Permalink
implementation of kmp_censor_stream
Browse files Browse the repository at this point in the history
  • Loading branch information
louisabraham committed Sep 25, 2024
1 parent d8610fc commit 309b97c
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 28 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ A nicer interface to reuse computations lazily is provided in WonderString but c
- `longest_previous_factor(string, suffix_array=None, lcp=None)`: longest previous factor array (used in the Lempel-Ziv factorization)
- `lempel_ziv_factorization(lpf, complexity: bool = False)`: Lempel-Ziv factorization
- `lempel_ziv_complexity(string, suffix_array=None, lcp=None)`: Lempel-Ziv complexity

- `kmp_censor_stream(censor, stream)`: Censor a stream (e.g. a generator of strings) using the KMP algorithm

### Example usage

Expand Down
15 changes: 8 additions & 7 deletions pydivsufsort/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from .divsufsort import divsufsort, bw_transform, inverse_bw_transform, sa_search
from .divsufsort import bw_transform, divsufsort, inverse_bw_transform, sa_search
from .stringalg import (
kasai,
lcp_segtree,
kmp_censor_stream,
lcp_query,
lcp_segtree,
lempel_ziv_complexity,
lempel_ziv_factorization,
levenshtein,
most_frequent_substrings,
min_rotation,
longest_previous_factor,
lempel_ziv_factorization,
lempel_ziv_complexity,
min_rotation,
most_frequent_substrings,
)
from .wonderstring import WonderString, common_substrings


__all__ = [
"divsufsort",
"bw_transform",
Expand All @@ -29,4 +29,5 @@
"longest_previous_factor",
"lempel_ziv_factorization",
"lempel_ziv_complexity",
"kmp_censor_stream",
]
56 changes: 55 additions & 1 deletion pydivsufsort/stringalg.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -510,4 +510,58 @@ def lempel_ziv_complexity(s, sa=None, lcp=None):
if isinstance(s, bytes):
# tofix
s = bytearray(s)
return _lempel_ziv_complexity(s, sa, lcp)
return _lempel_ziv_complexity(s, sa, lcp)


cdef vector[ull] _prefix_function(const unsigned char[::1] s):
    """
    Compute the KMP prefix function (failure table) of ``s``.

    Entry ``k`` of the returned vector is the length of the longest proper
    prefix of ``s[:k + 1]`` that is also a suffix of it. Entry 0 is always 0.
    """
    cdef ull length = len(s)
    cdef vector[ull] border = vector[ull](length)
    cdef ull pos
    cdef ull matched = 0
    for pos in range(1, length):
        # Shrink the current border until it can be extended by s[pos]
        # (or until there is no border left to shrink).
        while matched > 0 and s[pos] != s[matched]:
            matched = border[matched - 1]
        if s[pos] == s[matched]:
            matched += 1
        border[pos] = matched
    return border

from libcpp.deque cimport deque

def kmp_censor_stream(censor, stream):
    """
    Uses KMP algorithm to censor text from a stream of str.

    Every occurrence of ``censor`` is removed from the text formed by the
    chunks of ``stream``. Matching is done on the UTF-8 bytes, and the KMP
    state is kept between chunks, so occurrences that straddle two chunks
    are removed too. Overlapping occurrences are all removed.

    Parameters:
        censor: the str to remove. If it is empty there is nothing to
            censor and the non-empty chunks are yielded unchanged.
        stream: an iterable of str chunks (for example a generator of
            strings). A plain str is iterated character by character.

    Yields:
        str: for each input chunk, the text that is known not to belong to
        an occurrence of ``censor`` (nothing is yielded when that text is
        empty). After the stream is exhausted, the text still held back as
        a possible start of an occurrence is yielded, if any.
    """
    cdef vector[ull] pi
    cdef deque[unsigned char] buffer
    cdef bytearray out
    cdef ull j = 0
    cdef ull m
    cdef unsigned char c
    cdef bytes bytes_s

    pattern = censor.encode("utf-8")
    m = len(pattern)

    if m == 0:
        # An empty pattern would make pattern[j] raise IndexError on the
        # first byte; there is nothing to censor, so pass the text through.
        for s in stream:
            if s:
                yield s
        return

    pi = _prefix_function(pattern)

    for s in stream:
        bytes_s = s.encode("utf-8")
        out = bytearray()

        for c in bytes_s:
            buffer.push_back(c)

            # Standard KMP step: j becomes the length of the longest prefix
            # of the pattern that is a suffix of the bytes read so far.
            while j > 0 and c != pattern[j]:
                j = pi[j - 1]
            if c == pattern[j]:
                j += 1

            if j == m:
                # Full occurrence found. The flush below keeps the buffer
                # no longer than the current partial match, so the buffer
                # holds only bytes of this occurrence: drop them all.
                buffer.clear()
                j = pi[j - 1]

            # Only the last j bytes can still be the start of an occurrence;
            # everything in front of them is safe to output.
            while buffer.size() > j:
                out.append(buffer.front())
                buffer.pop_front()

        if out:
            yield out.decode("utf-8")

    # End of stream: the pending partial match never completed, emit it.
    out = bytearray()
    while not buffer.empty():
        out.append(buffer.front())
        buffer.pop_front()
    if out:
        yield out.decode("utf-8")
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def read(fname):

setup(
name="pydivsufsort",
version="0.0.17",
version="0.0.18",
author="Louis Abraham",
license="MIT",
author_email="[email protected]",
Expand Down
44 changes: 26 additions & 18 deletions tests/test_correct.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,29 @@

import numpy as np
import pytest
from reference import BWT, all_common_substrings, iBWT, longest_common_prefix
from reference import min_rotation as min_rotation_ref
from reference import suffix_array

from pydivsufsort import (
bw_transform,
common_substrings,
divsufsort,
inverse_bw_transform,
kasai,
lcp_segtree,
kmp_censor_stream,
lcp_query,
bw_transform,
inverse_bw_transform,
sa_search,
lcp_segtree,
lempel_ziv_complexity,
lempel_ziv_factorization,
levenshtein,
most_frequent_substrings,
common_substrings,
min_rotation,
longest_previous_factor,
lempel_ziv_factorization,
lempel_ziv_complexity,
min_rotation,
most_frequent_substrings,
sa_search,
)
from pydivsufsort.divsufsort import _SUPPORTED_DTYPES, _minimize_dtype

from reference import (
suffix_array,
longest_common_prefix,
BWT,
iBWT,
all_common_substrings,
min_rotation as min_rotation_ref,
)


def cast_to_array(inp):
out = array.array("B")
Expand Down Expand Up @@ -260,3 +255,16 @@ def test_lz():
assert lempel_ziv_complexity("") == 0
assert lempel_ziv_complexity("0001") == 2
assert lempel_ziv_complexity("010") == 3


def test_kmp_censor_stream():
    """Check kmp_censor_stream on str and list-of-str streams."""
    # A plain str is a stream of one-character chunks.
    assert list(kmp_censor_stream("an", "banana")) == ["b", "a"]
    # Occurrences that straddle chunk boundaries are removed as well.
    assert list(kmp_censor_stream("an", ["ba", "na", "na"])) == ["b", "a"]
    assert list(kmp_censor_stream("nan", "banana")) == ["b", "a", "a"]
    # Overlapping occurrences are all removed.
    assert list(kmp_censor_stream("nan", "bananana")) == ["b", "a", "a"]

    out = list(kmp_censor_stream("a", ["ba", "naba", "na"]))
    assert len(out) == 3
    assert "".join(out) == "bnbn"

    # Non-ASCII text without any occurrence must come out unchanged.
    out = list(kmp_censor_stream("a", "bonne journée"))
    assert "".join(out) == "bonne journée"

0 comments on commit 309b97c

Please sign in to comment.