Commit

fix: gaps in semantic chunking
guenthermi committed Sep 23, 2024
1 parent 9e23fd6 commit 2351110
Showing 2 changed files with 29 additions and 7 deletions.
8 changes: 4 additions & 4 deletions chunked_pooling/chunking.py
@@ -31,6 +31,7 @@ def _setup_semantic_chunking(self, embedding_model_name):
 self.embed_model = HuggingFaceEmbedding(
     model_name=self.embedding_model_name,
     trust_remote_code=True,
+    embed_batch_size=1,
 )
 self.splitter = SemanticSplitterNodeParser(
     embed_model=self.embed_model,
@@ -71,13 +72,12 @@ def chunk_semantically(
 start_chunk_index = bisect.bisect_left(
     [offset[0] for offset in token_offsets], char_start
 )
-end_chunk_index = (
-    bisect.bisect_right([offset[1] for offset in token_offsets], char_end)
-    - 1
+end_chunk_index = bisect.bisect_right(
+    [offset[1] for offset in token_offsets], char_end
 )

 # Add the chunk span if it's within the tokenized text
-if start_chunk_index < len(token_offsets) and end_chunk_index < len(
+if start_chunk_index < len(token_offsets) and end_chunk_index <= len(
     token_offsets
 ):
     chunk_spans.append((start_chunk_index, end_chunk_index))
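For readers skimming the diff, the following is a minimal, self-contained sketch (not part of the commit; the token offsets and character spans are invented) of what the two changes above fix: dropping the "- 1" makes end_chunk_index an exclusive bound, so consecutive chunk spans tile the token sequence without gaps, and the relaxed "<=" check no longer discards a chunk whose exclusive end equals the number of tokens.

# Illustrative sketch only -- not part of the repository; offsets are invented.
import bisect

# (char_start, char_end) for each token of a hypothetical tokenized text
token_offsets = [(0, 5), (6, 11), (11, 12), (13, 18), (19, 24), (24, 25)]
# two adjacent semantic chunks in character space, e.g. two sentences
char_spans = [(0, 12), (13, 25)]

chunk_spans = []
for char_start, char_end in char_spans:
    # first token whose start offset is >= char_start
    start_chunk_index = bisect.bisect_left(
        [offset[0] for offset in token_offsets], char_start
    )
    # one past the last token whose end offset is <= char_end (exclusive bound)
    end_chunk_index = bisect.bisect_right(
        [offset[1] for offset in token_offsets], char_end
    )
    if start_chunk_index < len(token_offsets) and end_chunk_index <= len(
        token_offsets
    ):
        chunk_spans.append((start_chunk_index, end_chunk_index))

print(chunk_spans)  # [(0, 3), (3, 6)]: half-open spans, contiguous, no token skipped
# with the old "bisect_right(...) - 1" the spans were (0, 2) and (3, 5),
# so token 2 and token 5 fell into a gap between chunks
assert all(
    chunk_spans[i][1] == chunk_spans[i + 1][0] for i in range(len(chunk_spans) - 1)
)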
28 changes: 25 additions & 3 deletions tests/test_chunking_methods.py
@@ -100,14 +100,36 @@ def test_chunk_by_tokens():

 def test_chunk_semantically():
     chunker = Chunker(chunking_strategy="semantic")
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-    chunks = chunker.chunk(
+    tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-small-en')
+    tokens = tokenizer.encode_plus(
+        EXAMPLE_TEXT_1, add_special_tokens=False, return_offsets_mapping=True
+    )
+    boundary_cues = chunker.chunk(
         EXAMPLE_TEXT_1,
         tokenizer=tokenizer,
         chunking_strategy='semantic',
         embedding_model_name='jinaai/jina-embeddings-v2-small-en',
     )
-    assert len(chunks) > 0
+
+    # check if it returns boundary cues
+    assert len(boundary_cues) > 0
+
+    # test if boundaries are at the end of sentences
+    for start_token_idx, end_token_idx in boundary_cues:
+        assert (
+            EXAMPLE_TEXT_1[tokens.offset_mapping[end_token_idx - 1][0]] in PUNCTATIONS
+        )
+        decoded_text_chunk = tokenizer.decode(
+            tokens.input_ids[start_token_idx:end_token_idx]
+        )
+
+    # check that the boundary cues are continuous (no token is missing)
+    assert all(
+        [
+            boundary_cues[i][1] == boundary_cues[i + 1][0]
+            for i in range(len(boundary_cues) - 1)
+        ]
+    )


 def test_empty_input():
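As a usage note (not from the commit), the boundary cues exercised by the new test can be mapped back to text spans through the tokenizer's offset mapping. The sketch below assumes Chunker is importable from chunked_pooling.chunking and uses a made-up example text; the call pattern otherwise mirrors the updated test.

# Hedged usage sketch, not part of the commit; the import path and example
# text are assumptions, the Chunker call mirrors the updated test above.
from transformers import AutoTokenizer

from chunked_pooling.chunking import Chunker  # assumed import path

text = "Semantic chunking splits text at topic shifts. Each chunk ends at a sentence boundary."
chunker = Chunker(chunking_strategy='semantic')
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-small-en')
tokens = tokenizer.encode_plus(
    text, add_special_tokens=False, return_offsets_mapping=True
)
boundary_cues = chunker.chunk(
    text,
    tokenizer=tokenizer,
    chunking_strategy='semantic',
    embedding_model_name='jinaai/jina-embeddings-v2-small-en',
)
# map each half-open (start_token, end_token) span back to a character span
text_chunks = [
    text[tokens.offset_mapping[start][0] : tokens.offset_mapping[end - 1][1]]
    for start, end in boundary_cues
]
print(text_chunks)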
