Commit

fix: gaps in semantic chunking
guenthermi committed Sep 23, 2024
1 parent 9e23fd6 commit 2351110
Showing 2 changed files with 29 additions and 7 deletions.
8 changes: 4 additions & 4 deletions chunked_pooling/chunking.py
@@ -31,6 +31,7 @@ def _setup_semantic_chunking(self, embedding_model_name):
 self.embed_model = HuggingFaceEmbedding(
     model_name=self.embedding_model_name,
     trust_remote_code=True,
+    embed_batch_size=1,
 )
 self.splitter = SemanticSplitterNodeParser(
     embed_model=self.embed_model,
@@ -71,13 +72,12 @@ def chunk_semantically(
 start_chunk_index = bisect.bisect_left(
     [offset[0] for offset in token_offsets], char_start
 )
-end_chunk_index = (
-    bisect.bisect_right([offset[1] for offset in token_offsets], char_end)
-    - 1
+end_chunk_index = bisect.bisect_right(
+    [offset[1] for offset in token_offsets], char_end
 )

 # Add the chunk span if it's within the tokenized text
-if start_chunk_index < len(token_offsets) and end_chunk_index < len(
+if start_chunk_index < len(token_offsets) and end_chunk_index <= len(
     token_offsets
 ):
     chunk_spans.append((start_chunk_index, end_chunk_index))
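For readers skimming the diff, the following is a minimal, self-contained sketch (not part of the commit; the token offsets and character spans are invented) of what the two changes above fix: dropping the "- 1" makes end_chunk_index an exclusive bound, so consecutive chunk spans tile the token sequence without gaps, and the relaxed "<=" check no longer discards a chunk whose exclusive end equals the number of tokens.

# Illustrative sketch only -- not part of the repository; offsets are invented.
import bisect

# (char_start, char_end) for each token of a hypothetical tokenized text
token_offsets = [(0, 5), (6, 11), (11, 12), (13, 18), (19, 24), (24, 25)]
# two adjacent semantic chunks in character space, e.g. two sentences
char_spans = [(0, 12), (13, 25)]

chunk_spans = []
for char_start, char_end in char_spans:
    # first token whose start offset is >= char_start
    start_chunk_index = bisect.bisect_left(
        [offset[0] for offset in token_offsets], char_start
    )
    # one past the last token whose end offset is <= char_end (exclusive bound)
    end_chunk_index = bisect.bisect_right(
        [offset[1] for offset in token_offsets], char_end
    )
    if start_chunk_index < len(token_offsets) and end_chunk_index <= len(
        token_offsets
    ):
        chunk_spans.append((start_chunk_index, end_chunk_index))

print(chunk_spans)  # [(0, 3), (3, 6)]: half-open spans, contiguous, no token skipped
# with the old "bisect_right(...) - 1" the spans were (0, 2) and (3, 5),
# so token 2 and token 5 fell into a gap between chunks
assert all(
    chunk_spans[i][1] == chunk_spans[i + 1][0] for i in range(len(chunk_spans) - 1)
)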
28 changes: 25 additions & 3 deletions tests/test_chunking_methods.py
@@ -100,14 +100,36 @@ def test_chunk_by_tokens():

 def test_chunk_semantically():
     chunker = Chunker(chunking_strategy="semantic")
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-    chunks = chunker.chunk(
+    tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-small-en')
+    tokens = tokenizer.encode_plus(
+        EXAMPLE_TEXT_1, add_special_tokens=False, return_offsets_mapping=True
+    )
+    boundary_cues = chunker.chunk(
         EXAMPLE_TEXT_1,
         tokenizer=tokenizer,
         chunking_strategy='semantic',
         embedding_model_name='jinaai/jina-embeddings-v2-small-en',
     )
-    assert len(chunks) > 0
+
+    # check if it returns boundary cues
+    assert len(boundary_cues) > 0
+
+    # test if boundaries are at the end of sentences
+    for start_token_idx, end_token_idx in boundary_cues:
+        assert (
+            EXAMPLE_TEXT_1[tokens.offset_mapping[end_token_idx - 1][0]] in PUNCTATIONS
+        )
+        decoded_text_chunk = tokenizer.decode(
+            tokens.input_ids[start_token_idx:end_token_idx]
+        )
+
+    # check that the boundary cues are continuous (no token is missing)
+    assert all(
+        [
+            boundary_cues[i][1] == boundary_cues[i + 1][0]
+            for i in range(len(boundary_cues) - 1)
+        ]
+    )


 def test_empty_input():
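As a usage note (not from the commit), the boundary cues exercised by the new test can be mapped back to text spans through the tokenizer's offset mapping. The sketch below assumes Chunker is importable from chunked_pooling.chunking and uses a made-up example text; the call pattern otherwise mirrors the updated test.

# Hedged usage sketch, not part of the commit; the import path and example
# text are assumptions, the Chunker call mirrors the updated test above.
from transformers import AutoTokenizer

from chunked_pooling.chunking import Chunker  # assumed import path

text = "Semantic chunking splits text at topic shifts. Each chunk ends at a sentence boundary."
chunker = Chunker(chunking_strategy='semantic')
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-small-en')
tokens = tokenizer.encode_plus(
    text, add_special_tokens=False, return_offsets_mapping=True
)
boundary_cues = chunker.chunk(
    text,
    tokenizer=tokenizer,
    chunking_strategy='semantic',
    embedding_model_name='jinaai/jina-embeddings-v2-small-en',
)
# map each half-open (start_token, end_token) span back to a character span
text_chunks = [
    text[tokens.offset_mapping[start][0] : tokens.offset_mapping[end - 1][1]]
    for start, end in boundary_cues
]
print(text_chunks)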
