From b486189b0612074c553ccad5d2444b3115edeaac Mon Sep 17 00:00:00 2001 From: Michael Date: Mon, 23 Sep 2024 16:56:02 +0200 Subject: [PATCH] refactor: add second model to semantic chunking test --- tests/test_chunking_methods.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_chunking_methods.py b/tests/test_chunking_methods.py index ff21fc5..02c3e17 100644 --- a/tests/test_chunking_methods.py +++ b/tests/test_chunking_methods.py @@ -98,9 +98,13 @@ def test_chunk_by_tokens(): assert end - start <= 10 -def test_chunk_semantically(): +@pytest.mark.parametrize( + 'model_name', + ['jinaai/jina-embeddings-v2-small-en', 'sentence-transformers/all-MiniLM-L6-v2'], +) +def test_chunk_semantically(model_name): chunker = Chunker(chunking_strategy="semantic") - tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-small-en') + tokenizer = AutoTokenizer.from_pretrained(model_name) tokens = tokenizer.encode_plus( EXAMPLE_TEXT_1, add_special_tokens=False, return_offsets_mapping=True ) @@ -108,7 +112,7 @@ def test_chunk_semantically(): EXAMPLE_TEXT_1, tokenizer=tokenizer, chunking_strategy='semantic', - embedding_model_name='jinaai/jina-embeddings-v2-small-en', + embedding_model_name=model_name, ) # check if it returns boundary cues