Merge branch 'main' into test-chunking
Showing 7 changed files with 238 additions and 39 deletions.
@@ -0,0 +1,101 @@
import torch
import torch.nn as nn
from transformers import AutoModel

from typing import List, Union, Optional


class JinaEmbeddingsV3Wrapper(nn.Module):
    def __init__(
        self, model_name, tasks=['retrieval.query', 'retrieval.passage'], **model_kwargs
    ):
        super().__init__()
        self._model = AutoModel.from_pretrained(
            model_name, trust_remote_code=True, **model_kwargs
        )
        self.tasks = tasks

    def encode_queries(
        self,
        sentences: Union[str, List[str]],
        *args,
        task: Optional[str] = None,
        **kwargs,
    ):
        # Queries are always encoded with the query-task adapter; a
        # caller-supplied `task` is ignored.
        return self._model.encode(sentences, *args, task=self.tasks[0], **kwargs)

    def encode_corpus(
        self,
        sentences: Union[str, List[str]],
        *args,
        **kwargs,
    ):
        # Corpus entries may be dicts with 'title'/'text'; flatten them first.
        _sentences = [self._construct_document(sentence) for sentence in sentences]
        return self._model.encode(_sentences, *args, task=self.tasks[1], **kwargs)

    def get_instructions(self):
        return [self._model._task_instructions[x] for x in self.tasks]

    def forward(self, *args, **kwargs):
        # Build a per-example adapter mask selecting the passage-task LoRA
        # adapter for every sequence in the batch.
        task_id = self._model._adaptation_map[self.tasks[1]]
        num_examples = kwargs['input_ids'].shape[0]
        adapter_mask = torch.full(
            (num_examples,), task_id, dtype=torch.int32, device=self._model.device
        )
        return self._model.forward(*args, adapter_mask=adapter_mask, **kwargs)

    def _construct_document(self, doc):
        if isinstance(doc, str):
            return doc
        elif 'title' in doc:
            return f'{doc["title"]} {doc["text"].strip()}'
        else:
            return doc['text'].strip()

    @property
    def device(self):
        return self._model.device

    @staticmethod
    def has_instructions():
        return True


MODEL_WRAPPERS = {'jinaai/jina-embeddings-v3': JinaEmbeddingsV3Wrapper}
MODELS_WITHOUT_PROMPT_NAME_ARG = [
    'jinaai/jina-embeddings-v2-small-en',
    'jinaai/jina-embeddings-v2-base-en',
    'jinaai/jina-embeddings-v3',
]


def remove_unsupported_kwargs(original_encode):
    def wrapper(self, *args, **kwargs):
        # Remove 'prompt_name' and 'request_qid' from kwargs if present
        kwargs.pop('prompt_name', None)
        kwargs.pop('request_qid', None)
        return original_encode(self, *args, **kwargs)

    return wrapper


def load_model(model_name, **model_kwargs):
    if model_name in MODEL_WRAPPERS:
        model = MODEL_WRAPPERS[model_name](model_name, **model_kwargs)
        has_instructions = MODEL_WRAPPERS[model_name].has_instructions()
    else:
        model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        has_instructions = False

    # The encode functions of some models do not support all
    # sentence-transformers kwargs, so strip the unsupported ones.
    if model_name in MODELS_WITHOUT_PROMPT_NAME_ARG:
        ENCODE_FUNC_NAMES = ['encode', 'encode_queries', 'encode_corpus']
        for func_name in ENCODE_FUNC_NAMES:
            if hasattr(model, func_name):
                setattr(
                    model,
                    func_name,
                    remove_unsupported_kwargs(getattr(model, func_name)),
                )

    return model, has_instructions
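
For orientation, a minimal usage sketch of the loader above (not part of the commit): it assumes load_model is importable from run_chunked_eval, as the test file below does, and that the model weights can be downloaded; the example texts are illustrative.

from run_chunked_eval import load_model

model, has_instructions = load_model('jinaai/jina-embeddings-v3')
# Queries use the 'retrieval.query' adapter; corpus entries use
# 'retrieval.passage', and dict documents are flattened to 'title text'.
query_embeddings = model.encode_queries(['what is late chunking?'])
corpus_embeddings = model.encode_corpus(
    [{'title': 'Late chunking', 'text': 'Pool token embeddings per chunk.'}]
)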
@@ -0,0 +1,48 @@
import pytest

from mteb.abstasks.TaskMetadata import TaskMetadata

from chunked_pooling.mteb_chunked_eval import AbsTaskChunkedRetrieval


class DummyTask(AbsTaskChunkedRetrieval):
    metadata = TaskMetadata(
        dataset={
            'path': '~',
            'revision': '',
        },
        name='dummy',
        description='',
        type='Retrieval',
        category='s2p',
        reference=None,
        eval_splits=[],
        eval_langs=[],
        main_score='ndcg_at_10',
        date=None,
        form=None,
        domains=None,
        task_subtypes=None,
        license=None,
        socioeconomic_status=None,
        annotations_creators=None,
        dialect=None,
        text_creation=None,
        bibtex_citation=None,
        n_samples=None,
        avg_character_length=None,
    )

    def load_data(self, **kwargs):
        # No dataset is needed for these unit tests.
        pass

    def __init__(self, **kwargs):
        super().__init__(**kwargs)


@pytest.fixture()
def dummy_task_factory():
    def _create_dummy_task(*args, **kwargs):
        return DummyTask(*args, **kwargs)

    return _create_dummy_task
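
A short sketch of how a test might consume this fixture (illustrative, not part of the commit; the keyword arguments mirror the instruction-handling test below, and the chunk_size value here is arbitrary):

def test_creates_dummy_task(dummy_task_factory):
    # pytest injects the fixture; all arguments are forwarded to DummyTask.
    task = dummy_task_factory(chunking_strategy='fixed', chunk_size=256)
    assert task.metadata.name == 'dummy'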
@@ -0,0 +1,22 @@
from transformers import AutoTokenizer

from run_chunked_eval import load_model, DEFAULT_CHUNK_SIZE

MODEL_NAME = 'jinaai/jina-embeddings-v3'


def test_instruction_handling(dummy_task_factory):
    model, has_instructions = load_model(MODEL_NAME)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    task = dummy_task_factory(
        chunking_strategy='fixed',
        chunk_size=DEFAULT_CHUNK_SIZE,
        tokenizer=tokenizer,
        model_has_instructions=has_instructions,
    )
    # Count the tokens of the passage-task instruction (index 1 = corpus task).
    n_instruction_tokens = len(
        tokenizer(model.get_instructions()[1], add_special_tokens=False)['input_ids']
    )
    # A single-token input should produce exactly one chunk annotation that
    # spans the instruction tokens, the special tokens, and the token itself.
    annotations_one_token = task._calculate_annotations(model, ['A'])[0]
    assert len(annotations_one_token) == 1
    assert annotations_one_token[0] == (0, n_instruction_tokens + 3)
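
The expected end offset n_instruction_tokens + 3 can be read as the instruction prefix plus the single text token plus the special tokens. A hedged sanity check of that arithmetic, assuming the tokenizer wraps the text in exactly two special tokens (e.g. CLS/BOS and SEP/EOS):

# Assumption: 'A' tokenizes to one token and two special tokens are added.
n_text_tokens = 1
n_special_tokens = 2
expected_end = n_instruction_tokens + n_text_tokens + n_special_tokens
assert expected_end == n_instruction_tokens + 3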