From 8fd8d2b9f1ef32bcc096a4ddd986a2233d073baf Mon Sep 17 00:00:00 2001 From: Baptiste Roziere Date: Fri, 20 Dec 2024 11:08:50 +0000 Subject: [PATCH] quick fix middle token --- src/mistral_common/tokens/tokenizers/sentencepiece.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mistral_common/tokens/tokenizers/sentencepiece.py b/src/mistral_common/tokens/tokenizers/sentencepiece.py index 6b92411..4dd0436 100644 --- a/src/mistral_common/tokens/tokenizers/sentencepiece.py +++ b/src/mistral_common/tokens/tokenizers/sentencepiece.py @@ -332,6 +332,7 @@ def __init__(self, tokenizer: Tokenizer, mm_encoder: Optional[MultiModalEncoder] self.BOS = self.tokenizer.get_control_token(SpecialTokens.bos.value) self.PREFIX = self.tokenizer.get_control_token(SpecialTokens.prefix.value) self.SUFFIX = self.tokenizer.get_control_token(SpecialTokens.suffix.value) + self.MIDDLE = self.tokenizer.get_control_token(SpecialTokens.middle.value) def encode_user_message( self, @@ -453,6 +454,7 @@ def encode_fim(self, request: FIMRequest) -> Tokenized: *suffix_tokens, self.PREFIX, *prefix_tokens, + self.MIDDLE, ] return Tokenized(tokens=tokens, text=self.tokenizer.to_string(tokens))