Fix split on special tokens & bump version
n1t0 committed Jan 12, 2020
1 parent 32d3955 commit fc9e81d
Showing 6 changed files with 32 additions and 14 deletions.
6 changes: 3 additions & 3 deletions bindings/python/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion bindings/python/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "tokenizers-python"
version = "0.1.0"
version = "0.1.1"
authors = ["Anthony MOI <[email protected]>"]
edition = "2018"

2 changes: 1 addition & 1 deletion bindings/python/setup.py
@@ -3,7 +3,7 @@

setup(
name="tokenizers",
version="0.1.0",
version="0.1.1",
description="Fast and Customizable Tokenizers",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
2 changes: 1 addition & 1 deletion bindings/python/tokenizers/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.1.0"
__version__ = "0.1.1"

from .tokenizers import Tokenizer, Encoding
from .tokenizers import decoders
2 changes: 1 addition & 1 deletion tokenizers/Cargo.toml
@@ -2,7 +2,7 @@
authors = ["Anthony MOI <[email protected]>"]
edition = "2018"
name = "tokenizers"
version = "0.6.0"
version = "0.6.1"
homepage = "https://github.com/huggingface/tokenizers"
repository = "https://github.com/huggingface/tokenizers"
documentation = "https://docs.rs/tokenizers/"
32 changes: 25 additions & 7 deletions tokenizers/src/tokenizer/mod.rs
@@ -565,6 +565,8 @@ impl Tokenizer {
            }
        }

+        self.refresh_added_tokens();
+
        added
    }

@@ -591,11 +593,27 @@ impl Tokenizer {
                .or_insert_with(|| token.clone());
        }

+        self.refresh_added_tokens();
+
+        // Return the number of added tokens
+        tokens.len() - ignored
+    }
+
+    fn refresh_added_tokens(&mut self) {
        // We rebuild the regex here everytime on purpose, because the added tokens may
        // have changed
+        let special_tokens = self
+            .special_tokens
+            .keys()
+            .map(|t| AddedToken {
+                content: t.to_owned(),
+                single_word: true,
+            })
+            .collect::<Vec<_>>();
        let added_tokens = self
            .added_tokens
            .keys()
+            .chain(special_tokens.iter())
            .map(|token| {
                if token.single_word {
                    let first_b = token
@@ -635,9 +653,6 @@
            self.split_re =
                Some(regex::Regex::new(&format!(r"({})", added_tokens.join("|"))).unwrap());
        }
-
-        // Return the number of added tokens
-        tokens.len() - ignored
    }

    /// Split the given sentence on multiple parts, finding the added tokens and their id in the process
@@ -677,10 +692,13 @@
            .into_iter()
            .map(|(start, end)| unsafe {
                let s = sentence.get_unchecked(start..end).to_owned();
-                let id = self.added_tokens.get(&AddedToken {
-                    content: s.clone(),
-                    ..Default::default()
-                });
+                let mut id = self.special_tokens.get(&s);
+                if id.is_none() {
+                    id = self.added_tokens.get(&AddedToken {
+                        content: s.clone(),
+                        ..Default::default()
+                    });
+                }
                (s, id.copied())
            })
            .collect()
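For context, the sketch below illustrates the technique this change implements, outside of the crate's real types: special tokens and added tokens feed a single split regex that is rebuilt whenever either set changes, and id lookup during the split consults the special-token map before falling back to the added-token map. It is a minimal, self-contained sketch, not the library's API: the SplitterSketch type, its methods, and the u32 ids are hypothetical, only the regex crate usage mirrors the diff, and the single_word boundary handling from the real code is omitted for brevity.

// Minimal sketch of the technique in this commit (not the crate's actual API).
// Assumes the `regex` crate as a dependency, as in the diff above.
use regex::Regex;
use std::collections::HashMap;

struct SplitterSketch {
    special_tokens: HashMap<String, u32>,
    added_tokens: HashMap<String, u32>,
    split_re: Option<Regex>,
}

impl SplitterSketch {
    fn new() -> Self {
        SplitterSketch {
            special_tokens: HashMap::new(),
            added_tokens: HashMap::new(),
            split_re: None,
        }
    }

    // Mirror of `refresh_added_tokens`: rebuild the regex every time because
    // the set of known tokens may have changed since the last call.
    fn refresh(&mut self) {
        let patterns = self
            .special_tokens
            .keys()
            .chain(self.added_tokens.keys())
            .map(|t| regex::escape(t))
            .collect::<Vec<_>>();
        self.split_re = if patterns.is_empty() {
            None
        } else {
            Some(Regex::new(&format!(r"({})", patterns.join("|"))).unwrap())
        };
    }

    fn add_special_token(&mut self, content: &str, id: u32) {
        self.special_tokens.insert(content.to_owned(), id);
        self.refresh();
    }

    fn add_token(&mut self, content: &str, id: u32) {
        self.added_tokens.insert(content.to_owned(), id);
        self.refresh();
    }

    // Split `sentence` on the known tokens, resolving ids with the
    // special-token map first and the added-token map as a fallback.
    fn split(&self, sentence: &str) -> Vec<(String, Option<u32>)> {
        let re = match &self.split_re {
            Some(re) => re,
            None => return vec![(sentence.to_owned(), None)],
        };
        let mut parts = Vec::new();
        let mut last = 0;
        for m in re.find_iter(sentence) {
            if m.start() > last {
                parts.push((sentence[last..m.start()].to_owned(), None));
            }
            let s = m.as_str().to_owned();
            let id = self
                .special_tokens
                .get(&s)
                .or_else(|| self.added_tokens.get(&s))
                .copied();
            parts.push((s, id));
            last = m.end();
        }
        if last < sentence.len() {
            parts.push((sentence[last..].to_owned(), None));
        }
        parts
    }
}

fn main() {
    let mut splitter = SplitterSketch::new();
    splitter.add_token("hello", 5);
    splitter.add_special_token("[MASK]", 0);
    // With the regex refreshed after every addition, the special token is
    // isolated as its own part and resolved to its id.
    println!("{:?}", splitter.split("hello [MASK] world"));
}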
