Skip to content

Commit

Permalink
Use the whisper-v3 tokenizer now that it has been added. (#1337)
Browse files Browse the repository at this point in the history
* Use the whisper-v3 tokenizer now that it has been added.

* Use the appropriate nospeech token.
  • Loading branch information
LaurentMazare authored Nov 16, 2023
1 parent a1f41ab commit 9ab3f97
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 8 deletions.
14 changes: 8 additions & 6 deletions candle-examples/examples/whisper/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,13 @@ impl Decoder {
let transcribe_token = token_id(&tokenizer, m::TRANSCRIBE_TOKEN)?;
let translate_token = token_id(&tokenizer, m::TRANSLATE_TOKEN)?;
let eot_token = token_id(&tokenizer, m::EOT_TOKEN)?;
let no_speech_token = token_id(&tokenizer, m::NO_SPEECH_TOKEN)?;
let no_speech_token = m::NO_SPEECH_TOKENS
.iter()
.find_map(|token| token_id(&tokenizer, token).ok());
let no_speech_token = match no_speech_token {
None => anyhow::bail!("unable to find any non-speech token"),
Some(n) => n,
};
Ok(Self {
model,
rng: rand::rngs::StdRng::seed_from_u64(seed),
Expand Down Expand Up @@ -512,11 +518,7 @@ fn main() -> Result<()> {
)
} else {
let config = repo.get("config.json")?;
let tokenizer = if args.model == WhichModel::LargeV3 {
panic!("openai/whisper-large-v3 does not provide a compatible tokenizer.json config at the moment")
} else {
repo.get("tokenizer.json")?
};
let tokenizer = repo.get("tokenizer.json")?;
let model = repo.get("model.safetensors")?;
(config, tokenizer, model)
};
Expand Down
2 changes: 1 addition & 1 deletion candle-transformers/src/models/whisper/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,4 @@ pub const TRANSCRIBE_TOKEN: &str = "<|transcribe|>";
pub const TRANSLATE_TOKEN: &str = "<|translate|>";
pub const NO_TIMESTAMPS_TOKEN: &str = "<|notimestamps|>";
pub const EOT_TOKEN: &str = "<|endoftext|>";
pub const NO_SPEECH_TOKEN: &str = "<|nocaptions|>";
pub const NO_SPEECH_TOKENS: [&str; 2] = ["<|nocaptions|>", "<|nospeech|>"];
8 changes: 7 additions & 1 deletion candle-wasm-examples/whisper/src/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,13 @@ impl Decoder {
let transcribe_token = token_id(&tokenizer, m::TRANSCRIBE_TOKEN)?;
let translate_token = token_id(&tokenizer, m::TRANSLATE_TOKEN)?;
let eot_token = token_id(&tokenizer, m::EOT_TOKEN)?;
let no_speech_token = token_id(&tokenizer, m::NO_SPEECH_TOKEN)?;
let no_speech_token = m::NO_SPEECH_TOKENS
.iter()
.find_map(|token| token_id(&tokenizer, token).ok());
let no_speech_token = match no_speech_token {
None => anyhow::bail!("unable to find any non-speech token"),
Some(n) => n,
};
let seed = 299792458;
Ok(Self {
model,
Expand Down

0 comments on commit 9ab3f97

Please sign in to comment.