Skip to content

Commit

Permalink
Merge pull request #18 from xingchensong/Mddct-speechtokenizer-v2
Browse files Browse the repository at this point in the history
[v2-tokenizer] support cosyvoice-tokenizer-v2
  • Loading branch information
xingchensong authored Dec 22, 2024
2 parents c14c2aa + 1f72c8f commit 0bce299
Show file tree
Hide file tree
Showing 6 changed files with 472 additions and 31 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ pip install s3tokenizer
```py
import s3tokenizer

tokenizer = s3tokenizer.load_model("speech_tokenizer_v1").cuda() # or "speech_tokenizer_v1_25hz"
tokenizer = s3tokenizer.load_model("speech_tokenizer_v1").cuda() # or "speech_tokenizer_v1_25hz" or "speech_tokenizer_v2_25hz"

mels = []
wav_paths = ["s3tokenizer/assets/BAC009S0764W0121.wav", "s3tokenizer/assets/BAC009S0764W0122.wav"]
Expand All @@ -48,7 +48,7 @@ s3tokenizer --wav_scp xxx.scp \
--device "cpu" \
--output_dir "./" \
--batch_size 32 \
--model "speech_tokenizer_v1" # or "speech_tokenizer_v1_25hz"
--model "speech_tokenizer_v1" # or "speech_tokenizer_v1_25hz" or "speech_tokenizer_v2_25hz"
```


Expand All @@ -66,7 +66,7 @@ torchrun --nproc_per_node=8 --nnodes=1 \
--device "cuda" \
--output_dir "./" \
--batch_size 32 \
--model "speech_tokenizer_v1" # or "speech_tokenizer_v1_25hz"
--model "speech_tokenizer_v1" # or "speech_tokenizer_v1_25hz" or "speech_tokenizer_v2_25hz"
```


Expand Down
13 changes: 11 additions & 2 deletions s3tokenizer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@

from tqdm import tqdm

from s3tokenizer.model_v2 import S3TokenizerV2

from .model import S3Tokenizer
from .utils import (load_audio, log_mel_spectrogram, make_non_pad_mask,
mask_to_bias, onnx2torch, padding)
Expand All @@ -39,13 +41,18 @@
"speech_tokenizer_v1_25hz":
"https://www.modelscope.cn/models/iic/CosyVoice-300M-25Hz/"
"resolve/master/speech_tokenizer_v1.onnx",
"speech_tokenizer_v2_25hz":
"https://www.modelscope.cn/models/iic/CosyVoice2-0.5B/"
"resolve/master/speech_tokenizer_v2.onnx",
}

_SHA256S = {
"speech_tokenizer_v1":
"23b5a723ed9143aebfd9ffda14ac4c21231f31c35ef837b6a13bb9e5488abb1e",
"speech_tokenizer_v1_25hz":
"56285ddd4a83e883ee0cb9f8d69c1089b53a94b1f78ff7e4a0224a27eb4cb486",
"speech_tokenizer_v2_25hz":
"d43342aa12163a80bf07bffb94c9de2e120a8df2f9917cd2f642e7f4219c6f71",
}


Expand Down Expand Up @@ -137,8 +144,10 @@ def load_model(
else:
raise RuntimeError(
f"Model {name} not found; available models = {available_models()}")

model = S3Tokenizer(name)
if 'v2' in name:
model = S3TokenizerV2(name)
else:
model = S3Tokenizer(name)
model.init_from_onnx(checkpoint_file)

return model
14 changes: 8 additions & 6 deletions s3tokenizer/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,14 @@ def init_distributed():

def get_args():
parser = argparse.ArgumentParser(description='extract speech code')
parser.add_argument(
'--model',
required=True,
type=str,
choices=["speech_tokenizer_v1", "speech_tokenizer_v1_25hz"],
help='model version')
parser.add_argument('--model',
required=True,
type=str,
choices=[
"speech_tokenizer_v1", "speech_tokenizer_v1_25hz",
"speech_tokenizer_v2_25hz"
],
help='model version')
parser.add_argument('--wav_scp',
required=True,
type=str,
Expand Down
Loading

0 comments on commit 0bce299

Please sign in to comment.