feat: add ngrok proxy to run server #82

Open · wants to merge 5 commits into main

3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
 *venv
 *.DS_Store
 *.idea/
-test*
+test*
+.vscode
18 changes: 17 additions & 1 deletion model_server.py
@@ -130,15 +130,31 @@ async def generate_stream(request: Request):
     return StreamingResponse(generator)
 
 
+def ngrok_proxy(port):
+    """
+    run `ngrok config add-authtoken $NGROK_TOKEN`
+    """
+    from pyngrok import ngrok
+    import nest_asyncio
+
+    ngrok_tunnel = ngrok.connect(port)
+    print('Public URL:', ngrok_tunnel.public_url)
+    nest_asyncio.apply()
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
 
-    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--host", type=str, default="0.0.0.0")
     parser.add_argument("--dtype", type=str, default="bfloat16")
     parser.add_argument("--device", type=str, default="cuda:0")
     parser.add_argument("--port", type=int, default=10000)
     parser.add_argument("--model-path", type=str, default="THUDM/glm-4-voice-9b")
+    parser.add_argument("--ngrok", action='store_true', help="use ngrok proxy")
 
     args = parser.parse_args()
+    if args.ngrok:
+        ngrok_proxy(args.port)
+
     worker = ModelWorker(args.model_path, args.dtype, args.device)
     uvicorn.run(app, host=args.host, port=args.port, log_level="info")
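
Reviewer note: a quick way to sanity-check the new --ngrok flag is to open the same kind of tunnel by hand and probe the server through it. A minimal sketch, assuming pyngrok is installed, NGROK_TOKEN is exported, and model_server.py is already listening on port 10000; the payload keys are placeholders modelled on web_demo.py's request, not a documented schema.

# Minimal sketch (not part of this PR): open a tunnel the way ngrok_proxy()
# does, then probe the FastAPI server through the public URL.
# Assumes NGROK_TOKEN holds a valid authtoken and the server is on port 10000.
import json
import os

import requests
from pyngrok import ngrok

# Same effect as the docstring's `ngrok config add-authtoken $NGROK_TOKEN`.
ngrok.set_auth_token(os.environ["NGROK_TOKEN"])

tunnel = ngrok.connect(10000)  # mirrors ngrok.connect(port) in ngrok_proxy()
print("Public URL:", tunnel.public_url)

# Placeholder payload; "prompt" and "temperature" are the keys visible in
# web_demo.py's call to this endpoint.
resp = requests.post(
    f"{tunnel.public_url}/generate_stream",
    data=json.dumps({"prompt": "<|user|>\nhello", "temperature": 0.2}),
    stream=True,
)
print("status:", resp.status_code)

ngrok.kill()  # tear down all tunnels when done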
3 changes: 3 additions & 0 deletions requirements.txt
@@ -34,3 +34,6 @@ wget==3.2
 WeTextProcessing==1.0.3
 torch==2.3.0
 torchaudio==2.3.0
+pyngrok==7.2.1
+nest_asyncio==1.6.0
+bitsandbytes==0.44.1
108 changes: 81 additions & 27 deletions web_demo.py
@@ -10,30 +10,45 @@
 import torchaudio
 from transformers import WhisperFeatureExtractor, AutoTokenizer
 from speech_tokenizer.modeling_whisper import WhisperVQEncoder
 
+from audio_process import AudioStreamProcessor
+from flow_inference import AudioDecoder
+import torch
+import gradio as gr
+from speech_tokenizer.utils import extract_speech_token
+
 sys.path.insert(0, "./cosyvoice")
 sys.path.insert(0, "./third_party/Matcha-TTS")
 
-from speech_tokenizer.utils import extract_speech_token
+audio_token_pattern = re.compile(r"<\|audio_(\d+)\|>")
 
-import gradio as gr
-import torch
 
-audio_token_pattern = re.compile(r"<\|audio_(\d+)\|>")
+def ngrok_proxy(port):
+    """
+    run `ngrok config add-authtoken $NGROK_TOKEN`
+    """
+    from pyngrok import ngrok
+    import nest_asyncio
+
+    ngrok_tunnel = ngrok.connect(port)
+    print('Public URL:', ngrok_tunnel.public_url)
+    nest_asyncio.apply()
 
-from flow_inference import AudioDecoder
-from audio_process import AudioStreamProcessor
 
 if __name__ == "__main__":
     parser = ArgumentParser()
     parser.add_argument("--host", type=str, default="0.0.0.0")
     parser.add_argument("--port", type=int, default="8888")
     parser.add_argument("--flow-path", type=str, default="./glm-4-voice-decoder")
     parser.add_argument("--model-path", type=str, default="THUDM/glm-4-voice-9b")
-    parser.add_argument("--tokenizer-path", type= str, default="THUDM/glm-4-voice-tokenizer")
+    parser.add_argument("--tokenizer-path", type=str, default="THUDM/glm-4-voice-tokenizer")
+    parser.add_argument("--server-addr", type=str,
+                        default="http://localhost:10000", help="server address")
+    parser.add_argument("--ngrok", action='store_true', help="use ngrok proxy")
     args = parser.parse_args()
 
+    if args.ngrok:
+        ngrok_proxy(args.port)
+
     flow_config = os.path.join(args.flow_path, "config.yaml")
     flow_checkpoint = os.path.join(args.flow_path, 'flow.pt')
     hift_checkpoint = os.path.join(args.flow_path, 'hift.pt')
@@ -42,7 +57,6 @@
     audio_decoder: AudioDecoder = None
     whisper_model, feature_extractor = None, None
 
-
     def initialize_fn():
         global audio_decoder, feature_extractor, whisper_model, glm_model, glm_tokenizer
         if audio_decoder is not None:
@@ -60,11 +74,9 @@ def initialize_fn():
         whisper_model = WhisperVQEncoder.from_pretrained(args.tokenizer_path).eval().to(device)
         feature_extractor = WhisperFeatureExtractor.from_pretrained(args.tokenizer_path)
 
-
     def clear_fn():
         return [], [], '', '', '', None, None
 
-
     def inference_fn(
         temperature: float,
         top_p: float,
@@ -96,7 +108,6 @@ def inference_fn(
             user_input = input_text
             system_prompt = "User will provide you with a text instruction. Do it step by step. First, think about the instruction and respond in a interleaved manner, with 13 text token followed by 26 audio tokens."
 
-
         # Gather history
         inputs = previous_input_tokens + previous_completion_tokens
         inputs = inputs.strip()
@@ -106,7 +117,7 @@ def inference_fn(
 
         with torch.no_grad():
             response = requests.post(
-                "http://localhost:10000/generate_stream",
+                f"{args.server_addr}/generate_stream",
                 data=json.dumps({
                     "prompt": inputs,
                     "temperature": temperature,
@@ -126,7 +137,7 @@ def inference_fn(
             tts_mels = []
             prev_mel = None
             is_finalize = False
-            block_size_list = [25,50,100,150,200]
+            block_size_list = [25, 50, 100, 150, 200]
             block_size_idx = 0
             block_size = block_size_list[block_size_idx]
             audio_processor = AudioStreamProcessor()
@@ -149,13 +160,15 @@ def inference_fn(
                                                   finalize=is_finalize)
                     prev_mel = tts_mel
 
-                    audio_bytes = audio_processor.process(tts_speech.clone().cpu().numpy()[0], last=is_finalize)
+                    audio_bytes = audio_processor.process(
+                        tts_speech.clone().cpu().numpy()[0], last=is_finalize)
 
                     tts_speechs.append(tts_speech.squeeze())
                     tts_mels.append(tts_mel)
                     if audio_bytes:
                         yield history, inputs, '', '', audio_bytes, None
-                    flow_prompt_speech_token = torch.cat((flow_prompt_speech_token, tts_token), dim=-1)
+                    flow_prompt_speech_token = torch.cat(
+                        (flow_prompt_speech_token, tts_token), dim=-1)
                     audio_tokens = []
                 if not is_finalize:
                     complete_tokens.append(token_id)
@@ -168,17 +181,16 @@ def inference_fn(
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
             torchaudio.save(f, tts_speech.unsqueeze(0), 22050, format="wav")
             history.append({"role": "assistant", "content": {"path": f.name, "type": "audio/wav"}})
-        history.append({"role": "assistant", "content": glm_tokenizer.decode(text_tokens, ignore_special_tokens=False)})
+        history.append({"role": "assistant", "content": glm_tokenizer.decode(
+            text_tokens, ignore_special_tokens=False)})
         yield history, inputs, complete_text, '', None, (22050, tts_speech.numpy())
 
-
     def update_input_interface(input_mode):
         if input_mode == "audio":
             return [gr.update(visible=True), gr.update(visible=False)]
         else:
             return [gr.update(visible=False), gr.update(visible=True)]
 
-
     # Create the Gradio interface
     with gr.Blocks(title="GLM-4-Voice Demo", fill_height=True) as demo:
         with gr.Row():
@@ -207,17 +219,25 @@ def update_input_interface(input_mode):
         with gr.Row():
             with gr.Column():
                 input_mode = gr.Radio(["audio", "text"], label="Input Mode", value="audio")
-                audio = gr.Audio(label="Input audio", type='filepath', show_download_button=True, visible=True)
-                text_input = gr.Textbox(label="Input text", placeholder="Enter your text here...", lines=2, visible=False)
+                audio = gr.Audio(
+                    label="Input audio",
+                    type='filepath',
+                    show_download_button=True,
+                    visible=True)
+                text_input = gr.Textbox(
+                    label="Input text",
+                    placeholder="Enter your text here...",
+                    lines=2,
+                    visible=False)
 
             with gr.Column():
                 submit_btn = gr.Button("Submit")
                 reset_btn = gr.Button("Clear")
                 output_audio = gr.Audio(label="Play", streaming=True,
                                         autoplay=True, show_download_button=False)
-                complete_audio = gr.Audio(label="Last Output Audio (If Any)", show_download_button=True)
-
-
+                complete_audio = gr.Audio(
+                    label="Last Output Audio (If Any)",
+                    show_download_button=True)
 
         gr.Markdown("""## Debug Info""")
         with gr.Row():
@@ -251,13 +271,47 @@ def update_input_interface(input_mode):
                 input_tokens,
                 completion_tokens,
             ],
-            outputs=[history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio]
+            outputs=[
+                history_state,
+                input_tokens,
+                completion_tokens,
+                detailed_error,
+                output_audio,
+                complete_audio
+            ]
         )
 
         respond.then(lambda s: s, [history_state], chatbot)
 
-        reset_btn.click(clear_fn, outputs=[chatbot, history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio])
-        input_mode.input(clear_fn, outputs=[chatbot, history_state, input_tokens, completion_tokens, detailed_error, output_audio, complete_audio]).then(update_input_interface, inputs=[input_mode], outputs=[audio, text_input])
+        reset_btn.click(
+            clear_fn,
+            outputs=[
+                chatbot,
+                history_state,
+                input_tokens,
+                completion_tokens,
+                detailed_error,
+                output_audio,
+                complete_audio
+            ]
+        )
+        input_mode.input(
+            clear_fn,
+            outputs=[
+                chatbot,
+                history_state,
+                input_tokens,
+                completion_tokens,
+                detailed_error,
+                output_audio,
+                complete_audio]).then(
+            update_input_interface,
+            inputs=[input_mode],
+            outputs=[
+                audio,
+                text_input
+            ]
+        )
 
     initialize_fn()
     # Launch the interface
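
Reviewer note: with --server-addr exposed, the web demo no longer hard-codes localhost, so it can point at a model server published through ngrok. Below is a minimal standalone client sketch for the same endpoint, using only the payload fields visible in this diff; the line-delimited JSON chunks carrying a "token_id" field are an assumption based on how inference_fn consumes the response.

# Standalone client sketch (not part of this PR) for the endpoint that
# web_demo.py now reaches via args.server_addr.
import json

import requests

SERVER_ADDR = "http://localhost:10000"  # or an ngrok public URL

response = requests.post(
    f"{SERVER_ADDR}/generate_stream",
    data=json.dumps({
        "prompt": "<|user|>\nhello",  # placeholder prompt
        "temperature": 0.2,
        "top_p": 0.8,
    }),
    stream=True,
)

# Each non-empty line is assumed to be a JSON object with the next token id,
# matching the complete_tokens.append(token_id) loop in inference_fn.
for chunk in response.iter_lines():
    if chunk:
        print(json.loads(chunk)["token_id"])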