diff --git a/.github/actions/build-plugin/action.yaml b/.github/actions/build-plugin/action.yaml index 03e0e20..ac5950d 100644 --- a/.github/actions/build-plugin/action.yaml +++ b/.github/actions/build-plugin/action.yaml @@ -86,6 +86,8 @@ runs: } .github/scripts/Build-Windows.ps1 @BuildArgs + env: + CPU_OR_CUDA: ${{ inputs.cublas }} - name: Create Summary 📊 if: contains(fromJSON('["Linux", "macOS"]'),runner.os) diff --git a/.github/scripts/Package-Windows.ps1 b/.github/scripts/Package-Windows.ps1 index a09f54a..3d1a07c 100644 --- a/.github/scripts/Package-Windows.ps1 +++ b/.github/scripts/Package-Windows.ps1 @@ -49,7 +49,12 @@ function Package { $BuildSpec = Get-Content -Path ${BuildSpecFile} -Raw | ConvertFrom-Json $ProductName = $BuildSpec.name $ProductVersion = $BuildSpec.version - $CudaName = "cuda${Cublas}" + # Check if $cublas is cpu or cuda + if ( $Cublas -eq 'cpu' ) { + $CudaName = 'cpu' + } else { + $CudaName = "cuda${Cublas}" + } $OutputName = "${ProductName}-${ProductVersion}-windows-${Target}-${CudaName}" diff --git a/CMakeLists.txt b/CMakeLists.txt index ac7339f..fdd7cd1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,10 @@ endif() include(cmake/BuildWhispercpp.cmake) target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE Whispercpp) +include(cmake/BuildCTranslate2.cmake) +include(cmake/BuildSentencepiece.cmake) +target_link_libraries(${CMAKE_PROJECT_NAME} PRIVATE ct2 sentencepiece) + target_sources( ${CMAKE_PROJECT_NAME} PRIVATE src/plugin-main.c @@ -65,6 +69,8 @@ target_sources( src/whisper-utils/whisper-processing.cpp src/model-utils/model-downloader.cpp src/model-utils/model-downloader-ui.cpp - src/whisper-utils/whisper-utils.cpp) + src/model-utils/model-infos.cpp + src/whisper-utils/whisper-utils.cpp + src/translation/translation.cpp) set_target_properties_plugin(${CMAKE_PROJECT_NAME} PROPERTIES OUTPUT_NAME ${_name}) diff --git a/CMakePresets.json b/CMakePresets.json index bc80925..053671c 100644 --- a/CMakePresets.json +++ 
b/CMakePresets.json @@ -6,10 +6,19 @@ "patch": 0 }, "configurePresets": [ + { + "name": "template", + "hidden": true, + "cacheVariables": { + "ENABLE_FRONTEND_API": true, + "ENABLE_QT": true + } + }, { "name": "macos", "displayName": "macOS Universal", "description": "Build for macOS 11.0+ (Universal binary)", + "inherits": ["template"], "binaryDir": "${sourceDir}/build_macos", "condition": { "type": "equals", @@ -17,14 +26,12 @@ "rhs": "Darwin" }, "generator": "Xcode", - "warnings": {"dev": true, "deprecated": true}, + "warnings": { "dev": true, "deprecated": true }, "cacheVariables": { "QT_VERSION": "6", "CMAKE_OSX_DEPLOYMENT_TARGET": "11.0", "CODESIGN_IDENTITY": "$penv{CODESIGN_IDENT}", - "CODESIGN_TEAM": "$penv{CODESIGN_TEAM}", - "ENABLE_FRONTEND_API": true, - "ENABLE_QT": true + "CODESIGN_TEAM": "$penv{CODESIGN_TEAM}" } }, { @@ -41,6 +48,7 @@ "name": "windows-x64", "displayName": "Windows x64", "description": "Build for Windows x64", + "inherits": ["template"], "binaryDir": "${sourceDir}/build_x64", "condition": { "type": "equals", @@ -49,12 +57,10 @@ }, "generator": "Visual Studio 17 2022", "architecture": "x64", - "warnings": {"dev": true, "deprecated": true}, + "warnings": { "dev": true, "deprecated": true }, "cacheVariables": { "QT_VERSION": "6", - "CMAKE_SYSTEM_VERSION": "10.0.18363.657", - "ENABLE_FRONTEND_API": true, - "ENABLE_QT": true + "CMAKE_SYSTEM_VERSION": "10.0.18363.657" } }, { @@ -70,6 +76,7 @@ "name": "linux-x86_64", "displayName": "Linux x86_64", "description": "Build for Linux x86_64", + "inherits": ["template"], "binaryDir": "${sourceDir}/build_x86_64", "condition": { "type": "equals", @@ -77,12 +84,10 @@ "rhs": "Linux" }, "generator": "Ninja", - "warnings": {"dev": true, "deprecated": true}, + "warnings": { "dev": true, "deprecated": true }, "cacheVariables": { "QT_VERSION": "6", - "CMAKE_BUILD_TYPE": "RelWithDebInfo", - "ENABLE_FRONTEND_API": true, - "ENABLE_QT": true + "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, { @@ -99,6 +104,7 @@ 
"name": "linux-aarch64", "displayName": "Linux aarch64", "description": "Build for Linux aarch64", + "inherits": ["template"], "binaryDir": "${sourceDir}/build_aarch64", "condition": { "type": "equals", @@ -106,12 +112,10 @@ "rhs": "Linux" }, "generator": "Ninja", - "warnings": {"dev": true, "deprecated": true}, + "warnings": { "dev": true, "deprecated": true }, "cacheVariables": { "QT_VERSION": "6", - "CMAKE_BUILD_TYPE": "RelWithDebInfo", - "ENABLE_FRONTEND_API": true, - "ENABLE_QT": true + "CMAKE_BUILD_TYPE": "RelWithDebInfo" } }, { diff --git a/buildspec.json b/buildspec.json index acb2be7..1c349f5 100644 --- a/buildspec.json +++ b/buildspec.json @@ -1,33 +1,33 @@ { "dependencies": { "obs-studio": { - "version": "29.1.2", + "version": "30.0.2", "baseUrl": "https://github.com/obsproject/obs-studio/archive/refs/tags", "label": "OBS sources", "hashes": { - "macos": "215f1fa5772c5dd9f3d6e35b0cb573912b00320149666a77864f9d305525504b", - "windows-x64": "46d451f3f42b9d2c59339ec268165849c7b7904cdf1cc2a8d44c015815a9e37d" + "macos": "be12c3ad0a85713750d8325e4b1db75086223402d7080d0e3c2833d7c5e83c27", + "windows-x64": "970058c49322cfa9cd6d620abb393fed89743ba7e74bd9dbb6ebe0ea8141d9c7" } }, "prebuilt": { - "version": "2023-04-12", + "version": "2023-11-03", "baseUrl": "https://github.com/obsproject/obs-deps/releases/download", "label": "Pre-Built obs-deps", "hashes": { - "macos": "9535c6e1ad96f7d49960251e85a245774088d48da1d602bb82f734b10219125a", - "windows-x64": "c13a14a1acc4224b21304d97b63da4121de1ed6981297e50496fbc474abc0503" + "macos": "90c2fc069847ec2768dcc867c1c63b112c615ed845a907dc44acab7a97181974", + "windows-x64": "d0825a6fb65822c993a3059edfba70d72d2e632ef74893588cf12b1f0d329ce6" } }, "qt6": { - "version": "2023-04-12", + "version": "2023-11-03", "baseUrl": "https://github.com/obsproject/obs-deps/releases/download", "label": "Pre-Built Qt6", "hashes": { - "macos": "eb7614544ab4f3d2c6052c797635602280ca5b028a6b987523d8484222ce45d1", - "windows-x64": 
"4d39364b8a8dee5aa24fcebd8440d5c22bb4551c6b440ffeacce7d61f2ed1add" + "macos": "ba4a7152848da0053f63427a2a2cb0a199af3992997c0db08564df6f48c9db98", + "windows-x64": "bc57dedf76b47119a6dce0435a2f21b35b08c8f2948b1cb34a157320f77732d1" }, "debugSymbols": { - "windows-x64": "f34ee5067be19ed370268b15c53684b7b8aaa867dc800b68931df905d679e31f" + "windows-x64": "fd8ecd1d8cd2ef049d9f4d7fb5c134f784836d6020758094855dfa98bd025036" } } }, @@ -45,7 +45,7 @@ } }, "name": "obs-localvocal", - "version": "0.2.0", + "version": "0.2.1", "author": "Roy Shilkrot", "website": "https://github.com/occ-ai/obs-localvocal", "email": "roy.shil@gmail.com", diff --git a/cmake/BuildCTranslate2.cmake b/cmake/BuildCTranslate2.cmake new file mode 100644 index 0000000..df48fdc --- /dev/null +++ b/cmake/BuildCTranslate2.cmake @@ -0,0 +1,104 @@ +# build the CTranslate2 library from source https://github.com/OpenNMT/CTranslate2.git + +include(ExternalProject) +include(FetchContent) + +if(APPLE) + + FetchContent_Declare( + ctranslate2_fetch + URL https://github.com/occ-ai/obs-ai-ctranslate2-dep/releases/download/1.1.1/libctranslate2-macos-Release-1.1.1.tar.gz + URL_HASH SHA256=da04d88ecc1ea105f8ee672e4eab33af96e50c999c5cc8170e105e110392182b) + FetchContent_MakeAvailable(ctranslate2_fetch) + + add_library(ct2 INTERFACE) + target_link_libraries(ct2 INTERFACE "-framework Accelerate" ${ctranslate2_fetch_SOURCE_DIR}/lib/libctranslate2.a + ${ctranslate2_fetch_SOURCE_DIR}/lib/libcpu_features.a) + set_target_properties(ct2 PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${ctranslate2_fetch_SOURCE_DIR}/include) + target_compile_options(ct2 INTERFACE -Wno-shorten-64-to-32) + +elseif(WIN32) + + # check CPU_OR_CUDA environment variable + if(NOT DEFINED ENV{CPU_OR_CUDA}) + message(FATAL_ERROR "Please set the CPU_OR_CUDA environment variable to either CPU or CUDA") + endif() + + if($ENV{CPU_OR_CUDA} STREQUAL "cpu") + FetchContent_Declare( + ctranslate2_fetch + URL 
https://github.com/occ-ai/obs-ai-ctranslate2-dep/releases/download/1.2.0/libctranslate2-windows-4.1.1-Release-cpu.zip + URL_HASH SHA256=30ff8b2499b8d3b5a6c4d6f7f8ddbc89e745ff06e0050b645e3b7c9b369451a3) + else() + # add compile definitions for CUDA + add_compile_definitions(POLYGLOT_WITH_CUDA) + add_compile_definitions(POLYGLOT_CUDA_VERSION=$ENV{CPU_OR_CUDA}) + + if($ENV{CPU_OR_CUDA} STREQUAL "12.2.0") + FetchContent_Declare( + ctranslate2_fetch + URL https://github.com/occ-ai/obs-ai-ctranslate2-dep/releases/download/1.2.0/libctranslate2-windows-4.1.1-Release-cuda12.2.0.zip + URL_HASH SHA256=131724d510f9f2829970953a1bc9e4e8fb7b4cbc8218e32270dcfe6172a51558) + elseif($ENV{CPU_OR_CUDA} STREQUAL "11.8.0") + FetchContent_Declare( + ctranslate2_fetch + URL https://github.com/occ-ai/obs-ai-ctranslate2-dep/releases/download/1.2.0/libctranslate2-windows-4.1.1-Release-cuda11.8.0.zip + URL_HASH SHA256=a120bee82f821df35a4646add30ac18b5c23e4e16b56fa7ba338eeae336e0d81) + else() + message(FATAL_ERROR "Unsupported CUDA version: $ENV{CPU_OR_CUDA}") + endif() + endif() + + FetchContent_MakeAvailable(ctranslate2_fetch) + + add_library(ct2 INTERFACE) + target_link_libraries(ct2 INTERFACE ${ctranslate2_fetch_SOURCE_DIR}/lib/ctranslate2.lib) + set_target_properties(ct2 PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${ctranslate2_fetch_SOURCE_DIR}/include) + target_compile_options(ct2 INTERFACE /wd4267 /wd4244 /wd4305 /wd4996 /wd4099) + + file(GLOB CT2_DLLS ${ctranslate2_fetch_SOURCE_DIR}/bin/*.dll) + install(FILES ${CT2_DLLS} DESTINATION "obs-plugins/64bit") +else() + set(CT2_VERSION "4.1.1") + set(CT2_URL "https://github.com/OpenNMT/CTranslate2.git") + set(CT2_OPENBLAS_CMAKE_ARGS -DWITH_OPENBLAS=OFF) + + set(CT2_CMAKE_PLATFORM_OPTIONS -DBUILD_SHARED_LIBS=OFF -DOPENMP_RUNTIME=NONE -DCMAKE_POSITION_INDEPENDENT_CODE=ON) + set(CT2_LIB_INSTALL_LOCATION lib/${CMAKE_SHARED_LIBRARY_PREFIX}ctranslate2${CMAKE_STATIC_LIBRARY_SUFFIX}) + + ExternalProject_Add( + ct2_build + GIT_REPOSITORY ${CT2_URL} + 
GIT_TAG v${CT2_VERSION} + GIT_PROGRESS 1 + BUILD_COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --config ${CMAKE_BUILD_TYPE} + CMAKE_GENERATOR ${CMAKE_GENERATOR} + INSTALL_COMMAND ${CMAKE_COMMAND} --install <BINARY_DIR> --config ${CMAKE_BUILD_TYPE} + BUILD_BYPRODUCTS <INSTALL_DIR>/${CT2_LIB_INSTALL_LOCATION} + CMAKE_ARGS -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM} + -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DWITH_CUDA=OFF + -DWITH_MKL=OFF + -DWITH_TESTS=OFF + -DWITH_EXAMPLES=OFF + -DWITH_TFLITE=OFF + -DWITH_TRT=OFF + -DWITH_PYTHON=OFF + -DWITH_SERVER=OFF + -DWITH_COVERAGE=OFF + -DWITH_PROFILING=OFF + -DBUILD_CLI=OFF + ${CT2_OPENBLAS_CMAKE_ARGS} + ${CT2_CMAKE_PLATFORM_OPTIONS}) + ExternalProject_Get_Property(ct2_build INSTALL_DIR) + + add_library(ct2::ct2 STATIC IMPORTED GLOBAL) + add_dependencies(ct2::ct2 ct2_build) + set_target_properties(ct2::ct2 PROPERTIES IMPORTED_LOCATION ${INSTALL_DIR}/${CT2_LIB_INSTALL_LOCATION}) + + add_library(ct2 INTERFACE) + target_link_libraries(ct2 INTERFACE ct2::ct2) + set_target_properties(ct2::ct2 PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${INSTALL_DIR}/include) + +endif() diff --git a/cmake/BuildSentencepiece.cmake b/cmake/BuildSentencepiece.cmake new file mode 100644 index 0000000..024283e --- /dev/null +++ b/cmake/BuildSentencepiece.cmake @@ -0,0 +1,61 @@ +# build sentencepiece from "https://github.com/google/sentencepiece.git" + +if(APPLE) + + include(FetchContent) + + FetchContent_Declare( + sentencepiece_fetch + URL https://github.com/occ-ai/obs-ai-ctranslate2-dep/releases/download/1.1.1/libsentencepiece-macos-Release-1.1.1.tar.gz + URL_HASH SHA256=c911f1e84ea94925a8bc3fd3257185b2e18395075509c8659cc7003a979e0b32) + FetchContent_MakeAvailable(sentencepiece_fetch) + add_library(sentencepiece INTERFACE) + target_link_libraries(sentencepiece INTERFACE ${sentencepiece_fetch_SOURCE_DIR}/lib/libsentencepiece.a) + set_target_properties(sentencepiece PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ${sentencepiece_fetch_SOURCE_DIR}/include) 
+elseif(WIN32) + + FetchContent_Declare( + sentencepiece_fetch + URL https://github.com/occ-ai/obs-ai-ctranslate2-dep/releases/download/1.1.1/sentencepiece-windows-0.2.0-Release.zip + URL_HASH SHA256=846699c7fa1e8918b71ed7f2bd5cd60e47e51105e1d84e3192919b4f0f10fdeb) + FetchContent_MakeAvailable(sentencepiece_fetch) + add_library(sentencepiece INTERFACE) + target_link_libraries(sentencepiece INTERFACE ${sentencepiece_fetch_SOURCE_DIR}/lib/sentencepiece.lib) + set_target_properties(sentencepiece PROPERTIES INTERFACE_INCLUDE_DIRECTORIES + ${sentencepiece_fetch_SOURCE_DIR}/include) + +else() + + set(SP_URL + "https://github.com/google/sentencepiece.git" + CACHE STRING "URL of sentencepiece repository") + + set(SP_CMAKE_OPTIONS -DSPM_ENABLE_SHARED=OFF) + set(SENTENCEPIECE_INSTALL_LIB_LOCATION lib/${CMAKE_STATIC_LIBRARY_PREFIX}sentencepiece${CMAKE_STATIC_LIBRARY_SUFFIX}) + + include(ExternalProject) + + ExternalProject_Add( + sentencepiece_build + GIT_REPOSITORY ${SP_URL} + GIT_TAG v0.1.99 + BUILD_COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --config ${CMAKE_BUILD_TYPE} + CMAKE_GENERATOR ${CMAKE_GENERATOR} + INSTALL_COMMAND ${CMAKE_COMMAND} --install <BINARY_DIR> --config ${CMAKE_BUILD_TYPE} + BUILD_BYPRODUCTS <INSTALL_DIR>/${SENTENCEPIECE_INSTALL_LIB_LOCATION} + CMAKE_ARGS -DCMAKE_GENERATOR_PLATFORM=${CMAKE_GENERATOR_PLATFORM} -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} ${SP_CMAKE_OPTIONS}) + ExternalProject_Get_Property(sentencepiece_build INSTALL_DIR) + + add_library(libsentencepiece STATIC IMPORTED GLOBAL) + add_dependencies(libsentencepiece sentencepiece_build) + set_target_properties(libsentencepiece PROPERTIES IMPORTED_LOCATION + ${INSTALL_DIR}/${SENTENCEPIECE_INSTALL_LIB_LOCATION}) + + add_library(sentencepiece INTERFACE) + add_dependencies(sentencepiece libsentencepiece) + target_link_libraries(sentencepiece INTERFACE libsentencepiece) + target_include_directories(sentencepiece INTERFACE ${INSTALL_DIR}/include) + +endif() diff --git a/cmake/macos/compilerconfig.cmake 
b/cmake/macos/compilerconfig.cmake index c40a532..524aab5 100644 --- a/cmake/macos/compilerconfig.cmake +++ b/cmake/macos/compilerconfig.cmake @@ -55,3 +55,4 @@ else() endif() add_compile_definitions($<$:DEBUG> $<$:_DEBUG> SIMDE_ENABLE_OPENMP) +add_compile_options(-Wno-error=newline-eof) diff --git a/data/locale/ar-SA.ini b/data/locale/ar-SA.ini new file mode 100644 index 0000000..be12610 --- /dev/null +++ b/data/locale/ar-SA.ini @@ -0,0 +1,50 @@ +LocalVocalPlugin="إضافة LocalVocal" +transcription_filterAudioFilter="تصفية نسخ LocalVocal" +vad_enabled="تمكين VAD" +log_level="مستوى السجل الداخلي" +log_words="تسجيل الخروج إلى الوحدة الطرفية" +caption_to_stream="تدفق الترجمات" +step_by_step_processing="المعالجة خطوة بخطوة (⚠️ زيادة المعالجة)" +step_size_msec="حجم الخطوة (ملي ثانية)" +subtitle_sources="مصادر الترجمات" +none_no_output="بدون / بلا مخرجات" +text_file_output="مخرجات ملف نصي" +output_filename="اسم ملف الخروج" +whisper_model="نموذج Whisper" +external_model_file="ملف النموذج الخارجي" +whisper_parameters="الإعدادات المتقدمة" +language="اللغة" +whisper_sampling_method="طريقة عينة Whisper" +n_threads="عدد الخيوط" +n_max_text_ctx="الحد الأقصى لسياق النص" +translate="ترجمة" +no_context="بدون سياق" +single_segment="جزء واحد" +print_special="طباعة خاصة" +print_progress="طباعة التقدم" +print_realtime="طباعة الوقت الفعلي" +print_timestamps="طباعة الطوابع الزمنية" +token_timestamps="طوابع زمنية للرمز" +thold_pt="عتبة احتمال الرمز" +thold_ptsum="عتبة مجموع احتمال الرمز" +max_len="الحد الأقصى للطول بالأحرف" +split_on_word="التقسيم على الكلمة" +max_tokens="الحد الأقصى للرموز" +speed_up="تسريع" +initial_prompt="المطالبة الأولية" +suppress_blank="كبت الفراغ" +suppress_non_speech_tokens="كبت رموز غير الكلام" +temperature="درجة الحرارة" +max_initial_ts="الحد الأقصى للطوابع الزمنية الأولية" +length_penalty="عقوبة الطول" +save_srt="حفظ بصيغة SRT" +truncate_output_file="تقليص الملف عند جملة جديدة" +only_while_recording="كتابة الخروج فقط أثناء التسجيل" +process_while_muted="معالجة 
الكلام أثناء كتم المصدر" +rename_file_to_match_recording="إعادة تسمية الملف ليتطابق مع التسجيل" +min_sub_duration="الحد الأدنى لمدة العنوان الفرعي (ملي ثانية)" +advanced_settings="الإعدادات المتقدمة" +target_language="اللغة الهدف" +source_language="لغة المصدر" +translate="ترجمة" +translate_add_context="الترجمة مع السياق" diff --git a/data/locale/de-DE.ini b/data/locale/de-DE.ini new file mode 100644 index 0000000..57bb71a --- /dev/null +++ b/data/locale/de-DE.ini @@ -0,0 +1,50 @@ +LocalVocalPlugin="LocalVocal Plugin" +transcription_filterAudioFilter="LocalVocal Transkription" +vad_enabled="VAD Aktiviert" +log_level="Interne Protokollebene" +log_words="Protokollausgabe zur Konsole" +caption_to_stream="Stream-Untertitel" +step_by_step_processing="Schritt-für-Schritt-Verarbeitung (⚠️ erhöhte Verarbeitung)" +step_size_msec="Schrittgröße (ms)" +subtitle_sources="Untertitel Ausgabe" +none_no_output="Keine / Keine Ausgabe" +text_file_output="Textdatei Ausgabe" +output_filename="Ausgabedateiname" +whisper_model="Flüstermodell" +external_model_file="Externe Modelldatei" +whisper_parameters="Erweiterte Einstellungen" +language="Sprache" +whisper_sampling_method="Flüster Sampling Methode" +n_threads="Anzahl der Threads" +n_max_text_ctx="Max Textkontext" +translate="Übersetzen" +no_context="Kein Kontext" +single_segment="Einzelnes Segment" +print_special="Sonderdruck" +print_progress="Fortschritt drucken" +print_realtime="Echtzeit drucken" +print_timestamps="Zeitstempel drucken" +token_timestamps="Token Zeitstempel" +thold_pt="Token-Wahrscheinlichkeitsschwelle" +thold_ptsum="Token Summenwahrscheinlichkeitsschwelle" +max_len="Maximale Länge in Zeichen" +split_on_word="Auf Wort teilen" +max_tokens="Max Tokens" +speed_up="Beschleunigen" +initial_prompt="Erste Aufforderung" +suppress_blank="Leerzeichen unterdrücken" +suppress_non_speech_tokens="Nicht-Sprach-Token unterdrücken" +temperature="Temperatur" +max_initial_ts="Max Anfangszeitstempel" +length_penalty="Längenstrafe" 
+save_srt="Im SRT-Format speichern" +truncate_output_file="Datei bei neuem Satz kürzen" +only_while_recording="Ausgabe nur während der Aufnahme schreiben" +process_while_muted="Sprache verarbeiten, während die Quelle stummgeschaltet ist" +rename_file_to_match_recording="Datei umbenennen, um Aufnahme zu entsprechen" +min_sub_duration="Min. Untertiteldauer (ms)" +advanced_settings="Erweiterte Einstellungen" +target_language="Zielsprache" +source_language="Quellsprache" +translate="Übersetzen" +translate_add_context="Mit Kontext übersetzen" diff --git a/data/locale/en-US.ini b/data/locale/en-US.ini index 31a2293..fab4510 100644 --- a/data/locale/en-US.ini +++ b/data/locale/en-US.ini @@ -44,3 +44,7 @@ process_while_muted="Process speech while source is muted" rename_file_to_match_recording="Rename file to match recording" min_sub_duration="Min. sub duration (ms)" advanced_settings="Advanced Settings" +target_language="Target language" +source_language="Source language" +translate="Translate" +translate_add_context="Translate with context" diff --git a/data/locale/es-ES.ini b/data/locale/es-ES.ini new file mode 100644 index 0000000..a9f0580 --- /dev/null +++ b/data/locale/es-ES.ini @@ -0,0 +1,50 @@ +LocalVocalPlugin="Plugin LocalVocal" +transcription_filterAudioFilter="Transcripción LocalVocal" +vad_enabled="VAD Habilitado" +log_level="Nivel de Registro Interno" +log_words="Registro de Salida a la Consola" +caption_to_stream="Subtítulos en Stream" +step_by_step_processing="Procesamiento paso a paso (⚠️ procesamiento aumentado)" +step_size_msec="Tamaño de paso (ms)" +subtitle_sources="Salida de Subtítulos" +none_no_output="Ninguno / Sin salida" +text_file_output="Salida de archivo de texto" +output_filename="Nombre del archivo de salida" +whisper_model="Modelo Whisper" +external_model_file="Archivo de modelo externo" +whisper_parameters="Configuraciones Avanzadas" +language="Idioma" +whisper_sampling_method="Método de Muestreo Whisper" +n_threads="Número de hilos" 
+n_max_text_ctx="Contexto de texto máximo" +translate="Traducir" +no_context="Sin contexto" +single_segment="Segmento único" +print_special="Imprimir especial" +print_progress="Imprimir progreso" +print_realtime="Imprimir en tiempo real" +print_timestamps="Imprimir marcas de tiempo" +token_timestamps="Marcas de tiempo de token" +thold_pt="Umbral de prob. de token" +thold_ptsum="Umbral de suma de prob. de token" +max_len="Longitud máxima en caracteres" +split_on_word="Dividir en palabra" +max_tokens="Tokens máximos" +speed_up="Acelerar" +initial_prompt="Indicación inicial" +suppress_blank="Suprimir en blanco" +suppress_non_speech_tokens="Suprimir tokens no verbales" +temperature="Temperatura" +max_initial_ts="Marcas de tiempo iniciales máximas" +length_penalty="Penalización de longitud" +save_srt="Guardar en formato SRT" +truncate_output_file="Truncar archivo en nueva oración" +only_while_recording="Escribir salida solo mientras se graba" +process_while_muted="Procesar el habla mientras la fuente está silenciada" +rename_file_to_match_recording="Renombrar archivo para que coincida con la grabación" +min_sub_duration="Duración mínima de sub (ms)" +advanced_settings="Configuraciones Avanzadas" +target_language="Idioma objetivo" +source_language="Idioma fuente" +translate="Traducir" +translate_add_context="Traducir con contexto" diff --git a/data/locale/fr-FR.ini b/data/locale/fr-FR.ini new file mode 100644 index 0000000..13a00d9 --- /dev/null +++ b/data/locale/fr-FR.ini @@ -0,0 +1,50 @@ +LocalVocalPlugin="Plugin LocalVocal" +transcription_filterAudioFilter="Transcription LocalVocal" +vad_enabled="VAD Activé" +log_level="Niveau de journalisation interne" +log_words="Journalisation de la sortie vers la console" +caption_to_stream="Sous-titres en streaming" +step_by_step_processing="Traitement étape par étape (⚠️ traitement accru)" +step_size_msec="Taille de l'étape (ms)" +subtitle_sources="Sortie des sous-titres" +none_no_output="Aucun / Pas de sortie" 
+text_file_output="Sortie de fichier texte" +output_filename="Nom du fichier de sortie" +whisper_model="Modèle Whisper" +external_model_file="Fichier de modèle externe" +whisper_parameters="Paramètres avancés" +language="Langue" +whisper_sampling_method="Méthode d'échantillonnage Whisper" +n_threads="Nombre de fils" +n_max_text_ctx="Contexte de texte max" +translate="Traduire" +no_context="Pas de contexte" +single_segment="Segment unique" +print_special="Imprimer spécial" +print_progress="Imprimer la progression" +print_realtime="Imprimer en temps réel" +print_timestamps="Imprimer les horodatages" +token_timestamps="Horodatages des jetons" +thold_pt="Seuil de prob. de jeton" +thold_ptsum="Seuil de somme de prob. de jeton" +max_len="Longueur max en caractères" +split_on_word="Diviser sur le mot" +max_tokens="Max jetons" +speed_up="Accélérer" +initial_prompt="Invite initiale" +suppress_blank="Supprimer le blanc" +suppress_non_speech_tokens="Supprimer les jetons non-parlés" +temperature="Température" +max_initial_ts="Max horodatages initiaux" +length_penalty="Pénalité de longueur" +save_srt="Enregistrer au format SRT" +truncate_output_file="Tronquer le fichier sur nouvelle phrase" +only_while_recording="Écrire la sortie uniquement pendant l'enregistrement" +process_while_muted="Traiter la parole pendant que la source est en sourdine" +rename_file_to_match_recording="Renommer le fichier pour correspondre à l'enregistrement" +min_sub_duration="Durée min. 
du sous-titre (ms)" +advanced_settings="Paramètres avancés" +target_language="Langue cible" +source_language="Langue source" +translate="Traduire" +translate_add_context="Traduire avec contexte" diff --git a/data/locale/hi-IN.ini b/data/locale/hi-IN.ini new file mode 100644 index 0000000..03d8c27 --- /dev/null +++ b/data/locale/hi-IN.ini @@ -0,0 +1,50 @@ +LocalVocalPlugin="लोकलवोकल प्लगइन" +transcription_filterAudioFilter="लोकलवोकल ट्रांसक्रिप्शन" +vad_enabled="VAD सक्षम" +log_level="आंतरिक लॉग स्तर" +log_words="कंसोल पर लॉग आउटपुट" +caption_to_stream="स्ट्रीम कैप्शन" +step_by_step_processing="चरण-दर-चरण प्रसंस्करण (⚠️ बढ़ी प्रसंस्करण)" +step_size_msec="चरण का आकार (ms)" +subtitle_sources="उपशीर्षक आउटपुट" +none_no_output="कोई नहीं / कोई आउटपुट नहीं" +text_file_output="टेक्स्ट फ़ाइल आउटपुट" +output_filename="आउटपुट फ़ाइलनाम" +whisper_model="व्हिस्पर मॉडल" +external_model_file="बाहरी मॉडल फ़ाइल" +whisper_parameters="उन्नत सेटिंग्स" +language="भाषा" +whisper_sampling_method="व्हिस्पर सैंपलिंग विधि" +n_threads="धागों की संख्या" +n_max_text_ctx="अधिकतम पाठ संदर्भ" +translate="अनुवाद करें" +no_context="कोई संदर्भ नहीं" +single_segment="एकल सेगमेंट" +print_special="विशेष मुद्रित करें" +print_progress="प्रगति मुद्रित करें" +print_realtime="रियलटाइम मुद्रित करें" +print_timestamps="टाइमस्टैंप मुद्रित करें" +token_timestamps="टोकन टाइमस्टैंप" +thold_pt="टोकन प्रॉब. थ्रेशोल्ड" +thold_ptsum="टोकन सम प्रॉब. 
थ्रेशोल्ड" +max_len="अधिकतम लंबाई इन अक्षरों में" +split_on_word="शब्द पर विभाजित करें" +max_tokens="अधिकतम टोकन" +speed_up="स्पीड अप" +initial_prompt="प्रारंभिक प्रॉम्प्ट" +suppress_blank="रिक्त संयंत्रित करें" +suppress_non_speech_tokens="गैर-भाषण टोकनों को दबाएं" +temperature="तापमान" +max_initial_ts="अधिकतम प्रारंभिक टाइमस्टैंप" +length_penalty="लंबाई दंड" +save_srt="SRT प्रारूप में सहेजें" +truncate_output_file="नई वाक्यांश पर फ़ाइल को छोटा करें" +only_while_recording="केवल रिकॉर्डिंग के दौरान आउटपुट लिखें" +process_while_muted="स्रोत म्यूट होने पर भी भाषण को प्रसंस्करण करें" +rename_file_to_match_recording="रिकॉर्डिंग से मेल खाने के लिए फ़ाइल का नाम बदलें" +min_sub_duration="न्यूनतम उपशीर्षक अवधि (ms)" +advanced_settings="उन्नत सेटिंग्स" +target_language="लक्ष्य भाषा" +source_language="स्रोत भाषा" +translate="अनुवाद करें" +translate_add_context="संदर्भ के साथ अनुवाद करें" diff --git a/data/locale/ja-JP.ini b/data/locale/ja-JP.ini new file mode 100644 index 0000000..d7fc6d3 --- /dev/null +++ b/data/locale/ja-JP.ini @@ -0,0 +1,50 @@ +LocalVocalPlugin="ローカルボーカルプラグイン" +transcription_filterAudioFilter="ローカルボーカルトランスクリプション" +vad_enabled="VAD有効" +log_level="内部ログレベル" +log_words="コンソールへのログ出力" +caption_to_stream="ストリームキャプション" +step_by_step_processing="ステップバイステップ処理(⚠️処理増加)" +step_size_msec="ステップサイズ(ms)" +subtitle_sources="字幕出力" +none_no_output="なし/出力なし" +text_file_output="テキストファイル出力" +output_filename="出力ファイル名" +whisper_model="ウィスパーモデル" +external_model_file="外部モデルファイル" +whisper_parameters="詳細設定" +language="言語" +whisper_sampling_method="ウィスパーサンプリング方法" +n_threads="スレッド数" +n_max_text_ctx="最大テキストコンテキスト" +translate="翻訳" +no_context="コンテキストなし" +single_segment="単一セグメント" +print_special="特別な印刷" +print_progress="進行状況を印刷" +print_realtime="リアルタイムで印刷" +print_timestamps="タイムスタンプを印刷" +token_timestamps="トークンタイムスタンプ" +thold_pt="トークン確率閾値" +thold_ptsum="トークン合計確率閾値" +max_len="最大長(文字)" +split_on_word="単語で分割" +max_tokens="最大トークン数" +speed_up="スピードアップ" +initial_prompt="初期プロンプト" 
+suppress_blank="空白を抑制" +suppress_non_speech_tokens="非音声トークンを抑制" +temperature="温度" +max_initial_ts="最大初期タイムスタンプ" +length_penalty="長さのペナルティ" +save_srt="SRT形式で保存" +truncate_output_file="新しい文でファイルを切り捨てる" +only_while_recording="録音中のみ出力を書き込む" +process_while_muted="ソースがミュート中も音声を処理する" +rename_file_to_match_recording="ファイル名を録音に合わせて変更" +min_sub_duration="最小サブ持続時間(ms)" +advanced_settings="詳細設定" +target_language="目標言語" +source_language="ソース言語" +translate="翻訳" +translate_add_context="コンテキスト付きで翻訳" diff --git a/data/locale/ko-KR.ini b/data/locale/ko-KR.ini new file mode 100644 index 0000000..12d714c --- /dev/null +++ b/data/locale/ko-KR.ini @@ -0,0 +1,50 @@ +LocalVocalPlugin="로컬보컬 플러그인" +transcription_filterAudioFilter="로컬보컬 전사" +vad_enabled="VAD 활성화" +log_level="내부 로그 레벨" +log_words="콘솔에 로그 출력" +caption_to_stream="스트림 캡션" +step_by_step_processing="단계별 처리 (⚠️ 처리 시간 증가)" +step_size_msec="단계 크기 (ms)" +subtitle_sources="자막 출력" +none_no_output="없음 / 출력 없음" +text_file_output="텍스트 파일 출력" +output_filename="출력 파일명" +whisper_model="속삭임 모델" +external_model_file="외부 모델 파일" +whisper_parameters="고급 설정" +language="언어" +whisper_sampling_method="속삭임 샘플링 방법" +n_threads="스레드 수" +n_max_text_ctx="최대 텍스트 컨텍스트" +translate="번역" +no_context="컨텍스트 없음" +single_segment="단일 세그먼트" +print_special="특수 출력" +print_progress="진행 상황 출력" +print_realtime="실시간 출력" +print_timestamps="타임스탬프 출력" +token_timestamps="토큰 타임스탬프" +thold_pt="토큰 확률 임계값" +thold_ptsum="토큰 합 확률 임계값" +max_len="최대 길이(문자)" +split_on_word="단어로 분할" +max_tokens="최대 토큰" +speed_up="속도 향상" +initial_prompt="초기 프롬프트" +suppress_blank="공백 제거" +suppress_non_speech_tokens="비음성 토큰 제거" +temperature="온도" +max_initial_ts="최대 초기 타임스탬프" +length_penalty="길이 패널티" +save_srt="SRT 형식으로 저장" +truncate_output_file="새 문장에서 파일 잘라내기" +only_while_recording="녹음 중에만 출력 작성" +process_while_muted="소스가 음소거 상태일 때 음성 처리" +rename_file_to_match_recording="녹음과 일치하도록 파일 이름 변경" +min_sub_duration="최소. 
자막 지속 시간 (ms)" +advanced_settings="고급 설정" +target_language="대상 언어" +source_language="원본 언어" +translate="번역" +translate_add_context="컨텍스트와 함께 번역" diff --git a/data/locale/pl-PL.ini b/data/locale/pl-PL.ini new file mode 100644 index 0000000..d550b18 --- /dev/null +++ b/data/locale/pl-PL.ini @@ -0,0 +1,50 @@ +LocalVocalPlugin="Wtyczka LocalVocal" +transcription_filterAudioFilter="Transkrypcja LocalVocal" +vad_enabled="VAD Włączony" +log_level="Poziom logowania wewnętrznego" +log_words="Logowanie wyjścia do konsoli" +caption_to_stream="Podpisy strumienia" +step_by_step_processing="Przetwarzanie krok po kroku (⚠️ zwiększone przetwarzanie)" +step_size_msec="Rozmiar kroku (ms)" +subtitle_sources="Źródła napisów" +none_no_output="Brak / Brak wyjścia" +text_file_output="Wyjście pliku tekstowego" +output_filename="Nazwa pliku wyjściowego" +whisper_model="Model Whisper" +external_model_file="Zewnętrzny plik modelu" +whisper_parameters="Ustawienia zaawansowane" +language="Język" +whisper_sampling_method="Metoda próbkowania Whisper" +n_threads="Liczba wątków" +n_max_text_ctx="Maksymalny kontekst tekstu" +translate="Tłumacz" +no_context="Brak kontekstu" +single_segment="Pojedynczy segment" +print_special="Drukuj specjalne" +print_progress="Drukuj postęp" +print_realtime="Drukuj w czasie rzeczywistym" +print_timestamps="Drukuj znaczniki czasu" +token_timestamps="Znaczniki czasu tokenów" +thold_pt="Próg prawd. tokena" +thold_ptsum="Próg sumy prawd. 
tokena" +max_len="Maksymalna długość w znakach" +split_on_word="Podziel na słowo" +max_tokens="Maksymalna liczba tokenów" +speed_up="Przyspiesz" +initial_prompt="Początkowy monit" +suppress_blank="Tłumienie pustych" +suppress_non_speech_tokens="Tłumienie tokenów nie-mowy" +temperature="Temperatura" +max_initial_ts="Maksymalne początkowe znaczniki czasu" +length_penalty="Kara za długość" +save_srt="Zapisz w formacie SRT" +truncate_output_file="Skróć plik przy nowym zdaniu" +only_while_recording="Zapisuj wyjście tylko podczas nagrywania" +process_while_muted="Przetwarzaj mowę, gdy źródło jest wyciszone" +rename_file_to_match_recording="Zmień nazwę pliku, aby pasowała do nagrania" +min_sub_duration="Min. czas trwania napisów (ms)" +advanced_settings="Ustawienia zaawansowane" +target_language="Język docelowy" +source_language="Język źródłowy" +translate="Tłumacz" +translate_add_context="Tłumacz z kontekstem" diff --git a/data/locale/pt_BR.ini b/data/locale/pt-BR.ini similarity index 93% rename from data/locale/pt_BR.ini rename to data/locale/pt-BR.ini index f416835..6033e61 100644 --- a/data/locale/pt_BR.ini +++ b/data/locale/pt-BR.ini @@ -44,3 +44,7 @@ only_while_recording="Escreva durante a gravação" process_while_muted="Processar enquanto está silenciada" rename_file_to_match_recording="Renomear arquivo para corresponder à gravação" min_sub_duration="Duração mínima da legenda (msec)" +target_language="Língua alvo" +source_language="Língua de origem" +translate="Traduzir" +translate_add_context="Traduzir com contexto" diff --git a/data/locale/ru_RU.ini b/data/locale/ru-RU.ini similarity index 93% rename from data/locale/ru_RU.ini rename to data/locale/ru-RU.ini index 6d3ce3f..7918ad8 100644 --- a/data/locale/ru_RU.ini +++ b/data/locale/ru-RU.ini @@ -43,3 +43,7 @@ only_while_recording="Записывать вывод только во врем process_while_muted="Обрабатывать речь, пока источник отключен" rename_file_to_match_recording="Переименовать файл, чтобы соответствовать записи" 
min_sub_duration="Минимальная длительность субтитров (мс)" +target_language="Целевой язык" +source_language="Исходный язык" +translate="Перевести" +translate_add_context="Перевести с контекстом" diff --git a/data/locale/zh-CN.ini b/data/locale/zh-CN.ini new file mode 100644 index 0000000..2809ae7 --- /dev/null +++ b/data/locale/zh-CN.ini @@ -0,0 +1,50 @@ +LocalVocalPlugin="LocalVocal 插件" +transcription_filterAudioFilter="LocalVocal 转录" +vad_enabled="启用 VAD" +log_level="内部日志级别" +log_words="控制台日志输出" +caption_to_stream="流字幕" +step_by_step_processing="逐步处理(⚠️ 增加处理)" +step_size_msec="步长(毫秒)" +subtitle_sources="字幕输出" +none_no_output="无 / 无输出" +text_file_output="文本文件输出" +output_filename="输出文件名" +whisper_model="Whisper 模型" +external_model_file="外部模型文件" +whisper_parameters="高级设置" +language="语言" +whisper_sampling_method="Whisper 采样方法" +n_threads="线程数" +n_max_text_ctx="最大文本上下文" +translate="翻译" +no_context="无上下文" +single_segment="单一段落" +print_special="打印特殊" +print_progress="打印进度" +print_realtime="实时打印" +print_timestamps="打印时间戳" +token_timestamps="令牌时间戳" +thold_pt="令牌概率阈值" +thold_ptsum="令牌总概率阈值" +max_len="最大长度(字符)" +split_on_word="按单词分割" +max_tokens="最大令牌数" +speed_up="加速" +initial_prompt="初始提示" +suppress_blank="抑制空白" +suppress_non_speech_tokens="抑制非语音令牌" +temperature="温度" +max_initial_ts="最大初始时间戳" +length_penalty="长度惩罚" +save_srt="保存为 SRT 格式" +truncate_output_file="新句子时截断文件" +only_while_recording="仅在录制时写入输出" +process_while_muted="在源静音时处理语音" +rename_file_to_match_recording="将文件重命名以匹配录制" +min_sub_duration="最小字幕持续时间(毫秒)" +advanced_settings="高级设置" +target_language="目标语言" +source_language="源语言" +translate="翻译" +translate_add_context="带上下文翻译" diff --git a/data/models/ggml-model-whisper-tiny.en.bin b/data/models/ggml-model-whisper-tiny-en/ggml-model-whisper-tiny.en.bin similarity index 100% rename from data/models/ggml-model-whisper-tiny.en.bin rename to data/models/ggml-model-whisper-tiny-en/ggml-model-whisper-tiny.en.bin diff --git a/src/model-utils/model-downloader-types.h 
b/src/model-utils/model-downloader-types.h index 0ef81c3..3d24d96 100644 --- a/src/model-utils/model-downloader-types.h +++ b/src/model-utils/model-downloader-types.h @@ -1,3 +1,28 @@ +#ifndef MODEL_DOWNLOADER_TYPES_H +#define MODEL_DOWNLOADER_TYPES_H + +#include +#include +#include +#include typedef std::function download_finished_callback_t; + +struct ModelFileDownloadInfo { + std::string url; + std::string sha256; +}; + +enum ModelType { MODEL_TYPE_TRANSCRIPTION, MODEL_TYPE_TRANSLATION }; + +struct ModelInfo { + std::string friendly_name; + std::string local_folder_name; + ModelType type; + std::vector files; +}; + +extern std::map models_info; + +#endif /* MODEL_DOWNLOADER_TYPES_H */ diff --git a/src/model-utils/model-downloader-ui.cpp b/src/model-utils/model-downloader-ui.cpp index e53d0ab..023ccb6 100644 --- a/src/model-utils/model-downloader-ui.cpp +++ b/src/model-utils/model-downloader-ui.cpp @@ -5,15 +5,13 @@ #include -const std::string MODEL_BASE_PATH = "https://ggml.ggerganov.com/"; - size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) { size_t written = fwrite(ptr, size, nmemb, stream); return written; } -ModelDownloader::ModelDownloader(const std::string &model_name, +ModelDownloader::ModelDownloader(const ModelInfo &model_info, download_finished_callback_t download_finished_callback_, QWidget *parent) : QDialog(parent), @@ -30,7 +28,7 @@ ModelDownloader::ModelDownloader(const std::string &model_name, // Add a label for the model name QLabel *model_name_label = new QLabel(this); - model_name_label->setText(QString::fromStdString(model_name)); + model_name_label->setText(QString::fromStdString(model_info.friendly_name)); model_name_label->setAlignment(Qt::AlignCenter); this->layout->addWidget(model_name_label); @@ -43,7 +41,7 @@ ModelDownloader::ModelDownloader(const std::string &model_name, this->layout->addWidget(this->progress_bar); this->download_thread = new QThread(); - this->download_worker = new ModelDownloadWorker(model_name); 
+ this->download_worker = new ModelDownloadWorker(model_info); this->download_worker->moveToThread(this->download_thread); connect(this->download_thread, &QThread::started, this->download_worker, @@ -112,65 +110,96 @@ void ModelDownloader::show_error(const std::string &reason) this->download_finished_callback(1, ""); } -ModelDownloadWorker::ModelDownloadWorker(const std::string &model_name_) +ModelDownloadWorker::ModelDownloadWorker(const ModelInfo &model_info_) : model_info(model_info_) {} + +std::string get_filename_from_url(const std::string &url) { - this->model_name = model_name_; + auto lastSlashPos = url.find_last_of("/"); + auto queryPos = url.find("?", lastSlashPos); + if (queryPos == std::string::npos) { + return url.substr(lastSlashPos + 1); + } else { + return url.substr(lastSlashPos + 1, queryPos - lastSlashPos - 1); + } } void ModelDownloadWorker::download_model() { - char *module_config_path = obs_module_get_config_path(obs_current_module(), "models"); + char *config_folder = obs_module_get_config_path(obs_current_module(), "models"); + const std::filesystem::path module_config_models_folder = + std::filesystem::absolute(config_folder); + bfree(config_folder); + // Check if the config folder exists - if (!std::filesystem::exists(module_config_path)) { - obs_log(LOG_WARNING, "Config folder does not exist: %s", module_config_path); + if (!std::filesystem::exists(module_config_models_folder)) { + obs_log(LOG_WARNING, "Config folder does not exist: %s", + module_config_models_folder.string().c_str()); // Create the config folder - if (!std::filesystem::create_directories(module_config_path)) { + if (!std::filesystem::create_directories(module_config_models_folder)) { obs_log(LOG_ERROR, "Failed to create config folder: %s", - module_config_path); + module_config_models_folder.string().c_str()); emit download_error("Failed to create config folder."); return; } } - char *model_save_path_str = - obs_module_get_config_path(obs_current_module(), 
this->model_name.c_str()); - std::string model_save_path(model_save_path_str); - bfree(model_save_path_str); - obs_log(LOG_INFO, "Model save path: %s", model_save_path.c_str()); + const std::string model_local_config_path = + (module_config_models_folder / model_info.local_folder_name).string(); - // extract filename from path in this->modle_name - const std::string model_filename = - this->model_name.substr(this->model_name.find_last_of("/\\") + 1); + obs_log(LOG_INFO, "Model save path: %s", model_local_config_path.c_str()); - std::string model_url = MODEL_BASE_PATH + model_filename; - obs_log(LOG_INFO, "Model URL: %s", model_url.c_str()); + if (!std::filesystem::exists(model_local_config_path)) { + // model folder does not exist, create it + if (!std::filesystem::create_directories(model_local_config_path)) { + obs_log(LOG_ERROR, "Failed to create model folder: %s", + model_local_config_path.c_str()); + emit download_error("Failed to create model folder."); + return; + } + } CURL *curl = curl_easy_init(); if (curl) { - FILE *fp = fopen(model_save_path.c_str(), "wb"); - if (fp == nullptr) { - obs_log(LOG_ERROR, "Failed to open file %s.", model_save_path.c_str()); - emit download_error("Failed to open file."); - return; - } - curl_easy_setopt(curl, CURLOPT_URL, model_url.c_str()); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp); - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); - curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, - ModelDownloadWorker::progress_callback); - curl_easy_setopt(curl, CURLOPT_XFERINFODATA, this); - // Follow redirects - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - CURLcode res = curl_easy_perform(curl); - if (res != CURLE_OK) { - obs_log(LOG_ERROR, "Failed to download model %s.", - this->model_name.c_str()); - emit download_error("Failed to download model."); + for (auto &model_download_file : this->model_info.files) { + obs_log(LOG_INFO, "Model URL: %s", 
model_download_file.url.c_str()); + + const std::string model_filename = + get_filename_from_url(model_download_file.url); + const std::string model_file_save_path = + (std::filesystem::path(model_local_config_path) / model_filename) + .string(); + if (std::filesystem::exists(model_file_save_path)) { + obs_log(LOG_INFO, "Model file already exists: %s", + model_file_save_path.c_str()); + continue; + } + + FILE *fp = fopen(model_file_save_path.c_str(), "wb"); + if (fp == nullptr) { + obs_log(LOG_ERROR, "Failed to open model file for writing %s.", + model_file_save_path.c_str()); + emit download_error("Failed to open file."); + return; + } + curl_easy_setopt(curl, CURLOPT_URL, model_download_file.url.c_str()); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp); + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, + ModelDownloadWorker::progress_callback); + curl_easy_setopt(curl, CURLOPT_XFERINFODATA, this); + // Follow redirects + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + CURLcode res = curl_easy_perform(curl); + if (res != CURLE_OK) { + obs_log(LOG_ERROR, "Failed to download model file %s.", + model_filename.c_str()); + emit download_error("Failed to download model file."); + } + fclose(fp); } curl_easy_cleanup(curl); - fclose(fp); - emit download_finished(model_save_path); + emit download_finished(model_local_config_path); } else { obs_log(LOG_ERROR, "Failed to initialize curl."); emit download_error("Failed to initialize curl."); diff --git a/src/model-utils/model-downloader-ui.h b/src/model-utils/model-downloader-ui.h index d2e5fb2..20521b6 100644 --- a/src/model-utils/model-downloader-ui.h +++ b/src/model-utils/model-downloader-ui.h @@ -14,7 +14,7 @@ class ModelDownloadWorker : public QObject { Q_OBJECT public: - ModelDownloadWorker(const std::string &model_name); + ModelDownloadWorker(const ModelInfo &model_info_); ~ModelDownloadWorker(); public 
slots: @@ -28,13 +28,13 @@ public slots: private: static int progress_callback(void *clientp, curl_off_t dltotal, curl_off_t dlnow, curl_off_t ultotal, curl_off_t ulnow); - std::string model_name; + ModelInfo model_info; }; class ModelDownloader : public QDialog { Q_OBJECT public: - ModelDownloader(const std::string &model_name, + ModelDownloader(const ModelInfo &model_info, download_finished_callback_t download_finished_callback, QWidget *parent = nullptr); ~ModelDownloader(); diff --git a/src/model-utils/model-downloader.cpp b/src/model-utils/model-downloader.cpp index c83adff..ae3e8b1 100644 --- a/src/model-utils/model-downloader.cpp +++ b/src/model-utils/model-downloader.cpp @@ -12,46 +12,91 @@ #include -std::string find_model_file(const std::string &model_name) +std::string find_file_in_folder_by_name(const std::string &folder_path, + const std::string &file_name) { - const char *model_name_cstr = model_name.c_str(); - obs_log(LOG_INFO, "Checking if model %s exists in data...", model_name_cstr); + for (const auto &entry : std::filesystem::directory_iterator(folder_path)) { + if (entry.path().filename() == file_name) { + return entry.path().string(); + } + } + return ""; +} - char *model_file_path = obs_module_file(model_name_cstr); - if (model_file_path == nullptr) { - obs_log(LOG_INFO, "Model %s not found in data.", model_name_cstr); - } else { - std::string model_file_path_str(model_file_path); - bfree(model_file_path); - if (!std::filesystem::exists(model_file_path_str)) { - obs_log(LOG_INFO, "Model not found in data: %s", - model_file_path_str.c_str()); - } else { - obs_log(LOG_INFO, "Model found in data: %s", model_file_path_str.c_str()); - return model_file_path_str; +std::string find_bin_file_in_folder(const std::string &model_local_folder_path) +{ + // find .bin file in folder + for (const auto &entry : std::filesystem::directory_iterator(model_local_folder_path)) { + if (entry.path().extension() == ".bin") { + const std::string bin_file_path = 
entry.path().string(); + obs_log(LOG_INFO, "Model bin file found in folder: %s", + bin_file_path.c_str()); + return bin_file_path; } } + obs_log(LOG_ERROR, "Model bin file not found in folder: %s", + model_local_folder_path.c_str()); + return ""; +} + +std::string find_model_folder(const ModelInfo &model_info) +{ + char *data_folder_models = obs_module_file("models"); + const std::filesystem::path module_data_models_folder = + std::filesystem::absolute(data_folder_models); + bfree(data_folder_models); + + const std::string model_local_data_path = + (module_data_models_folder / model_info.local_folder_name).string(); + + obs_log(LOG_INFO, "Checking if model '%s' exists in data...", + model_info.friendly_name.c_str()); + + if (!std::filesystem::exists(model_local_data_path)) { + obs_log(LOG_INFO, "Model not found in data: %s", model_local_data_path.c_str()); + } else { + obs_log(LOG_INFO, "Model folder found in data: %s", model_local_data_path.c_str()); + return model_local_data_path; + } // Check if model exists in the config folder - char *model_config_path_str = - obs_module_get_config_path(obs_current_module(), model_name_cstr); - std::string model_config_path(model_config_path_str); - bfree(model_config_path_str); - obs_log(LOG_INFO, "Model path in config: %s", model_config_path.c_str()); - if (std::filesystem::exists(model_config_path)) { - obs_log(LOG_INFO, "Model exists in config folder: %s", model_config_path.c_str()); - return model_config_path; + char *config_folder = obs_module_get_config_path(obs_current_module(), "models"); + const std::filesystem::path module_config_models_folder = + std::filesystem::absolute(config_folder); + bfree(config_folder); + + obs_log(LOG_INFO, "Checking if model '%s' exists in config...", + model_info.friendly_name.c_str()); + + const std::string model_local_config_path = + (module_config_models_folder / model_info.local_folder_name).string(); + + obs_log(LOG_INFO, "Model path in config: %s", model_local_config_path.c_str()); 
+ if (std::filesystem::exists(model_local_config_path)) { + obs_log(LOG_INFO, "Model exists in config folder: %s", + model_local_config_path.c_str()); + return model_local_config_path; } - obs_log(LOG_INFO, "Model %s not found.", model_name_cstr); + obs_log(LOG_INFO, "Model '%s' not found.", model_info.friendly_name.c_str()); return ""; } -void download_model_with_ui_dialog(const std::string &model_name, +std::string find_model_bin_file(const ModelInfo &model_info) +{ + const std::string model_local_folder_path = find_model_folder(model_info); + if (model_local_folder_path.empty()) { + return ""; + } + + return find_bin_file_in_folder(model_local_folder_path); +} + +void download_model_with_ui_dialog(const ModelInfo &model_info, download_finished_callback_t download_finished_callback) { // Start the model downloader UI ModelDownloader *model_downloader = new ModelDownloader( - model_name, download_finished_callback, (QWidget *)obs_frontend_get_main_window()); + model_info, download_finished_callback, (QWidget *)obs_frontend_get_main_window()); model_downloader->show(); } diff --git a/src/model-utils/model-downloader.h b/src/model-utils/model-downloader.h index 09d07ab..b075d39 100644 --- a/src/model-utils/model-downloader.h +++ b/src/model-utils/model-downloader.h @@ -6,10 +6,14 @@ #include "model-downloader-types.h" -std::string find_model_file(const std::string &model_name); +std::string find_file_in_folder_by_name(const std::string &folder_path, + const std::string &file_name); +std::string find_bin_file_in_folder(const std::string &path); +std::string find_model_folder(const ModelInfo &model_info); +std::string find_model_bin_file(const ModelInfo &model_info); // Start the model downloader UI dialog with a callback for when the download is finished -void download_model_with_ui_dialog(const std::string &model_name, +void download_model_with_ui_dialog(const ModelInfo &model_info, download_finished_callback_t download_finished_callback); #endif // 
MODEL_DOWNLOADER_H diff --git a/src/model-utils/model-infos.cpp b/src/model-utils/model-infos.cpp new file mode 100644 index 0000000..cd00814 --- /dev/null +++ b/src/model-utils/model-infos.cpp @@ -0,0 +1,122 @@ +#include "model-downloader-types.h" + +std::map models_info = {{ + {"M2M-100 418M (495Mb)", + {"M2M-100 418M", + "m2m-100-418M", + MODEL_TYPE_TRANSLATION, + {{"https://huggingface.co/jncraton/m2m100_418M-ct2-int8/resolve/main/model.bin?download=true", + "D6703DD9F920FF896E45C3D97B490761BED5944937B90BBE6A7245F5652542D4"}, + { + "https://huggingface.co/jncraton/m2m100_418M-ct2-int8/resolve/main/config.json?download=true", + "4244772990E30069563E3DDFB4AD6DC95BDFD2AC3DE667EA8858C9B0A8433FA8", + }, + {"https://huggingface.co/jncraton/m2m100_418M-ct2-int8/resolve/main/generation_config.json?download=true", + "AED76366507333DDBB8BD49960F23C82FE6446B3319A46A54BEFDB45324CCF61"}, + {"https://huggingface.co/jncraton/m2m100_418M-ct2-int8/resolve/main/shared_vocabulary.json?download=true", + "7EB5D0FF184C6095C7C10F9911C0AEA492250ABD12854F9C3D787C64B1C6397E"}, + {"https://huggingface.co/jncraton/m2m100_418M-ct2-int8/resolve/main/special_tokens_map.json?download=true", + "C1A4F86C3874D279AE1B2A05162858DB5DD6C61665D84223ED886CBCFF08FDA6"}, + {"https://huggingface.co/jncraton/m2m100_418M-ct2-int8/resolve/main/tokenizer_config.json?download=true", + "AE54F15F0649BB05041CDADAD8485BA1FAF40BC33E6B4C2A74AE2D1AE5710FA2"}, + {"https://huggingface.co/jncraton/m2m100_418M-ct2-int8/resolve/main/vocab.json?download=true", + "B6E77E474AEEA8F441363ACA7614317C06381F3EACFE10FB9856D5081D1074CC"}, + {"https://huggingface.co/jncraton/m2m100_418M-ct2-int8/resolve/main/sentencepiece.bpe.model?download=true", + "D8F7C76ED2A5E0822BE39F0A4F95A55EB19C78F4593CE609E2EDBC2AEA4D380A"}}}}, + {"Whisper Base q5 (57Mb)", + {"Whisper Base q5", + "whisper-base-q5", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-base-q5_1.bin", + 
"422F1AE452ADE6F30A004D7E5C6A43195E4433BC370BF23FAC9CC591F01A8898"}}}}, + {"Whisper Base En q5 (57Mb)", + {"Whisper Base En q5", + "ggml-model-whisper-base-en-q5_1", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin", + "4BAF70DD0D7C4247BA2B81FAFD9C01005AC77C2F9EF064E00DCF195D0E2FDD2F"}}}}, + {"Whisper Base (141Mb)", + {"Whisper Base", + "ggml-model-whisper-base", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-base.bin", + "60ED5BC3DD14EEA856493D334349B405782DDCAF0028D4B5DF4088345FBA2EFE"}}}}, + {"Whisper Base En (141Mb)", + {"Whisper Base En", + "ggml-model-whisper-base-en", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-base.en.bin", + "A03779C86DF3323075F5E796CB2CE5029F00EC8869EEE3FDFB897AFE36C6D002"}}}}, + {"Whisper Large q5 (1Gb)", + {"Whisper Large q5", + "ggml-model-whisper-large-q5_0", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-large-q5_0.bin", + "3A214837221E4530DBC1FE8D734F302AF393EB30BD0ED046042EBF4BAF70F6F2"}}}}, + {"Whisper Medium q5 (514Mb)", + {"Whisper Medium q5", + "ggml-model-whisper-medium-q5_0", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-medium-q5_0.bin", + "19FEA4B380C3A618EC4723C3EEF2EB785FFBA0D0538CF43F8F235E7B3B34220F"}}}}, + {"Whisper Medium En q5 (514Mb)", + {"Whisper Medium En q5", + "ggml-model-whisper-medium-en-q5_0", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin", + "76733E26AD8FE1C7A5BF7531A9D41917B2ADC0F20F2E4F5531688A8C6CD88EB0"}}}}, + {"Whisper Small q5 (181Mb)", + {"Whisper Small q5", + "ggml-model-whisper-small-q5_1", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-small-q5_1.bin", + "AE85E4A935D7A567BD102FE55AFC16BB595BDB618E11B2FC7591BC08120411BB"}}}}, + {"Whisper Small En q5 (181Mb)", + {"Whisper Small En q5", + "ggml-model-whisper-small-en-q5_1", + 
MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin", + "BFDFF4894DCB76BBF647D56263EA2A96645423F1669176F4844A1BF8E478AD30"}}}}, + {"Whisper Small (465Mb)", + {"Whisper Small", + "ggml-model-whisper-small", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-small.bin", + "1BE3A9B2063867B937E64E2EC7483364A79917E157FA98C5D94B5C1FFFEA987B"}}}}, + {"Whisper Small En (465Mb)", + {"Whisper Small En", + "ggml-model-whisper-small-en", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-small.en.bin", + "C6138D6D58ECC8322097E0F987C32F1BE8BB0A18532A3F88F734D1BBF9C41E5D"}}}}, + {"Whisper Tiny (74Mb)", + {"Whisper Tiny", + "ggml-model-whisper-tiny", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-tiny.bin", + "BE07E048E1E599AD46341C8D2A135645097A538221678B7ACDD1B1919C6E1B21"}}}}, + {"Whisper Tiny q5 (31Mb)", + {"Whisper Tiny q5", + "ggml-model-whisper-tiny-q5_1", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin", + "818710568DA3CA15689E31A743197B520007872FF9576237BDA97BD1B469C3D7"}}}}, + {"Whisper Tiny En q5 (31Mb)", + {"Whisper Tiny En q5", + "ggml-model-whisper-tiny-en-q5_1", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin", + "C77C5766F1CEF09B6B7D47F21B546CBDDD4157886B3B5D6D4F709E91E66C7C2B"}}}}, + {"Whisper Tiny En q8 (42Mb)", + {"Whisper Tiny En q8", + "ggml-model-whisper-tiny-en-q8_0", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-tiny.en-q8_0.bin", + "5BC2B3860AA151A4C6E7BB095E1FCCE7CF12C7B020CA08DCEC0C6D018BB7DD94"}}}}, + {"Whisper Tiny En (74Mb)", + {"Whisper Tiny En", + "ggml-model-whisper-tiny-en", + MODEL_TYPE_TRANSCRIPTION, + {{"https://ggml.ggerganov.com/ggml-model-whisper-tiny.en.bin", + "921E4CF8686FDD993DCD081A5DA5B6C365BFDE1162E72B08D75AC75289920B1F"}}}}, +}}; diff --git a/src/transcription-filter-data.h 
b/src/transcription-filter-data.h index f370765..90621fb 100644 --- a/src/transcription-filter-data.h +++ b/src/transcription-filter-data.h @@ -15,6 +15,8 @@ #include #include +#include "translation/translation.h" + #define MAX_PREPROC_CHANNELS 10 #define MT_ obs_module_text @@ -63,7 +65,7 @@ struct transcription_filter_data { audio_resampler_t *resampler; /* whisper */ - char *whisper_model_path; + std::string whisper_model_path; struct whisper_context *whisper_context; whisper_full_params whisper_params; @@ -80,6 +82,12 @@ struct transcription_filter_data { bool save_only_while_recording = false; bool process_while_muted = false; bool rename_file_to_match_recording = false; + bool translate = false; + std::string source_lang; + std::string target_lang; + + // Last transcription result + std::string last_text; // Text source to output the subtitles obs_weak_source_t *text_source; @@ -98,6 +106,9 @@ struct transcription_filter_data { std::mutex *whisper_ctx_mutex; std::condition_variable *wshiper_thread_cv; + // translation context + struct translation_context translation_ctx; + // ctor transcription_filter_data() { @@ -107,7 +118,7 @@ struct transcription_filter_data { } context = nullptr; resampler = nullptr; - whisper_model_path = nullptr; + whisper_model_path = ""; whisper_context = nullptr; text_source = nullptr; text_source_mutex = nullptr; diff --git a/src/transcription-filter.cpp b/src/transcription-filter.cpp index 42ccf5d..b3df64f 100644 --- a/src/transcription-filter.cpp +++ b/src/transcription-filter.cpp @@ -8,6 +8,8 @@ #include "whisper-utils/whisper-processing.h" #include "whisper-utils/whisper-language.h" #include "whisper-utils/whisper-utils.h" +#include "translation/language_codes.h" +#include "translation/translation.h" #include #include @@ -126,7 +128,7 @@ void transcription_filter_destroy(void *data) struct transcription_filter_data *gf = static_cast(data); - obs_log(gf->log_level, "transcription_filter_destroy"); + obs_log(gf->log_level, 
"filter destroy"); shutdown_whisper_thread(gf); if (gf->text_source_name) { @@ -274,6 +276,32 @@ void set_text_callback(struct transcription_filter_data *gf, std::string str_copy = result.text; #endif + // remove trailing spaces, newlines, tabs or punctuation + str_copy.erase(std::find_if(str_copy.rbegin(), str_copy.rend(), + [](unsigned char ch) { + return !std::isspace(ch) || !std::ispunct(ch); + }) + .base(), + str_copy.end()); + + if (gf->translate) { + obs_log(gf->log_level, "Translating text. %s -> %s", gf->source_lang.c_str(), + gf->target_lang.c_str()); + std::string translated_text; + if (translate(gf->translation_ctx, str_copy, gf->source_lang, gf->target_lang, + translated_text) == OBS_POLYGLOT_TRANSLATION_SUCCESS) { + if (gf->log_words) { + obs_log(LOG_INFO, "Translation: '%s' -> '%s'", str_copy.c_str(), + translated_text.c_str()); + } + str_copy = translated_text; + } else { + obs_log(gf->log_level, "Failed to translate text"); + } + } + + gf->last_text = str_copy; + if (gf->caption_to_stream) { obs_output_t *streaming_output = obs_frontend_get_streaming_output(); if (streaming_output) { @@ -366,7 +394,7 @@ void transcription_filter_update(void *data, obs_data_t *s) static_cast(data); gf->log_level = (int)obs_data_get_int(s, "log_level"); - obs_log(gf->log_level, "transcription_filter_update"); + obs_log(gf->log_level, "filter update"); gf->vad_enabled = obs_data_get_bool(s, "vad_enabled"); gf->log_words = obs_data_get_bool(s, "log_words"); @@ -387,7 +415,20 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->min_sub_duration = (int)obs_data_get_int(s, "min_sub_duration"); gf->last_sub_render_time = 0; - obs_log(gf->log_level, "transcription_filter: update text source"); + bool new_translate = obs_data_get_bool(s, "translate"); + gf->source_lang = obs_data_get_string(s, "translate_source_language"); + gf->target_lang = obs_data_get_string(s, "translate_target_language"); + gf->translation_ctx.add_context = obs_data_get_bool(s, 
"translate_add_context"); + + if (new_translate != gf->translate) { + if (new_translate) { + start_translation(gf); + } else { + gf->translate = false; + } + } + + obs_log(gf->log_level, "update text source"); // update the text source const char *new_text_source_name = obs_data_get_string(s, "subtitle_sources"); obs_weak_source_t *old_weak_text_source = NULL; @@ -451,10 +492,10 @@ void transcription_filter_update(void *data, obs_data_t *s) return; } - obs_log(gf->log_level, "transcription_filter: update whisper model"); + obs_log(gf->log_level, "update whisper model"); update_whsiper_model_path(gf, s); - obs_log(gf->log_level, "transcription_filter: update whisper params"); + obs_log(gf->log_level, "update whisper params"); std::lock_guard lock(*gf->whisper_ctx_mutex); gf->whisper_params = whisper_full_default_params( @@ -464,7 +505,7 @@ void transcription_filter_update(void *data, obs_data_t *s) gf->whisper_params.initial_prompt = obs_data_get_string(s, "initial_prompt"); gf->whisper_params.n_threads = (int)obs_data_get_int(s, "n_threads"); gf->whisper_params.n_max_text_ctx = (int)obs_data_get_int(s, "n_max_text_ctx"); - gf->whisper_params.translate = obs_data_get_bool(s, "translate"); + gf->whisper_params.translate = obs_data_get_bool(s, "whisper_translate"); gf->whisper_params.no_context = obs_data_get_bool(s, "no_context"); gf->whisper_params.single_segment = obs_data_get_bool(s, "single_segment"); gf->whisper_params.print_special = obs_data_get_bool(s, "print_special"); @@ -488,7 +529,7 @@ void transcription_filter_update(void *data, obs_data_t *s) void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) { - obs_log(LOG_INFO, "transcription filter create"); + obs_log(LOG_INFO, "LocalVocal filter create"); struct transcription_filter_data *gf = new transcription_filter_data(); @@ -528,10 +569,10 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) gf->overlap_ms = (int)obs_data_get_int(settings, 
"overlap_size_msec"); gf->overlap_frames = (size_t)((float)gf->sample_rate / (1000.0f / (float)gf->overlap_ms)); - obs_log(gf->log_level, "transcription_filter: channels %d, frames %d, sample_rate %d", - (int)gf->channels, (int)gf->frames, gf->sample_rate); + obs_log(gf->log_level, "channels %d, frames %d, sample_rate %d", (int)gf->channels, + (int)gf->frames, gf->sample_rate); - obs_log(gf->log_level, "transcription_filter: setup audio resampler"); + obs_log(gf->log_level, "setup audio resampler"); struct resample_info src, dst; src.samples_per_sec = gf->sample_rate; src.format = AUDIO_FORMAT_FLOAT_PLANAR; @@ -543,12 +584,12 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) gf->resampler = audio_resampler_create(&dst, &src); - obs_log(gf->log_level, "transcription_filter: setup mutexes and condition variables"); + obs_log(gf->log_level, "setup mutexes and condition variables"); gf->whisper_buf_mutex = new std::mutex(); gf->whisper_ctx_mutex = new std::mutex(); gf->wshiper_thread_cv = new std::condition_variable(); gf->text_source_mutex = new std::mutex(); - obs_log(gf->log_level, "transcription_filter: clear text source data"); + obs_log(gf->log_level, "clear text source data"); gf->text_source = nullptr; const char *subtitle_sources = obs_data_get_string(settings, "subtitle_sources"); if (subtitle_sources != nullptr) { @@ -556,13 +597,13 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) } else { gf->text_source_name = nullptr; } - obs_log(gf->log_level, "transcription_filter: clear paths and whisper context"); + obs_log(gf->log_level, "clear paths and whisper context"); gf->whisper_model_file_currently_loaded = ""; gf->output_file_path = std::string(""); - gf->whisper_model_path = nullptr; // The update function will set the model path + gf->whisper_model_path = std::string(""); // The update function will set the model path gf->whisper_context = nullptr; - obs_log(gf->log_level, "transcription_filter: 
run update"); + obs_log(gf->log_level, "run update"); // get the settings updated on the filter data struct transcription_filter_update(gf, settings); @@ -610,7 +651,7 @@ void *transcription_filter_create(obs_data_t *settings, obs_source_t *filter) }, gf); - obs_log(gf->log_level, "transcription_filter: filter created."); + obs_log(gf->log_level, "filter created."); return gf; } @@ -634,7 +675,7 @@ void transcription_filter_activate(void *data) { struct transcription_filter_data *gf = static_cast(data); - obs_log(gf->log_level, "transcription_filter filter activated"); + obs_log(gf->log_level, "filter activated"); gf->active = true; } @@ -642,20 +683,19 @@ void transcription_filter_deactivate(void *data) { struct transcription_filter_data *gf = static_cast(data); - obs_log(gf->log_level, "transcription_filter filter deactivated"); + obs_log(gf->log_level, "filter deactivated"); gf->active = false; } void transcription_filter_defaults(obs_data_t *s) { - obs_log(LOG_INFO, "transcription_filter_defaults"); + obs_log(LOG_INFO, "filter defaults"); obs_data_set_default_bool(s, "vad_enabled", true); obs_data_set_default_int(s, "log_level", LOG_DEBUG); obs_data_set_default_bool(s, "log_words", true); obs_data_set_default_bool(s, "caption_to_stream", false); - obs_data_set_default_string(s, "whisper_model_path", - "models/ggml-model-whisper-tiny.en.bin"); + obs_data_set_default_string(s, "whisper_model_path", "Whisper Tiny En (74Mb)"); obs_data_set_default_string(s, "whisper_language_select", "en"); obs_data_set_default_string(s, "subtitle_sources", "none"); obs_data_set_default_bool(s, "step_by_step_processing", false); @@ -669,14 +709,18 @@ void transcription_filter_defaults(obs_data_t *s) obs_data_set_default_int(s, "step_size_msec", 1000); obs_data_set_default_int(s, "min_sub_duration", 3000); obs_data_set_default_bool(s, "advanced_settings", false); + obs_data_set_default_bool(s, "translate", false); + obs_data_set_default_string(s, "translate_target_language", 
"__es__"); + obs_data_set_default_string(s, "translate_source_language", "__en__"); + obs_data_set_default_bool(s, "translate_add_context", true); // Whisper parameters obs_data_set_default_int(s, "whisper_sampling_method", WHISPER_SAMPLING_BEAM_SEARCH); obs_data_set_default_string(s, "initial_prompt", ""); obs_data_set_default_int(s, "n_threads", 4); obs_data_set_default_int(s, "n_max_text_ctx", 16384); - obs_data_set_default_bool(s, "translate", false); - obs_data_set_default_bool(s, "no_context", true); + obs_data_set_default_bool(s, "whisper_translate", false); + obs_data_set_default_bool(s, "no_context", false); obs_data_set_default_bool(s, "single_segment", true); obs_data_set_default_bool(s, "print_special", false); obs_data_set_default_bool(s, "print_progress", false); @@ -698,7 +742,7 @@ void transcription_filter_defaults(obs_data_t *s) obs_properties_t *transcription_filter_properties(void *data) { - obs_log(LOG_INFO, "transcription_filter_properties"); + obs_log(LOG_DEBUG, "Add filter properties"); struct transcription_filter_data *gf = static_cast(data); @@ -730,6 +774,43 @@ obs_properties_t *transcription_filter_properties(void *data) return true; }); + // add translation option group + obs_properties_t *translation_group = obs_properties_create(); + obs_property_t *translation_group_prop = obs_properties_add_group( + ppts, "translate", MT_("translate"), OBS_GROUP_CHECKABLE, translation_group); + // add target language selection + obs_property_t *prop_tgt = obs_properties_add_list( + translation_group, "translate_target_language", MT_("target_language"), + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); + obs_property_t *prop_src = obs_properties_add_list( + translation_group, "translate_source_language", MT_("source_language"), + OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); + obs_properties_add_bool(translation_group, "translate_add_context", + MT_("translate_add_context")); + + // Populate the dropdown with the language codes + for (const auto 
&language : language_codes) { + obs_property_list_add_string(prop_tgt, language.second.c_str(), + language.first.c_str()); + obs_property_list_add_string(prop_src, language.second.c_str(), + language.first.c_str()); + } + + // add callback to enable/disable translation group + obs_property_set_modified_callback(translation_group_prop, [](obs_properties_t *props, + obs_property_t *property, + obs_data_t *settings) { + UNUSED_PARAMETER(property); + // Show/Hide the translation group + const bool translate_enabled = obs_data_get_bool(settings, "translate"); + for (const auto &prop : {"translate_target_language", "translate_source_language", + "translate_add_context"}) { + obs_property_set_visible(obs_properties_get(props, prop), + translate_enabled); + } + return true; + }); + obs_properties_add_bool(ppts, "process_while_muted", MT_("process_while_muted")); obs_property_t *subs_output = obs_properties_add_list(ppts, "subtitle_sources", MT_("subtitle_sources"), @@ -754,39 +835,14 @@ obs_properties_t *transcription_filter_properties(void *data) obs_property_t *whisper_models_list = obs_properties_add_list(ppts, "whisper_model_path", MT_("whisper_model"), OBS_COMBO_TYPE_LIST, OBS_COMBO_FORMAT_STRING); + // Add models from models_info map + for (const auto &model_info : models_info) { + if (model_info.second.type == MODEL_TYPE_TRANSCRIPTION) { + obs_property_list_add_string(whisper_models_list, model_info.first.c_str(), + model_info.first.c_str()); + } + } - obs_property_list_add_string(whisper_models_list, "Base q5 57M", - "models/ggml-model-whisper-base-q5_1.bin"); - obs_property_list_add_string(whisper_models_list, "Base 141M", - "models/ggml-model-whisper-base.bin"); - obs_property_list_add_string(whisper_models_list, "Base (Eng) q5 57M", - "models/ggml-model-whisper-base.en-q5_1.bin"); - obs_property_list_add_string(whisper_models_list, "Base (Eng) 141M", - "models/ggml-model-whisper-base.en.bin"); - obs_property_list_add_string(whisper_models_list, "Large q5 1G", - 
"models/ggml-model-whisper-large-q5_0.bin"); - obs_property_list_add_string(whisper_models_list, "Medium q5 514M", - "models/ggml-model-whisper-medium-q5_0.bin"); - obs_property_list_add_string(whisper_models_list, "Medium (Eng) 514M", - "models/ggml-model-whisper-medium.en-q5_0.bin"); - obs_property_list_add_string(whisper_models_list, "Small q5 181M", - "models/ggml-model-whisper-small-q5_1.bin"); - obs_property_list_add_string(whisper_models_list, "Small 465M", - "models/ggml-model-whisper-small.bin"); - obs_property_list_add_string(whisper_models_list, "Small (Eng) q5 181M", - "models/ggml-model-whisper-small.en-q5_1.bin"); - obs_property_list_add_string(whisper_models_list, "Small (Eng) 465M", - "models/ggml-model-whisper-small.en.bin"); - obs_property_list_add_string(whisper_models_list, "Tiny q5 31M", - "models/ggml-model-whisper-tiny-q5_1.bin"); - obs_property_list_add_string(whisper_models_list, "Tiny 74M", - "models/ggml-model-whisper-tiny.bin"); - obs_property_list_add_string(whisper_models_list, "Tiny (Eng) q5 31M", - "models/ggml-model-whisper-tiny.en-q5_1.bin"); - obs_property_list_add_string(whisper_models_list, "Tiny (Eng) q8 42M", - "models/ggml-model-whisper-tiny.en-q8_0.bin"); - obs_property_list_add_string(whisper_models_list, "Tiny (Eng) 74M", - "models/ggml-model-whisper-tiny.en.bin"); obs_property_list_add_string(whisper_models_list, "Load external model file", "!!!external!!!"); @@ -888,7 +944,7 @@ obs_properties_t *transcription_filter_properties(void *data) // int offset_ms; // start offset in ms // int duration_ms; // audio duration to process in ms // bool translate; - obs_properties_add_bool(whisper_params_group, "translate", MT_("translate")); + obs_properties_add_bool(whisper_params_group, "whisper_translate", MT_("translate")); // bool no_context; // do not use past transcription (if any) as initial prompt for the decoder obs_properties_add_bool(whisper_params_group, "no_context", MT_("no_context")); // bool single_segment; // force 
single segment output (useful for streaming) diff --git a/src/translation/language_codes.h b/src/translation/language_codes.h new file mode 100644 index 0000000..7922446 --- /dev/null +++ b/src/translation/language_codes.h @@ -0,0 +1,205 @@ +#pragma once + +#include +#include + +std::map language_codes = {{"__af__", "Afrikaans"}, + {"__am__", "Amharic"}, + {"__ar__", "Arabic"}, + {"__ast__", "Asturian"}, + {"__az__", "Azerbai"}, + {"__ba__", "Bashkir"}, + {"__be__", "Belarusian"}, + {"__bg__", "Bulgarian"}, + {"__bn__", "Bengali"}, + {"__br__", "Breton"}, + {"__bs__", "Bosnian"}, + {"__ca__", "Catalan"}, + {"__ceb__", "Cebuano"}, + {"__cs__", "Czech"}, + {"__cy__", "Welsh"}, + {"__da__", "Danish"}, + {"__de__", "German"}, + {"__el__", "Greek"}, + {"__en__", "English"}, + {"__es__", "Spanish"}, + {"__et__", "Estonian"}, + {"__fa__", "Persian"}, + {"__ff__", "Fulah"}, + {"__fi__", "Finnish"}, + {"__fr__", "French"}, + {"__fy__", "Frisian"}, + {"__ga__", "Irish"}, + {"__gd__", "Scottish Gaelic"}, + {"__gl__", "Galician"}, + {"__gu__", "Gujarati"}, + {"__ha__", "Hausa"}, + {"__he__", "Hebrew"}, + {"__hi__", "Hindi"}, + {"__hr__", "Croatian"}, + {"__ht__", "Haitian Creole"}, + {"__hu__", "Hungarian"}, + {"__hy__", "Armenian"}, + {"__id__", "Indonesian"}, + {"__ig__", "Igbo"}, + {"__ilo__", "Ilokano"}, + {"__is__", "Icelandic"}, + {"__it__", "Italian"}, + {"__ja__", "Japanese"}, + {"__jv__", "Javanese"}, + {"__ka__", "Georgian"}, + {"__kk__", "Kazakh"}, + {"__km__", "Khmer"}, + {"__kn__", "Kannada"}, + {"__ko__", "Korean"}, + {"__lb__", "Luxembourgish"}, + {"__lg__", "Ganda"}, + {"__ln__", "Lingala"}, + {"__lo__", "Lao"}, + {"__lt__", "Lithuanian"}, + {"__lv__", "Latvian"}, + {"__mg__", "Malagasy"}, + {"__mk__", "Macedonian"}, + {"__ml__", "Malayalam"}, + {"__mn__", "Mongolian"}, + {"__mr__", "Marathi"}, + {"__ms__", "Malay"}, + {"__my__", "Burmese"}, + {"__ne__", "Nepali"}, + {"__nl__", "Dutch"}, + {"__no__", "Norwegian"}, + {"__ns__", "Northern Sotho"}, + {"__oc__", 
"Occitan"}, + {"__or__", "Oriya"}, + {"__pa__", "Punjabi"}, + {"__pl__", "Polish"}, + {"__ps__", "Pashto"}, + {"__pt__", "Portuguese"}, + {"__ro__", "Romanian"}, + {"__ru__", "Russian"}, + {"__sd__", "Sindhi"}, + {"__si__", "Sinhala"}, + {"__sk__", "Slovak"}, + {"__sl__", "Slovenian"}, + {"__so__", "Somali"}, + {"__sq__", "Albanian"}, + {"__sr__", "Serbian"}, + {"__ss__", "Swati"}, + {"__su__", "Sundanese"}, + {"__sv__", "Swedish"}, + {"__sw__", "Swahili"}, + {"__ta__", "Tamil"}, + {"__th__", "Thai"}, + {"__tl__", "Tagalog"}, + {"__tn__", "Tswana"}, + {"__tr__", "Turkish"}, + {"__uk__", "Ukrainian"}, + {"__ur__", "Urdu"}, + {"__uz__", "Uzbek"}, + {"__vi__", "Vietnamese"}, + {"__wo__", "Wolof"}, + {"__xh__", "Xhosa"}, + {"__yi__", "Yiddish"}, + {"__yo__", "Yoruba"}, + {"__zh__", "Chinese"}, + {"__zu__", "Zulu"}}; + +std::map language_codes_reverse = {{"Afrikaans", "__af__"}, + {"Amharic", "__am__"}, + {"Arabic", "__ar__"}, + {"Asturian", "__ast__"}, + {"Azerbai", "__az__"}, + {"Bashkir", "__ba__"}, + {"Belarusian", "__be__"}, + {"Bengali", "__bn__"}, + {"Breton", "__br__"}, + {"Bosnian", "__bs__"}, + {"Catalan", "__ca__"}, + {"Cebuano", "__ceb__"}, + {"Czech", "__cs__"}, + {"Welsh", "__cy__"}, + {"Danish", "__da__"}, + {"German", "__de__"}, + {"Greek", "__el__"}, + {"English", "__en__"}, + {"Spanish", "__es__"}, + {"Estonian", "__et__"}, + {"Persian", "__fa__"}, + {"Fulah", "__ff__"}, + {"Finnish", "__fi__"}, + {"French", "__fr__"}, + {"Frisian", "__fy__"}, + {"Irish", "__ga__"}, + {"Scottish Gaelic", "__gd__"}, + {"Galician", "__gl__"}, + {"Gujarati", "__gu__"}, + {"Hausa", "__ha__"}, + {"Hebrew", "__he__"}, + {"Hindi", "__hi__"}, + {"Croatian", "__hr__"}, + {"Haitian Creole", "__ht__"}, + {"Hungarian", "__hu__"}, + {"Armenian", "__hy__"}, + {"Indonesian", "__id__"}, + {"Igbo", "__ig__"}, + {"Ilokano", "__ilo__"}, + {"Icelandic", "__is__"}, + {"Italian", "__it__"}, + {"Japanese", "__ja__"}, + {"Javanese", "__jv__"}, + {"Georgian", "__ka__"}, + {"Kazakh", "__kk__"}, 
+ {"Khmer", "__km__"}, + {"Kannada", "__kn__"}, + {"Korean", "__ko__"}, + {"Luxembourgish", "__lb__"}, + {"Ganda", "__lg__"}, + {"Lingala", "__ln__"}, + {"Lao", "__lo__"}, + {"Lithuanian", "__lt__"}, + {"Latvian", "__lv__"}, + {"Malagasy", "__mg__"}, + {"Macedonian", "__mk__"}, + {"Malayalam", "__ml__"}, + {"Mongolian", "__mn__"}, + {"Marathi", "__mr__"}, + {"Malay", "__ms__"}, + {"Burmese", "__my__"}, + {"Nepali", "__ne__"}, + {"Dutch", "__nl__"}, + {"Norwegian", "__no__"}, + {"Northern Sotho", "__ns__"}, + {"Occitan", "__oc__"}, + {"Oriya", "__or__"}, + {"Punjabi", "__pa__"}, + {"Polish", "__pl__"}, + {"Pashto", "__ps__"}, + {"Portuguese", "__pt__"}, + {"Romanian", "__ro__"}, + {"Russian", "__ru__"}, + {"Sindhi", "__sd__"}, + {"Sinhala", "__si__"}, + {"Slovak", "__sk__"}, + {"Slovenian", "__sl__"}, + {"Somali", "__so__"}, + {"Albanian", "__sq__"}, + {"Serbian", "__sr__"}, + {"Swati", "__ss__"}, + {"Sundanese", "__su__"}, + {"Swedish", "__sv__"}, + {"Swahili", "__sw__"}, + {"Tamil", "__ta__"}, + {"Thai", "__th__"}, + {"Tagalog", "__tl__"}, + {"Tswana", "__tn__"}, + {"Turkish", "__tr__"}, + {"Ukrainian", "__uk__"}, + {"Urdu", "__ur__"}, + {"Uzbek", "__uz__"}, + {"Vietnamese", "__vi__"}, + {"Wolof", "__wo__"}, + {"Xhosa", "__xh__"}, + {"Yiddish", "__yi__"}, + {"Yoruba", "__yo__"}, + {"Chinese", "__zh__"}, + {"Zulu", "__zu__"}}; diff --git a/src/translation/translation.cpp b/src/translation/translation.cpp new file mode 100644 index 0000000..95e58ae --- /dev/null +++ b/src/translation/translation.cpp @@ -0,0 +1,154 @@ +#include "translation.h" +#include "plugin-support.h" +#include "model-utils/model-downloader.h" +#include "transcription-filter-data.h" + +#include +#include +#include +#include + +void build_and_enable_translation(struct transcription_filter_data *gf, + const std::string &model_file_path) +{ + gf->translation_ctx.local_model_folder_path = model_file_path; + if (build_translation_context(gf->translation_ctx) == + OBS_POLYGLOT_TRANSLATION_INIT_SUCCESS) 
{ + obs_log(LOG_INFO, "Enable translation"); + gf->translate = true; + } else { + obs_log(LOG_ERROR, "Failed to load CT2 model"); + gf->translate = false; + } +} + +void start_translation(struct transcription_filter_data *gf) +{ + obs_log(LOG_INFO, "Starting translation..."); + + const ModelInfo &translation_model_info = models_info["M2M-100 418M (495Mb)"]; + std::string model_file_found = find_model_folder(translation_model_info); + if (model_file_found == "") { + obs_log(LOG_INFO, "Translation CT2 model does not exist. Downloading..."); + download_model_with_ui_dialog( + translation_model_info, + [gf, model_file_found](int download_status, const std::string &path) { + if (download_status == 0) { + obs_log(LOG_INFO, "CT2 model download complete"); + build_and_enable_translation(gf, path); + } else { + obs_log(LOG_ERROR, "Model download failed"); + gf->translate = false; + } + }); + } else { + // Model exists, just load it + build_and_enable_translation(gf, model_file_found); + } +} + +int build_translation_context(struct translation_context &translation_ctx) +{ + std::string local_model_path = translation_ctx.local_model_folder_path; + obs_log(LOG_INFO, "Building translation context from '%s'...", local_model_path.c_str()); + // find the SPM file in the model folder + std::string local_spm_path = + find_file_in_folder_by_name(local_model_path, "sentencepiece.bpe.model"); + + try { + obs_log(LOG_INFO, "Loading SPM from %s", local_spm_path.c_str()); + translation_ctx.processor.reset(new sentencepiece::SentencePieceProcessor()); + const auto status = translation_ctx.processor->Load(local_spm_path); + if (!status.ok()) { + obs_log(LOG_ERROR, "Failed to load SPM: %s", status.ToString().c_str()); + return OBS_POLYGLOT_TRANSLATION_INIT_FAIL; + } + + translation_ctx.tokenizer = [&translation_ctx](const std::string &text) { + std::vector tokens; + translation_ctx.processor->Encode(text, &tokens); + return tokens; + }; + translation_ctx.detokenizer = + 
[&translation_ctx](const std::vector &tokens) { + std::string text; + translation_ctx.processor->Decode(tokens, &text); + return std::regex_replace(text, std::regex(""), "UNK"); + }; + + obs_log(LOG_INFO, "Loading CT2 model from %s", local_model_path.c_str()); + +#ifdef POLYGLOT_WITH_CUDA + ctranslate2::Device device = ctranslate2::Device::CUDA; + obs_log(LOG_INFO, "CT2 Using CUDA"); +#else + ctranslate2::Device device = ctranslate2::Device::CPU; + obs_log(LOG_INFO, "CT2 Using CPU"); +#endif + + translation_ctx.translator.reset(new ctranslate2::Translator( + local_model_path, device, ctranslate2::ComputeType::AUTO)); + obs_log(LOG_INFO, "CT2 Model loaded"); + + translation_ctx.options.reset(new ctranslate2::TranslationOptions); + translation_ctx.options->beam_size = 1; + translation_ctx.options->max_decoding_length = 40; + translation_ctx.options->use_vmap = true; + translation_ctx.options->return_scores = false; + translation_ctx.options->repetition_penalty = 1.1f; + translation_ctx.options->no_repeat_ngram_size = 2; + } catch (std::exception &e) { + obs_log(LOG_ERROR, "Failed to load CT2 model: %s", e.what()); + return OBS_POLYGLOT_TRANSLATION_INIT_FAIL; + } + return OBS_POLYGLOT_TRANSLATION_INIT_SUCCESS; +} + +int translate(struct translation_context &translation_ctx, const std::string &text, + const std::string &source_lang, const std::string &target_lang, std::string &result) +{ + try { + // set input tokens + std::vector input_tokens = {source_lang, ""}; + if (translation_ctx.add_context && translation_ctx.last_input_tokens.size() > 0) { + input_tokens.insert(input_tokens.end(), + translation_ctx.last_input_tokens.begin(), + translation_ctx.last_input_tokens.end()); + } + std::vector new_input_tokens = translation_ctx.tokenizer(text); + input_tokens.insert(input_tokens.end(), new_input_tokens.begin(), + new_input_tokens.end()); + input_tokens.push_back(""); + + translation_ctx.last_input_tokens = new_input_tokens; + + const std::vector> batch = 
{input_tokens}; + + // get target prefix + std::vector target_prefix = {target_lang}; + if (translation_ctx.add_context && + translation_ctx.last_translation_tokens.size() > 0) { + target_prefix.insert(target_prefix.end(), + translation_ctx.last_translation_tokens.begin(), + translation_ctx.last_translation_tokens.end()); + } + + const std::vector> target_prefix_batch = {target_prefix}; + const std::vector results = + translation_ctx.translator->translate_batch(batch, target_prefix_batch, + *translation_ctx.options); + + const auto &tokens_result = results[0].output(); + // take the tokens from the target_prefix length to the end + std::vector translation_tokens( + tokens_result.begin() + target_prefix.size(), tokens_result.end()); + + translation_ctx.last_translation_tokens = translation_tokens; + // detokenize + result = translation_ctx.detokenizer(translation_tokens); + } catch (std::exception &e) { + obs_log(LOG_ERROR, "Error: %s", e.what()); + return OBS_POLYGLOT_TRANSLATION_FAIL; + } + return OBS_POLYGLOT_TRANSLATION_SUCCESS; +} diff --git a/src/translation/translation.h b/src/translation/translation.h new file mode 100644 index 0000000..d79fd9d --- /dev/null +++ b/src/translation/translation.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include +#include +#include +#include + +struct translation_context { + std::string local_model_folder_path; + std::unique_ptr processor; + std::unique_ptr translator; + std::unique_ptr options; + std::function(const std::string &)> tokenizer; + std::function &)> detokenizer; + std::vector last_input_tokens; + std::vector last_translation_tokens; + // Use the last translation as context for the next translation + bool add_context; +}; + +void start_translation(struct transcription_filter_data *gf); +int build_translation_context(struct translation_context &translation_ctx); + +int translate(struct translation_context &translation_ctx, const std::string &text, + const std::string &source_lang, const std::string &target_lang, 
std::string &result); + +#define OBS_POLYGLOT_TRANSLATION_INIT_FAIL -1 +#define OBS_POLYGLOT_TRANSLATION_INIT_SUCCESS 0 +#define OBS_POLYGLOT_TRANSLATION_SUCCESS 0 +#define OBS_POLYGLOT_TRANSLATION_FAIL -1 diff --git a/src/whisper-utils/whisper-processing.cpp b/src/whisper-utils/whisper-processing.cpp index 748f52e..485f28a 100644 --- a/src/whisper-utils/whisper-processing.cpp +++ b/src/whisper-utils/whisper-processing.cpp @@ -14,6 +14,7 @@ #include #include #endif +#include "model-utils/model-downloader.h" #define VAD_THOLD 0.0001f #define FREQ_THOLD 100.0f @@ -108,10 +109,25 @@ bool vad_simple(float *pcmf32, size_t pcm32f_size, uint32_t sample_rate, float v return true; } -struct whisper_context *init_whisper_context(const std::string &model_path) +struct whisper_context *init_whisper_context(const std::string &model_path_in) { + std::string model_path = model_path_in; + obs_log(LOG_INFO, "Loading whisper model from %s", model_path.c_str()); + if (std::filesystem::is_directory(model_path)) { + obs_log(LOG_INFO, + "Model path is a directory, not a file, looking for .bin file in folder"); + // look for .bin file + const std::string model_bin_file = find_bin_file_in_folder(model_path); + if (model_bin_file.empty()) { + obs_log(LOG_ERROR, "Model bin file not found in folder: %s", + model_path.c_str()); + return nullptr; + } + model_path = model_bin_file; + } + struct whisper_context_params cparams = whisper_context_default_params(); #ifdef LOCALVOCAL_WITH_CUDA cparams.use_gpu = true; diff --git a/src/whisper-utils/whisper-utils.cpp b/src/whisper-utils/whisper-utils.cpp index 47b983c..57dc954 100644 --- a/src/whisper-utils/whisper-utils.cpp +++ b/src/whisper-utils/whisper-utils.cpp @@ -9,29 +9,36 @@ void update_whsiper_model_path(struct transcription_filter_data *gf, obs_data_t std::string new_model_path = obs_data_get_string(s, "whisper_model_path"); const bool is_external_model = new_model_path.find("!!!external!!!") != std::string::npos; - if 
(gf->whisper_model_path == nullptr || - strcmp(new_model_path.c_str(), gf->whisper_model_path) != 0 || is_external_model) { + if (gf->whisper_model_path.empty() || gf->whisper_model_path != new_model_path || + is_external_model) { // model path changed, reload the model - obs_log(gf->log_level, "model path changed from %s to %s", gf->whisper_model_path, - new_model_path.c_str()); + obs_log(gf->log_level, "model path changed from %s to %s", + gf->whisper_model_path.c_str(), new_model_path.c_str()); // check if the new model is external file if (!is_external_model) { // new model is not external file shutdown_whisper_thread(gf); - gf->whisper_model_path = bstrdup(new_model_path.c_str()); + if (models_info.count(new_model_path) == 0) { + obs_log(LOG_WARNING, "Model '%s' does not exist", + new_model_path.c_str()); + return; + } + + const ModelInfo &model_info = models_info[new_model_path]; // check if the model exists, if not, download it - std::string model_file_found = find_model_file(gf->whisper_model_path); + std::string model_file_found = find_model_bin_file(model_info); if (model_file_found == "") { obs_log(LOG_WARNING, "Whisper model does not exist"); download_model_with_ui_dialog( - gf->whisper_model_path, - [gf](int download_status, const std::string &path) { + model_info, [gf, new_model_path](int download_status, + const std::string &path) { if (download_status == 0) { obs_log(LOG_INFO, "Model download complete"); + gf->whisper_model_path = new_model_path; start_whisper_thread_with_path(gf, path); } else { obs_log(LOG_ERROR, "Model download failed"); @@ -39,6 +46,7 @@ void update_whsiper_model_path(struct transcription_filter_data *gf, obs_data_t }); } else { // Model exists, just load it + gf->whisper_model_path = new_model_path; start_whisper_thread_with_path(gf, model_file_found); } } else { @@ -55,7 +63,7 @@ void update_whsiper_model_path(struct transcription_filter_data *gf, obs_data_t return; } else { shutdown_whisper_thread(gf); - 
gf->whisper_model_path = bstrdup(new_model_path.c_str()); + gf->whisper_model_path = new_model_path; start_whisper_thread_with_path(gf, external_model_file_path); } @@ -63,8 +71,8 @@ void update_whsiper_model_path(struct transcription_filter_data *gf, obs_data_t } } else { // model path did not change - obs_log(LOG_DEBUG, "model path did not change: %s == %s", gf->whisper_model_path, - new_model_path.c_str()); + obs_log(LOG_DEBUG, "model path did not change: %s == %s", + gf->whisper_model_path.c_str(), new_model_path.c_str()); } } @@ -85,9 +93,8 @@ void shutdown_whisper_thread(struct transcription_filter_data *gf) if (gf->whisper_thread.joinable()) { gf->whisper_thread.join(); } - if (gf->whisper_model_path != nullptr) { - bfree(gf->whisper_model_path); - gf->whisper_model_path = nullptr; + if (!gf->whisper_model_path.empty()) { + gf->whisper_model_path = ""; } }