Merge pull request #122 from chenqianhe/main

feat: 发布 Pypi Package
mli · Mar 10, 2024 · ec4b32d · ec4b32d
2 parents 9c9fdf2 + 101bbc3
commit ec4b32d
Show file tree

Hide file tree

Showing 4 changed files with 118 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -2,6 +2,18 @@
 
 AutoCut 对你的视频自动生成字幕。然后你选择需要保留的句子，AutoCut 将对你视频中对应的片段裁切并保存。你无需使用视频编辑软件，只需要编辑文本文件即可完成剪切。
 
+**2024.03.10更新**：支持通过 pip 安装，并可通过 import 直接使用转录相关的功能
+
+```shell
+# Install
+pip install autocut-sub
+```
+
+```python
+from autocut import Transcribe, load_audio
+```
+
+
 **2023.10.14更新**：支持 faster-whisper 和指定依赖（但由于 Action 限制暂时移除了 faster-whisper 的测试运行）
 
 ```shell

diff --git a/autocut/__init__.py b/autocut/__init__.py
@@ -1 +1,7 @@
-__version__ = "0.2.0"
+__version__ = "1.0.1"  # package version string
+
+from .type import LANG, WhisperModel, WhisperMode  # re-exported below via __all__
+from .utils import load_audio  # re-exported below via __all__
+from .package_transcribe import Transcribe  # re-exported below via __all__
+
+__all__ = ["Transcribe", "load_audio", "WhisperMode", "WhisperModel", "LANG"]  # names exposed by `from autocut import *`
diff --git a/autocut/package_transcribe.py b/autocut/package_transcribe.py
@@ -0,0 +1,94 @@
+import logging
+import time
+from typing import List, Any, Union, Literal
+
+import numpy as np
+import torch
+
+from . import utils, whisper_model
+from .type import WhisperMode, SPEECH_ARRAY_INDEX, WhisperModel, LANG
+
+
+class Transcribe:
+    def __init__(
+        self,
+        whisper_mode: Union[
+            WhisperMode.WHISPER.value, WhisperMode.FASTER.value
+        ] = WhisperMode.WHISPER.value,
+        whisper_model_size: WhisperModel.get_values() = "small",
+        vad: bool = True,
+        device: Union[Literal["cpu", "cuda"], None] = None,
+    ):
+        self.whisper_mode = whisper_mode
+        self.whisper_model_size = whisper_model_size
+        self.vad = vad
+        self.device = device
+        self.sampling_rate = 16000
+        self.whisper_model = None
+        self.vad_model = None
+        self.detect_speech = None
+
+        tic = time.time()
+        if self.whisper_model is None:
+            if self.whisper_mode == WhisperMode.WHISPER.value:
+                self.whisper_model = whisper_model.WhisperModel(self.sampling_rate)
+                self.whisper_model.load(self.whisper_model_size, self.device)
+            elif self.whisper_mode == WhisperMode.FASTER.value:
+                self.whisper_model = whisper_model.FasterWhisperModel(
+                    self.sampling_rate
+                )
+                self.whisper_model.load(self.whisper_model_size, self.device)
+        logging.info(f"Done Init model in {time.time() - tic:.1f} sec")
+
+    def run(self, audio: np.ndarray, lang: LANG, prompt: str = ""):
+        speech_array_indices = self._detect_voice_activity(audio)
+        transcribe_results = self._transcribe(audio, speech_array_indices, lang, prompt)
+        return transcribe_results
+
+    def format_results_to_srt(self, transcribe_results: List[Any]):
+        return self.whisper_model.gen_srt(transcribe_results)
+
+    def _detect_voice_activity(self, audio) -> List[SPEECH_ARRAY_INDEX]:
+        """Detect segments that have voice activities"""
+        if self.vad is False:
+            return [{"start": 0, "end": len(audio)}]
+
+        tic = time.time()
+        if self.vad_model is None or self.detect_speech is None:
+            # torch load limit https://github.com/pytorch/vision/issues/4156
+            torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
+            self.vad_model, funcs = torch.hub.load(
+                repo_or_dir="snakers4/silero-vad", model="silero_vad", trust_repo=True
+            )
+
+            self.detect_speech = funcs[0]
+
+        speeches = self.detect_speech(
+            audio, self.vad_model, sampling_rate=self.sampling_rate
+        )
+
+        # Remove too short segments
+        speeches = utils.remove_short_segments(speeches, 1.0 * self.sampling_rate)
+
+        # Expand to avoid to tight cut. You can tune the pad length
+        speeches = utils.expand_segments(
+            speeches, 0.2 * self.sampling_rate, 0.0 * self.sampling_rate, audio.shape[0]
+        )
+
+        # Merge very closed segments
+        speeches = utils.merge_adjacent_segments(speeches, 0.5 * self.sampling_rate)
+
+        logging.info(f"Done voice activity detection in {time.time() - tic:.1f} sec")
+        return speeches if len(speeches) > 1 else [{"start": 0, "end": len(audio)}]
+
+    def _transcribe(
+        self,
+        audio: np.ndarray,
+        speech_array_indices: List[SPEECH_ARRAY_INDEX],
+        lang: LANG,
+        prompt: str = "",
+    ) -> List[Any]:
+        tic = time.time()
+        res = self.whisper_model.transcribe(audio, speech_array_indices, lang, prompt)
+        logging.info(f"Done transcription in {time.time() - tic:.1f} sec")
+        return res
diff --git a/setup.py b/setup.py
@@ -14,8 +14,12 @@
 
 
 setup(
-    name="autocut",
+    name="autocut-sub",
     install_requires=requirements,
+    url="https://github.com/mli/autocut",
+    license="Apache License 2.0",
+    long_description=open('README.md').read(),
+    long_description_content_type='text/markdown',
     extras_require={
         "all": ["openai", "faster-whisper"],
         "openai": ["openai"],