Merge pull request #122 from chenqianhe/main
feat: publish PyPI package
chenqianhe authored Mar 10, 2024
2 parents 9c9fdf2 + 101bbc3 commit ec4b32d
Showing 4 changed files with 118 additions and 2 deletions.
12 changes: 12 additions & 0 deletions README.md
@@ -2,6 +2,18 @@

AutoCut automatically generates subtitles for your video. You then pick the sentences you want to keep, and AutoCut cuts the corresponding segments out of the video and saves them. No video-editing software is required; editing a text file is all it takes to complete the cut.

**2024.03.10 Update**: pip installation is now supported, and the transcription-related functionality can be imported as a package

```shell
# Install
pip install autocut-sub
```

```python
from autocut import Transcribe, load_audio
```
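
A minimal usage sketch of the imported API (the file name and language code below are placeholders, and `load_audio`'s `sr` parameter is assumed to default to 16 kHz):

```python
from autocut import Transcribe, load_audio

audio = load_audio("video.mp4", sr=16000)  # decode to a 16 kHz mono numpy array
t = Transcribe()                           # defaults: whisper backend, "small" model, VAD on
results = t.run(audio, lang="zh")          # transcribe the detected speech segments
print(t.format_results_to_srt(results))    # render the results as SRT subtitles
```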


**2023.10.14 Update**: faster-whisper and user-selectable optional dependencies are now supported (though the faster-whisper test run has been temporarily removed due to GitHub Actions limitations)

8 changes: 7 additions & 1 deletion autocut/__init__.py
@@ -1 +1,7 @@
__version__ = "0.2.0"
__version__ = "1.0.1"

from .type import LANG, WhisperModel, WhisperMode
from .utils import load_audio
from .package_transcribe import Transcribe

__all__ = ["Transcribe", "load_audio", "WhisperMode", "WhisperModel", "LANG"]
94 changes: 94 additions & 0 deletions autocut/package_transcribe.py
@@ -0,0 +1,94 @@
import logging
import time
from typing import List, Any, Union, Literal

import numpy as np
import torch

from . import utils, whisper_model
from .type import WhisperMode, SPEECH_ARRAY_INDEX, WhisperModel, LANG


class Transcribe:
def __init__(
self,
        whisper_mode: str = WhisperMode.WHISPER.value,  # WhisperMode.WHISPER.value or WhisperMode.FASTER.value
        whisper_model_size: str = "small",  # one of WhisperModel.get_values()
vad: bool = True,
device: Union[Literal["cpu", "cuda"], None] = None,
):
self.whisper_mode = whisper_mode
self.whisper_model_size = whisper_model_size
self.vad = vad
self.device = device
self.sampling_rate = 16000
self.whisper_model = None
self.vad_model = None
self.detect_speech = None

tic = time.time()
if self.whisper_model is None:
if self.whisper_mode == WhisperMode.WHISPER.value:
self.whisper_model = whisper_model.WhisperModel(self.sampling_rate)
self.whisper_model.load(self.whisper_model_size, self.device)
elif self.whisper_mode == WhisperMode.FASTER.value:
self.whisper_model = whisper_model.FasterWhisperModel(
self.sampling_rate
)
self.whisper_model.load(self.whisper_model_size, self.device)
logging.info(f"Done Init model in {time.time() - tic:.1f} sec")

def run(self, audio: np.ndarray, lang: LANG, prompt: str = ""):
speech_array_indices = self._detect_voice_activity(audio)
transcribe_results = self._transcribe(audio, speech_array_indices, lang, prompt)
return transcribe_results

def format_results_to_srt(self, transcribe_results: List[Any]):
return self.whisper_model.gen_srt(transcribe_results)

def _detect_voice_activity(self, audio) -> List[SPEECH_ARRAY_INDEX]:
"""Detect segments that have voice activities"""
if self.vad is False:
return [{"start": 0, "end": len(audio)}]

tic = time.time()
if self.vad_model is None or self.detect_speech is None:
# torch load limit https://github.com/pytorch/vision/issues/4156
torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
self.vad_model, funcs = torch.hub.load(
repo_or_dir="snakers4/silero-vad", model="silero_vad", trust_repo=True
)

self.detect_speech = funcs[0]

speeches = self.detect_speech(
audio, self.vad_model, sampling_rate=self.sampling_rate
)

        # Remove segments that are too short
speeches = utils.remove_short_segments(speeches, 1.0 * self.sampling_rate)

        # Expand segments to avoid cutting too tightly. You can tune the pad length.
speeches = utils.expand_segments(
speeches, 0.2 * self.sampling_rate, 0.0 * self.sampling_rate, audio.shape[0]
)

        # Merge segments that are very close together
speeches = utils.merge_adjacent_segments(speeches, 0.5 * self.sampling_rate)

logging.info(f"Done voice activity detection in {time.time() - tic:.1f} sec")
return speeches if len(speeches) > 1 else [{"start": 0, "end": len(audio)}]

def _transcribe(
self,
audio: np.ndarray,
speech_array_indices: List[SPEECH_ARRAY_INDEX],
lang: LANG,
prompt: str = "",
) -> List[Any]:
tic = time.time()
res = self.whisper_model.transcribe(audio, speech_array_indices, lang, prompt)
logging.info(f"Done transcription in {time.time() - tic:.1f} sec")
return res
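
For instance, the faster-whisper branch of the constructor could be exercised as follows (a sketch: the input file is hypothetical, GPU availability is probed rather than assumed, and this path assumes the faster-whisper dependency is installed):

```python
import torch

from autocut import Transcribe, WhisperMode, load_audio

device = "cuda" if torch.cuda.is_available() else "cpu"

# Select the faster-whisper backend declared in __init__ above.
t = Transcribe(
    whisper_mode=WhisperMode.FASTER.value,
    whisper_model_size="small",
    vad=True,
    device=device,
)

audio = load_audio("talk.mp4", sr=16000)  # hypothetical input file
results = t.run(audio, lang="en")
print(t.format_results_to_srt(results))
```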
6 changes: 5 additions & 1 deletion setup.py
@@ -14,8 +14,12 @@


setup(
name="autocut",
name="autocut-sub",
install_requires=requirements,
url="https://github.com/mli/autocut",
license="Apache License 2.0",
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
extras_require={
"all": ["openai", "faster-whisper"],
"openai": ["openai"],
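
With these extras declared in `extras_require`, optional backends can be pulled in at install time (a sketch; only the `all` and `openai` extras are visible before the hunk is truncated):

```shell
# Install the package together with every optional dependency
pip install "autocut-sub[all]"
```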
