Files
ocr/test005.py

83 lines
2.2 KiB
Python
Raw Normal View History

2025-08-14 16:04:59 +08:00
import whisper
import math
from pydub import AudioSegment # pip install pydub
# --------- Settings ----------
# Input audio and transcription hints handed to Whisper.
AUDIO_PATH: str = "audio.mp3"
LANGUAGE: str = "zh"
INITIAL_PROMPT: str = "这是一段小说。"
# Model selection and the device it is loaded onto.
MODEL_NAME: str = "turbo"
DEVICE: str = "cuda"
# Chunking: 10 seconds per chunk; smaller values give finer granularity.
SEGMENT_SEC: int = 10
# Seconds shared between consecutive chunks (0 = no overlap).
OVERLAP_SEC: int = 0
# -----------------------------
# 1. Split the audio into fixed-length chunks.
segment_ms = SEGMENT_SEC * 1000
step_ms = (SEGMENT_SEC - OVERLAP_SEC) * 1000
if step_ms <= 0:
    # A zero step makes range() fail with an unrelated-looking error and a
    # negative step silently produces no chunks at all, so stop early with
    # a message that names the misconfigured settings.
    raise ValueError("OVERLAP_SEC 必须小于 SEGMENT_SEC,否则无法切分音频")
audio = AudioSegment.from_file(AUDIO_PATH)
total_ms = len(audio)  # total duration in milliseconds
segments = [
    audio[start_ms:start_ms + segment_ms]  # pydub slice, indexed in milliseconds
    for start_ms in range(0, total_ms, step_ms)
]
total_chunks = len(segments)
print(f"音频共 {total_ms / 1000:.1f} 秒,切成 {total_chunks} 段处理")
# 3. Per-chunk transcription results are collected here, in chunk order.
all_segments: list = []
# 2. Load the Whisper model onto the configured device.
model = whisper.load_model(name=MODEL_NAME, device=DEVICE)
def percentage_of(total: int, part: int) -> int:
    """Return ``part`` as a whole-number percentage of ``total``.

    Args:
        total: The full amount; must be greater than zero.
        part: The portion completed so far.

    Returns:
        The percentage rounded to an integer, capped at 100 once ``part``
        reaches or exceeds ``total``.

    Raises:
        ValueError: If ``total`` is zero or negative.
    """
    if total <= 0:
        raise ValueError("第一个参数(总数)必须大于 0")
    # The cap is checked first so the division only runs for part < total.
    return 100 if part >= total else int(round(part / total * 100))
# Transcribe every chunk in order, printing the progress after each one.
import os  # only needed to delete the per-chunk temporary WAV files

for idx, seg in enumerate(segments, 1):
    # Whisper is given a file path, so each chunk is exported to a temporary WAV.
    tmp_wav = f"_tmp_{idx}.wav"
    try:
        seg.export(tmp_wav, format="wav")
        result = model.transcribe(
            tmp_wav,
            language=LANGUAGE,
            initial_prompt=INITIAL_PROMPT,
            verbose=False,
            word_timestamps=False,
        )
    finally:
        # Remove the temporary file even when export or transcription fails,
        # so an aborted run does not leave _tmp_*.wav files behind.
        if os.path.exists(tmp_wav):
            os.remove(tmp_wav)
    all_segments.append(result)
    # Report progress as a whole-number percentage of chunks processed.
    print(str(percentage_of(total_chunks, idx)) + "%")
print("\n转录完成!")
# 4. 合并结果,按时间轴输出
def seconds_to_min_sec(total_seconds: int) -> str:
    """Format a whole number of seconds as ``M:SS``.

    Args:
        total_seconds: Duration in whole seconds.

    Returns:
        Minutes and zero-padded seconds joined by a colon, e.g. ``65`` ->
        ``"1:05"``. Hours are not split out, so minutes may exceed 59.
    """
    minutes, seconds = divmod(total_seconds, 60)
    # The separator keeps 1 min 5 s ("1:05") from reading as "105" seconds.
    return f"{minutes}:{seconds:02d}"
# Offset, in seconds, of the chunk being printed on the full-audio timeline.
current_start = 0
for chunk_result in all_segments:
    for piece in chunk_result["segments"]:
        # Segment times are shifted by the chunk's offset before formatting.
        begin_label = seconds_to_min_sec(int(current_start + piece["start"]))
        finish_label = seconds_to_min_sec(int(current_start + piece["end"]))
        text = piece["text"].strip()
        print(f"时间:{begin_label} ~ {finish_label};文本:{text}")
    # Advance by the same step that was used when the audio was split.
    current_start += SEGMENT_SEC - OVERLAP_SEC