# ocr/test005.py

import math
import os

import whisper
from pydub import AudioSegment  # pip install pydub

# --------- Parameters ----------
MODEL_NAME = "turbo"  # model name passed to whisper.load_model
DEVICE = "cuda"  # device passed to whisper.load_model
AUDIO_PATH = "audio.mp3"  # input audio file read with AudioSegment.from_file
SEGMENT_SEC = 10  # length of each chunk in seconds; smaller values give finer granularity
OVERLAP_SEC = 0  # seconds subtracted from SEGMENT_SEC to get the step between chunk starts
LANGUAGE = "zh"  # language argument passed to model.transcribe
INITIAL_PROMPT = "这是一段小说。"  # initial_prompt argument passed to model.transcribe
# -------------------------

# 1. Split the audio into fixed-length chunks.
audio = AudioSegment.from_file(AUDIO_PATH)
total_ms = len(audio)  # total duration in milliseconds
segment_ms = SEGMENT_SEC * 1000
step_ms = (SEGMENT_SEC - OVERLAP_SEC) * 1000

# A non-positive step would make range() fail with an unrelated-looking error
# (step == 0) or silently produce no chunks at all (step < 0), so reject it
# up front with a message that names the misconfigured parameters.
if step_ms <= 0:
    raise ValueError("OVERLAP_SEC 必须小于 SEGMENT_SEC")

# Chunk starts are step_ms apart; each chunk is at most segment_ms long
# (pydub slices are indexed in milliseconds).
segments = [
    audio[start_ms:start_ms + segment_ms]
    for start_ms in range(0, total_ms, step_ms)
]

total_chunks = len(segments)
print(f"音频共 {total_ms / 1000:.1f} 秒，切成 {total_chunks} 段处理")

# 2. Load the model.
model = whisper.load_model(MODEL_NAME, device=DEVICE)

# 3. Transcribe chunk by chunk and print progress.
all_segments = []  # one transcription result per chunk, in chunk order


def percentage_of(total: int, part: int) -> int:
    """Return ``part`` as a whole-number percentage of ``total``.

    The ratio is rounded with the built-in ``round`` and the result is
    capped at 100 whenever ``part`` reaches or exceeds ``total``.

    Args:
        total: The amount that counts as 100%; must be positive.
        part: The amount completed so far.

    Returns:
        The rounded percentage as an ``int``.

    Raises:
        ValueError: If ``total`` is zero or negative.
    """
    if total <= 0:
        raise ValueError("第一个参数（总数）必须大于 0")
    # Short-circuit so that anything at or past the total reports exactly 100.
    return 100 if part >= total else int(round((part / total) * 100))


for idx, seg in enumerate(segments, 1):
    # The chunk is exported to a temporary WAV file and that path is handed
    # to Whisper for transcription.
    tmp_wav = f"_tmp_{idx}.wav"
    try:
        seg.export(tmp_wav, format="wav")
        result = model.transcribe(
            tmp_wav,
            language=LANGUAGE,
            initial_prompt=INITIAL_PROMPT,
            verbose=False,
            word_timestamps=False,
        )
    finally:
        # Clean up even when export or transcription raises, so a failed run
        # does not leave _tmp_*.wav files behind. The existence check covers
        # the case where export failed before the file was created.
        if os.path.exists(tmp_wav):
            os.remove(tmp_wav)
    all_segments.append(result)

    # Report progress as a whole-number percentage of chunks processed.
    print(f"{percentage_of(total_chunks, idx)}%")

print("\n转录完成！")


# 4. Merge the results and print them along the timeline.
def seconds_to_min_sec(total_seconds: int) -> str:
    """Format a whole number of seconds as ``"<minutes>分<seconds>秒"``.

    Seconds are zero-padded to two digits; minutes are not capped at 59.
    """
    minutes = total_seconds // 60
    seconds = total_seconds % 60
    return f"{minutes}分{seconds:02d}秒"


# Walk the per-chunk results in order, shifting each chunk-relative timestamp
# by the chunk's start offset so the output shares one timeline.
current_start = 0
chunk_step_sec = SEGMENT_SEC - OVERLAP_SEC  # seconds between consecutive chunk starts
for chunk_result in all_segments:
    for seg in chunk_result["segments"]:
        start_label = seconds_to_min_sec(int(current_start + seg["start"]))
        end_label = seconds_to_min_sec(int(current_start + seg["end"]))
        text = seg["text"].strip()
        print(f"时间：{start_label} ~ {end_label}；文本：{text}")
    current_start += chunk_step_sec