"""Transcribe an audio file with Whisper in fixed-length chunks.

The audio at AUDIO_PATH is cut into SEGMENT_SEC-second pieces with pydub.
Each piece is exported to a temporary WAV file and transcribed on its own,
and a completion percentage is printed after every piece.  At the end, every
recognized segment is printed with its start/end time shifted by the offset
of the chunk it came from.
"""
import math
import os

import whisper
from pydub import AudioSegment  # pip install pydub

# --------- Settings ----------
MODEL_NAME = "turbo"
DEVICE = "cuda"
AUDIO_PATH = "audio.mp3"
SEGMENT_SEC = 10  # chunk length in seconds; smaller chunks give finer progress
OVERLAP_SEC = 0
LANGUAGE = "zh"
INITIAL_PROMPT = "这是一段小说。"
# -----------------------------


def percentage_of(total: int, part: int) -> int:
    """Return ``part`` as a whole-number percentage of ``total``.

    Args:
        total: The total count; must be greater than 0.
        part: The completed count.

    Returns:
        The rounded percentage, capped at 100 once ``part`` reaches ``total``.

    Raises:
        ValueError: If ``total`` is not greater than 0.
    """
    if total <= 0:
        raise ValueError("第一个参数(总数)必须大于 0")
    if part >= total:
        return 100
    return int(round((part / total) * 100))


def seconds_to_min_sec(total_seconds: int) -> str:
    """Format a whole number of seconds as ``"<minutes>分<seconds>秒"``.

    The seconds part is zero-padded to two digits; minutes are not capped
    at 59.
    """
    minutes, seconds = divmod(total_seconds, 60)
    return f"{minutes}分{seconds:02d}秒"


# A non-positive step would either make range() raise an opaque error
# (step == 0) or silently produce no chunks at all (step < 0).
if OVERLAP_SEC >= SEGMENT_SEC:
    raise ValueError("OVERLAP_SEC must be smaller than SEGMENT_SEC")

# 1. Cut the audio into small chunks.
audio = AudioSegment.from_file(AUDIO_PATH)
total_ms = len(audio)  # total duration in milliseconds
segment_ms = SEGMENT_SEC * 1000
step_ms = (SEGMENT_SEC - OVERLAP_SEC) * 1000

# NOTE: with OVERLAP_SEC > 0 consecutive chunks share audio, and that shared
# part is transcribed (and later printed) once per chunk.
segments = [
    audio[i:i + segment_ms]  # pydub slices by milliseconds
    for i in range(0, total_ms, step_ms)
]
total_chunks = len(segments)
print(f"音频共 {total_ms / 1000:.1f} 秒,切成 {total_chunks} 段处理")

# 2. Load the model.
model = whisper.load_model(MODEL_NAME, device=DEVICE)

# 3. Transcribe chunk by chunk and print progress.
all_segments = []

for idx, seg in enumerate(segments, 1):
    # Save the chunk as a temporary WAV file for Whisper to read.
    tmp_wav = f"_tmp_{idx}.wav"
    try:
        seg.export(tmp_wav, format="wav")
        result = model.transcribe(
            tmp_wav,
            language=LANGUAGE,
            initial_prompt=INITIAL_PROMPT,
            verbose=False,
            word_timestamps=False,
        )
    finally:
        # Remove the temp file even when export/transcription fails, so a
        # crash does not leave stray _tmp_*.wav files behind.
        if os.path.exists(tmp_wav):
            os.remove(tmp_wav)
    all_segments.append(result)

    # Print progress as a whole-number percentage.
    print(f"{percentage_of(total_chunks, idx)}%")

print("\n转录完成!")

# 4. Merge the results and print them along the timeline.
current_start = 0  # offset (seconds) of the current chunk within the audio
for chunk_result in all_segments:
    for seg in chunk_result["segments"]:
        # Whisper timestamps are relative to the chunk; shift to file time.
        start = current_start + seg["start"]
        end = current_start + seg["end"]
        print(f"时间:{seconds_to_min_sec(int(start))} ~ {seconds_to_min_sec(int(end))};文本:{seg['text'].strip()}")
    current_start += SEGMENT_SEC - OVERLAP_SEC