You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

99 lines
3.3 KiB

import io
import wave
from pathlib import Path

import numpy as np
from resemblyzer import preprocess_wav, VoiceEncoder
# Initialize the voiceprint encoder
encoder = VoiceEncoder()
# Known voiceprints, keyed by the corresponding speaker name
known_speakers = {}
def load_known_speakers(speaker_folder):
    """
    Load the reference voiceprint of every known speaker.

    Each sub-directory of ``speaker_folder`` is treated as one speaker, named
    after the directory. Every ``*.wav`` file directly inside it is embedded
    with the module-level ``encoder``; the per-file embeddings are averaged,
    the average is rescaled to unit length, and the result is stored in the
    module-level ``known_speakers`` dict. Sub-directories without any
    ``*.wav`` file are skipped, as are plain files in ``speaker_folder``.

    :param speaker_folder: path of the folder holding one sub-directory per speaker
    """
    speaker_folder = Path(speaker_folder)
    # Sorted so enrollment order (and the log output) does not depend on the
    # order in which the filesystem happens to list entries.
    for speaker_dir in sorted(speaker_folder.iterdir()):
        if not speaker_dir.is_dir():
            continue
        speaker_name = speaker_dir.name
        embeddings = [
            encoder.embed_utterance(preprocess_wav(wav_file))
            for wav_file in sorted(speaker_dir.glob("*.wav"))
        ]
        if not embeddings:
            continue
        centroid = np.mean(embeddings, axis=0)
        # Averaging generally shrinks the vector's length. Matching uses a dot
        # product, which is only a cosine similarity for unit-length vectors,
        # so rescale the centroid; otherwise speakers with more varied
        # recordings would score systematically lower.
        norm = np.linalg.norm(centroid)
        if norm > 0:
            centroid = centroid / norm
        known_speakers[speaker_name] = centroid
        print(f"已加载说话人: {speaker_name}")
def recognize_speaker_from_file(audio_file_path, threshold=0.7):
    """
    Identify the speaker of an audio file.

    The file is preprocessed and embedded with the module-level ``encoder``,
    then compared by cosine similarity against every entry of the
    module-level ``known_speakers`` dict.

    :param audio_file_path: path of the audio file to identify
    :param threshold: similarity threshold; a best score below it is reported
        as an unknown speaker
    :return: ``(speaker name, similarity)``. The name is ``"未知说话人"`` when
        the best score is below ``threshold`` (this includes the case where no
        speaker is enrolled, in which the score is ``-1``). ``(None, 0)`` is
        returned when any error occurs; the error is printed, not raised.
    """
    try:
        # Preprocess the audio file and extract its voiceprint.
        wav = preprocess_wav(audio_file_path)
        embedding = encoder.embed_utterance(wav)
        embedding_norm = np.linalg.norm(embedding)

        best_similarity = -1
        best_speaker = None
        # Compare against every enrolled speaker.
        for speaker, known_embedding in known_speakers.items():
            # Divide by both lengths so the score is a true cosine similarity
            # even when a stored voiceprint is not unit length (for example a
            # plain average of several embeddings).
            scale = embedding_norm * np.linalg.norm(known_embedding)
            if scale == 0:
                # A zero vector has no direction to compare against.
                continue
            similarity = float(np.dot(embedding, known_embedding) / scale)
            if similarity > best_similarity:
                best_similarity = similarity
                best_speaker = speaker

        # Below the threshold the best match is not trusted.
        if best_similarity < threshold:
            return "未知说话人", best_similarity
        return best_speaker, best_similarity
    except Exception as e:
        # Deliberately broad: callers rely on the (None, 0) result rather
        # than on an exception for unreadable or invalid audio.
        print(f"识别说话人时出错: {e}")
        return None, 0
def _wav_bytes_to_samples(wav_bytes):
    """Decode in-memory PCM WAV data into (mono float32 samples, sample rate)."""
    with wave.open(io.BytesIO(wav_bytes), "rb") as reader:
        channels = reader.getnchannels()
        width = reader.getsampwidth()
        rate = reader.getframerate()
        frames = reader.readframes(reader.getnframes())

    if width == 1:
        # 8-bit WAV samples are unsigned and centred on 128.
        samples = (np.frombuffer(frames, dtype=np.uint8).astype(np.float32) - 128.0) / 128.0
    elif width == 2:
        samples = np.frombuffer(frames, dtype="<i2").astype(np.float32) / 32768.0
    elif width == 4:
        samples = np.frombuffer(frames, dtype="<i4").astype(np.float32) / 2147483648.0
    else:
        raise ValueError(f"unsupported WAV sample width: {width} bytes")

    if channels > 1:
        # Interleaved frames -> one row per frame, then average to mono.
        samples = samples.reshape(-1, channels).mean(axis=1)
    return samples, rate


def recognize_speaker(audio, threshold=None):
    """
    Identify the speaker of an in-memory audio object.

    :param audio: object exposing ``get_wav_data()``. Presumably a
        ``speech_recognition.AudioData`` whose method returns the bytes of a
        WAV file -- TODO confirm against the caller. Bytes are decoded to a
        waveform here; any other payload is forwarded to ``preprocess_wav``
        unchanged.
    :param threshold: optional similarity threshold. ``None`` (the default)
        always returns the best match; otherwise a best score below the
        threshold yields ``"未知说话人"``.
    :return: name of the best-matching speaker, or ``None`` when no speaker
        is enrolled and no threshold is given
    :raises ValueError: if the WAV data uses an unsupported sample width
    """
    wav_data = audio.get_wav_data()
    if isinstance(wav_data, (bytes, bytearray)):
        # preprocess_wav expects a path or a waveform array, not an encoded
        # WAV file, so decode first and tell it the original sample rate.
        samples, sample_rate = _wav_bytes_to_samples(wav_data)
        wav = preprocess_wav(samples, source_sr=sample_rate)
    else:
        wav = preprocess_wav(wav_data)

    embedding = encoder.embed_utterance(wav)
    embedding_norm = np.linalg.norm(embedding)

    best_similarity = -1
    best_speaker = None
    for speaker, known_embedding in known_speakers.items():
        # Cosine similarity; stays correct when a stored voiceprint is not
        # unit length.
        scale = embedding_norm * np.linalg.norm(known_embedding)
        if scale == 0:
            continue
        similarity = float(np.dot(embedding, known_embedding) / scale)
        if similarity > best_similarity:
            best_similarity = similarity
            best_speaker = speaker

    if threshold is not None and best_similarity < threshold:
        return "未知说话人"
    return best_speaker
if __name__ == "__main__":
    # Enroll every speaker found under the reference-audio folder.
    speaker_folder = "known_speakers"
    load_known_speakers(speaker_folder)

    # Recording to identify; replace with the path of your own audio file.
    test_audio_path = "wubin_test.wav"
    speaker, similarity = recognize_speaker_from_file(test_audio_path)

    # A falsy name means recognition failed.
    if not speaker:
        print("无法识别说话人")
    else:
        print(f"识别结果: {speaker}")
        print(f"相似度: {similarity:.4f}")