|
|
|
|
from resemblyzer import preprocess_wav, VoiceEncoder
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
import numpy as np
|
|
|
|
|
import soundfile as sf
|
|
|
|
|
import io
|
|
|
|
|
|
|
|
|
|
# Registry of enrolled speakers: maps a speaker's name to the reference
# voiceprint embedding that the recognition functions compare against.
known_speakers = {}

# Voiceprint encoder, created once at import time and shared by every
# function in this module.
encoder = VoiceEncoder()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_known_speakers(speaker_folder):
    """
    Enroll every speaker found under ``speaker_folder``.

    Each immediate sub-directory is treated as one speaker, named after the
    directory. Every ``*.wav`` file directly inside it is embedded with the
    module-level ``encoder``; the embeddings are averaged, rescaled to unit
    length and stored in the module-level ``known_speakers`` dict under the
    speaker's name. Sub-directories without any ``*.wav`` file are skipped.

    :param speaker_folder: path of the folder holding one sub-directory per speaker
    """
    speaker_folder = Path(speaker_folder)
    # Sorted traversal keeps enrollment order (and thus tie-breaking between
    # equally similar speakers) independent of filesystem listing order.
    for speaker_dir in sorted(speaker_folder.iterdir()):
        if not speaker_dir.is_dir():
            continue
        speaker_name = speaker_dir.name
        embeddings = [
            encoder.embed_utterance(preprocess_wav(wav_file))
            for wav_file in sorted(speaker_dir.glob("*.wav"))
        ]
        if not embeddings:
            continue
        centroid = np.mean(embeddings, axis=0)
        # The mean of several embeddings is generally shorter than the
        # embeddings themselves. Recognition scores with a dot product, which
        # only behaves like cosine similarity for unit-length vectors, so the
        # averaged voiceprint is rescaled before it is stored.
        centroid_norm = np.linalg.norm(centroid)
        if centroid_norm > 0:
            centroid = centroid / centroid_norm
        known_speakers[speaker_name] = centroid
        print(f"已加载说话人: {speaker_name}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def recognize_speaker_from_file(audio_file_path, threshold=0.7):
    """
    Identify the speaker of an audio file.

    The file is embedded with the module-level ``encoder`` and compared with
    every entry of ``known_speakers`` by cosine similarity; the first speaker
    with the highest score wins.

    :param audio_file_path: path of the audio file to identify
    :param threshold: minimum cosine similarity; below it the speaker is
        reported as unknown
    :return: ``(speaker name, similarity)``. The name is ``"未知说话人"`` when
        the best score is below ``threshold`` (including when no speaker is
        enrolled, in which case the similarity is ``-1``), and the result is
        ``(None, 0)`` when any error occurs while processing.
    """
    # Deliberate best-effort boundary: any failure (unreadable file, encoder
    # error, ...) is reported on stdout and turned into (None, 0).
    try:
        # Pre-process the audio file and extract its voiceprint.
        wav = preprocess_wav(audio_file_path)
        embedding = encoder.embed_utterance(wav)
        embedding_norm = np.linalg.norm(embedding)

        best_similarity = -1
        best_speaker = None

        # Compare against every enrolled speaker.
        for speaker, known_embedding in known_speakers.items():
            # Divide by both norms so the score is a true cosine similarity
            # even when a stored voiceprint is not unit length (e.g. a plain
            # average of several embeddings).
            scale = embedding_norm * np.linalg.norm(known_embedding)
            if scale > 0:
                similarity = float(np.dot(embedding, known_embedding) / scale)
            else:
                # A zero vector has no direction; score it as a raw dot
                # product would (0).
                similarity = 0.0
            if similarity > best_similarity:
                best_similarity = similarity
                best_speaker = speaker

        # Below the threshold the best match is not trusted.
        if best_similarity < threshold:
            return "未知说话人", best_similarity

        return best_speaker, best_similarity

    except Exception as e:
        print(f"识别说话人时出错: {str(e)}")
        return None, 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def recognize_speaker(audio):
    """
    Identify the speaker of an in-memory audio clip.

    The clip is decoded, embedded with the module-level ``encoder`` and
    compared with every entry of ``known_speakers`` by cosine similarity.

    :param audio: object whose ``get_wav_data()`` method returns the clip
        (presumably the bytes of a complete WAV file, as
        ``speech_recognition.AudioData`` provides; confirm against the caller)
    :return: name of the most similar enrolled speaker, or ``None`` when no
        speaker is enrolled. No similarity threshold is applied.
    """
    # preprocess_wav takes a file path or a sample array, not raw WAV bytes,
    # so decode the bytes into float samples first.
    samples, sample_rate = sf.read(io.BytesIO(audio.get_wav_data()), dtype="float32")
    if samples.ndim > 1:
        # Multi-channel data comes back as (frames, channels); mix to mono.
        samples = samples.mean(axis=1)
    # Pass the source rate so the clip is resampled to the encoder's rate.
    wav = preprocess_wav(samples, source_sr=sample_rate)
    embedding = encoder.embed_utterance(wav)
    embedding_norm = np.linalg.norm(embedding)

    best_similarity = -1
    best_speaker = None
    for speaker, known_embedding in known_speakers.items():
        # Divide by both norms so the score is a true cosine similarity even
        # when a stored voiceprint is not unit length.
        scale = embedding_norm * np.linalg.norm(known_embedding)
        if scale > 0:
            similarity = float(np.dot(embedding, known_embedding) / scale)
        else:
            # A zero vector has no direction; score it as a raw dot product
            # would (0).
            similarity = 0.0
        if similarity > best_similarity:
            best_similarity = similarity
            best_speaker = speaker
    return best_speaker
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Enroll the known speakers' voiceprints.
    speaker_folder = "known_speakers"  # folder with the known speakers' audio
    load_known_speakers(speaker_folder)

    # Audio file to identify; replace with the path of your own recording.
    test_audio_path = "huanghai_test.wav"
    speaker, similarity = recognize_speaker_from_file(test_audio_path)

    # A falsy name means recognition failed; otherwise report the match.
    if not speaker:
        print("无法识别说话人")
    else:
        print(f"识别结果: {speaker}")
        print(f"相似度: {similarity:.4f}")
|