diff --git a/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/requirements.txt b/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/requirements.txt index d19365af..2f953e58 100644 --- a/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/requirements.txt +++ b/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/requirements.txt @@ -28,3 +28,4 @@ PySocks==1.7.1 dashscope==1.23.1 aiomysql==0.2.0 asyncio==3.4.3 +resemblyzer==0.1.4 diff --git a/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/huanghai_test.wav b/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/huanghai_test.wav new file mode 100644 index 00000000..803ee004 Binary files /dev/null and b/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/huanghai_test.wav differ diff --git a/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/known_speakers/hh/HH.wav b/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/known_speakers/hh/HH.wav new file mode 100644 index 00000000..cdb241bb Binary files /dev/null and b/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/known_speakers/hh/HH.wav differ diff --git a/XiaoZhi/文档/【黄海】今天的天气怎么样.wav b/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/known_speakers/hh/【黄海】今天的天气怎么样.wav similarity index 100% rename from XiaoZhi/文档/【黄海】今天的天气怎么样.wav rename to XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/known_speakers/hh/【黄海】今天的天气怎么样.wav diff --git a/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/known_speakers/wb/WB.wav b/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/known_speakers/wb/WB.wav new file mode 100644 index 00000000..cdc4f6c5 Binary files /dev/null and b/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/known_speakers/wb/WB.wav differ diff --git a/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/testResemblyzer.py b/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/testResemblyzer.py new file mode 100644 index 00000000..40ed31c3 --- /dev/null +++ b/XiaoZhi/xiaozhi-esp32-server/main/xiaozhi-server/test/testResemblyzer.py 
"""Manual smoke test for speaker identification with Resemblyzer.

Builds one reference voiceprint per known speaker from a folder of WAV files,
then matches a test recording against those references.
"""
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path
import numpy as np
import soundfile as sf
import io

# Voiceprint encoder shared by every function in this module.
encoder = VoiceEncoder()

# Maps speaker name -> reference voiceprint (unit-length embedding).
known_speakers = {}


def load_known_speakers(speaker_folder):
    """Load reference voiceprints for every speaker found in a folder.

    Each sub-directory of ``speaker_folder`` is treated as one speaker (the
    directory name is the speaker name) and every ``*.wav`` file inside it is
    embedded. The per-file embeddings are averaged and stored in the
    module-level ``known_speakers`` dict. Sub-directories without WAV files
    are skipped.

    :param speaker_folder: path of the folder holding one sub-directory per speaker
    """
    speaker_folder = Path(speaker_folder)
    for speaker_dir in speaker_folder.iterdir():
        if not speaker_dir.is_dir():
            continue
        speaker_name = speaker_dir.name
        embeddings = [
            encoder.embed_utterance(preprocess_wav(wav_file))
            for wav_file in speaker_dir.glob("*.wav")
        ]
        if not embeddings:
            continue
        mean_embedding = np.mean(embeddings, axis=0)
        # The mean of unit vectors is generally shorter than unit length.
        # Re-normalise so the dot product used for matching is a true cosine
        # similarity and the threshold means the same thing regardless of how
        # many reference files a speaker has.
        norm = np.linalg.norm(mean_embedding)
        if norm > 0:
            mean_embedding = mean_embedding / norm
        known_speakers[speaker_name] = mean_embedding
        print(f"已加载说话人: {speaker_name}")


def _find_best_match(embedding):
    """Return ``(speaker, similarity)`` of the closest known voiceprint.

    Returns ``(None, -1)`` when no stored voiceprint scores above -1, which
    includes the case where ``known_speakers`` is empty.
    """
    best_speaker = None
    best_similarity = -1
    for speaker, known_embedding in known_speakers.items():
        similarity = np.dot(embedding, known_embedding)
        if similarity > best_similarity:
            best_similarity = similarity
            best_speaker = speaker
    return best_speaker, best_similarity


def recognize_speaker_from_file(audio_file_path, threshold=0.7):
    """Identify the speaker of an audio file.

    :param audio_file_path: path of the audio file to identify
    :param threshold: similarity threshold; a best match below this value is
        reported as an unknown speaker
    :return: ``(speaker name, similarity)``; the name is ``"未知说话人"`` when
        the best similarity is below ``threshold``, and the result is
        ``(None, 0)`` when any step raises
    """
    try:
        # Load and preprocess the recording, then extract its voiceprint.
        wav = preprocess_wav(audio_file_path)
        embedding = encoder.embed_utterance(wav)

        best_speaker, best_similarity = _find_best_match(embedding)

        # Too dissimilar from every reference: report an unknown speaker.
        if best_similarity < threshold:
            return "未知说话人", best_similarity

        return best_speaker, best_similarity

    except Exception as e:
        # Best-effort script behaviour: report the failure and return a sentinel.
        print(f"识别说话人时出错: {str(e)}")
        return None, 0


def recognize_speaker(audio):
    """Identify the speaker of an in-memory recording.

    No similarity threshold is applied: the closest known speaker is always
    returned.

    :param audio: object exposing ``get_wav_data()`` that returns the bytes of
        a WAV file (presumably a ``speech_recognition.AudioData`` - confirm
        against the caller)
    :return: name of the closest known speaker, or ``None`` when no speakers
        are loaded
    """
    # preprocess_wav only accepts a path or a waveform array, so the WAV bytes
    # must be decoded first; the source sample rate is passed along so the
    # waveform is resampled to the rate the encoder expects.
    samples, sample_rate = sf.read(io.BytesIO(audio.get_wav_data()), dtype="float32")
    if samples.ndim > 1:
        # Down-mix multi-channel audio to mono.
        samples = samples.mean(axis=1)
    wav = preprocess_wav(samples, source_sr=sample_rate)
    embedding = encoder.embed_utterance(wav)
    best_speaker, _ = _find_best_match(embedding)
    return best_speaker


if __name__ == "__main__":
    # Resolve fixtures relative to this script so it works from any working
    # directory; the sample audio lives next to this file.
    base_dir = Path(__file__).resolve().parent

    # Load the reference voiceprints of the known speakers.
    speaker_folder = base_dir / "known_speakers"
    load_known_speakers(speaker_folder)

    # Audio file to identify; replace with your own recording.
    test_audio_path = base_dir / "huanghai_test.wav"
    speaker, similarity = recognize_speaker_from_file(test_audio_path)

    if speaker:
        print(f"识别结果: {speaker}")
        print(f"相似度: {similarity:.4f}")
    else:
        print("无法识别说话人")