import json
import asyncio
import time
from core.providers.tts.dto.dto import SentenceType
from core.utils.util import get_string_no_punctuation_or_emoji, analyze_emotion
from loguru import logger

TAG = __name__

# Maps the emotion label returned by analyze_emotion() to the emoji that is
# sent to the client in the "llm" message.
emoji_map = {
    "neutral": "😶",
    "happy": "🙂",
    "laughing": "😆",
    "funny": "😂",
    "sad": "😔",
    "angry": "😠",
    "crying": "😭",
    "loving": "😍",
    "embarrassed": "😳",
    "surprised": "😲",
    "shocked": "😱",
    "thinking": "🤔",
    "winking": "😉",
    "cool": "😎",
    "relaxed": "😌",
    "delicious": "🤤",
    "kissy": "😘",
    "confident": "😏",
    "sleepy": "😴",
    "silly": "😜",
    "confused": "🙄",
}


async def sendAudioMessage(conn, sentenceType, audios, text):
    """Send one synthesized sentence (emotion, TTS markers and audio) to the client.

    Args:
        conn: Connection object; this function uses its ``logger``,
            ``websocket``, ``session_id``, ``tts``, ``llm_finish_task``,
            ``close_after_chat`` attributes and its ``close()`` coroutine.
        sentenceType: Sentence marker; compared against ``SentenceType.LAST``
            to decide whether the "stop" message must be sent.
        audios: Sequence of audio frames forwarded to ``sendAudio``.
        text: Sentence text, or ``None``. When ``None`` no emotion message is
            sent and the first-sentence pre-buffering is not triggered.
    """
    conn.logger.bind(tag=TAG).info(f"发送音频消息: {sentenceType}, {text}")

    if text is not None:
        # Tell the client which emotion/emoji accompanies this sentence.
        emotion = analyze_emotion(text)
        emoji = emoji_map.get(emotion, "🙂")  # fall back to the smiley face
        emotion_message = {
            "type": "llm",
            "text": emoji,
            "emotion": emotion,
            "session_id": conn.session_id,
        }
        await conn.websocket.send(json.dumps(emotion_message))

    # Only the first sentence that actually has text is pre-buffered.
    pre_buffer = False
    if conn.tts.tts_audio_first_sentence and text is not None:
        conn.logger.bind(tag=TAG).info(f"发送第一段语音: {text}")
        conn.tts.tts_audio_first_sentence = False
        pre_buffer = True

    await send_tts_message(conn, "sentence_start", text)
    await sendAudio(conn, audios, pre_buffer)
    await send_tts_message(conn, "sentence_end", text)

    # Send the stop message when this was the last sentence of the reply.
    if conn.llm_finish_task and sentenceType == SentenceType.LAST:
        await send_tts_message(conn, "stop", None)
        conn.client_is_speaking = False
        if conn.close_after_chat:
            await conn.close()


async def sendAudio(conn, audios, pre_buffer=True):
    """Stream audio frames over the websocket, paced at one frame per 60 ms.

    Args:
        conn: Connection object; uses ``websocket``, ``client_abort`` and
            updates ``last_activity_time``.
        audios: Sequence of frames to send. ``None`` or an empty sequence is
            a no-op.
        pre_buffer: When true, up to the first three frames are sent
            immediately (without pacing) before the paced loop starts.
    """
    if audios is None or len(audios) == 0:
        return

    # Flow-control parameters.
    frame_duration = 60  # frame duration in milliseconds, matching the Opus encoding
    start_time = time.perf_counter()
    play_position = 0  # milliseconds of paced audio scheduled so far

    if pre_buffer:
        # Push the leading frames right away; they are not paced and do not
        # advance play_position.
        pre_buffer_frames = min(3, len(audios))
        for frame in audios[:pre_buffer_frames]:
            await conn.websocket.send(frame)
        remaining_audios = audios[pre_buffer_frames:]
    else:
        remaining_audios = audios

    # Send the remaining frames on schedule.
    for opus_packet in remaining_audios:
        if conn.client_abort:
            break

        # Refresh the activity timestamp (milliseconds since the epoch).
        conn.last_activity_time = time.time() * 1000

        # Sleep until this frame's expected send time, measured from
        # start_time so that per-iteration delays do not accumulate.
        expected_time = start_time + (play_position / 1000)
        delay = expected_time - time.perf_counter()
        if delay > 0:
            await asyncio.sleep(delay)

        await conn.websocket.send(opus_packet)
        play_position += frame_duration


async def send_tts_message(conn, state, text=None):
    """Send a TTS state message to the client.

    Args:
        conn: Connection object; uses ``session_id``, ``config``, ``tts``,
            ``websocket`` and ``clearSpeakStatus()``.
        state: State string placed in the message's ``"state"`` field. The
            value ``"stop"`` additionally plays the optional notification
            sound and clears the server-side speaking status.
        text: Optional text; included in the message only when not ``None``.
    """
    message = {"type": "tts", "state": state, "session_id": conn.session_id}
    if text is not None:
        message["text"] = text

    # TTS playback finished.
    if state == "stop":
        # Optionally play the notification sound before reporting the stop.
        if conn.config.get("enable_stop_tts_notify", False):
            stop_tts_notify_voice = conn.config.get(
                "stop_tts_notify_voice", "config/assets/tts_notify.mp3"
            )
            audios, _ = conn.tts.audio_to_opus_data(stop_tts_notify_voice)
            await sendAudio(conn, audios)
        # Clear the server-side speaking status.
        conn.clearSpeakStatus()

    # Deliver the message to the client.
    await conn.websocket.send(json.dumps(message))


async def send_stt_message(conn, text):
    """Send an STT state message, followed by the TTS "start" message.

    If ``text`` equals the configured end prompt (``end_prompt.prompt``), no
    STT message is sent and only the TTS "start" message goes out.

    Otherwise, when ``text`` looks like a JSON object that has a ``"content"``
    key, only that content is shown; any other text is used as-is. The shown
    text is passed through ``get_string_no_punctuation_or_emoji`` before being
    sent.

    Args:
        conn: Connection object; uses ``config``, ``websocket``,
            ``session_id`` and sets ``client_is_speaking``.
        text: Recognized text, possibly a JSON string carrying a ``"content"``
            field.
    """
    # `or {}` tolerates a config where "end_prompt" is present but null.
    end_prompt_str = (conn.config.get("end_prompt") or {}).get("prompt")
    if end_prompt_str and end_prompt_str == text:
        await send_tts_message(conn, "start")
        return

    # Extract what the user actually said when the text is JSON-wrapped.
    display_text = text
    try:
        stripped = text.strip()
        if stripped.startswith("{") and stripped.endswith("}"):
            parsed_data = json.loads(text)
            if isinstance(parsed_data, dict) and "content" in parsed_data:
                # JSON payload (e.g. one that also carries speaker info):
                # show only the content part.
                display_text = parsed_data["content"]
    except (json.JSONDecodeError, TypeError):
        # Not valid JSON: fall back to the original text.
        display_text = text

    stt_text = get_string_no_punctuation_or_emoji(display_text)
    await conn.websocket.send(
        json.dumps({"type": "stt", "text": stt_text, "session_id": conn.session_id})
    )
    conn.client_is_speaking = True
    await send_tts_message(conn, "start")