语音设置
+ +生成结果
+ +正在生成语音,请稍候...
+暂无生成结果
+diff --git a/dsLightRag/Routes/VideoRetalkRoute.py b/dsLightRag/Routes/VideoRetalkRoute.py new file mode 100644 index 00000000..dfa59edc --- /dev/null +++ b/dsLightRag/Routes/VideoRetalkRoute.py @@ -0,0 +1,132 @@ +import datetime +import logging +import uuid +from typing import Optional + +from fastapi import APIRouter, HTTPException, Query, Request +from fastapi.responses import JSONResponse +from pydantic import BaseModel + +from Config import Config +from Util.VideoRetalk import VideoRetalk + +# 创建视频生成路由 +router = APIRouter(prefix="/api/video", tags=["视频生成"]) + +# 配置日志 +logger = logging.getLogger(__name__) + +# 仅保留视频相关模型定义 +class VideoRetalkRequest(BaseModel): + """视频生成请求参数""" + image_url: str + audio_url: str + template_id: Optional[str] = "normal" + eye_move_freq: Optional[float] = 0.5 + video_fps: Optional[int] = 30 + mouth_move_strength: Optional[float] = 1.0 + paste_back: Optional[bool] = True + head_move_strength: Optional[float] = 0.7 + + +class VideoRetalkResponse(BaseModel): + """视频生成响应""" + success: bool + message: str + task_id: Optional[str] = None + video_url: Optional[str] = None + video_duration: Optional[float] = None + video_ratio: Optional[str] = None + request_id: Optional[str] = None + + +@router.post("/generate", response_model=VideoRetalkResponse) +async def generate_video(request: VideoRetalkRequest): + """ + 生成人物朗读视频接口 + 根据输入的人物图片和音频,生成口型匹配的朗读视频 + """ + try: + # 初始化VideoRetalk实例 + video_retalk = VideoRetalk(Config.ALY_LLM_API_KEY) + + # 调用视频生成方法 + video_url = video_retalk.generate_video( + image_url=request.image_url, + audio_url=request.audio_url, + template_id=request.template_id, + eye_move_freq=request.eye_move_freq, + video_fps=request.video_fps, + mouth_move_strength=request.mouth_move_strength, + paste_back=request.paste_back, + head_move_strength=request.head_move_strength + ) + + if video_url: + return VideoRetalkResponse( + success=True, + message="视频生成成功", + video_url=video_url, + # 以下字段在实际实现中可以从API响应中获取 + task_id=str(uuid.uuid4()), + video_duration=10.23, # 示例值,实际应从API响应获取 + video_ratio="standard", # 示例值,实际应从API响应获取 + request_id=str(uuid.uuid4()) + ) + else: + return VideoRetalkResponse( + success=False, + message="视频生成失败" + ) + + except Exception as e: + logger.error(f"视频生成接口错误: {e}") + raise HTTPException( + status_code=500, + detail=f"视频生成失败: {str(e)}" + ) + + +@router.get("/task/status") +async def get_task_status(task_id: str = Query(..., description="任务ID")): + """ + 查询视频生成任务状态 + """ + try: + video_retalk = VideoRetalk(Config.ALY_LLM_API_KEY) + task_status = video_retalk.get_task_status(task_id) + + return { + "success": True, + "data": task_status + } + + except Exception as e: + logger.error(f"查询任务状态错误: {e}") + raise HTTPException( + status_code=500, + detail=f"查询任务状态失败: {str(e)}" + ) + + +@router.get("/health") +async def health_check(): + """ + 健康检查接口 + """ + return { + "status": "healthy", + "timestamp": datetime.datetime.now().isoformat(), + "service": "VideoRetalk API" + } + + +# 保留全局异常处理 +def global_exception_handler(request: Request, exc: Exception): + logger.error(f"全局异常: {exc}") + return JSONResponse( + status_code=500, + content={"success": False, "message": f"服务器内部错误: {str(exc)}"} + ) + + diff --git a/dsLightRag/Routes/__pycache__/VideoRetalkRoute.cpython-310.pyc b/dsLightRag/Routes/__pycache__/VideoRetalkRoute.cpython-310.pyc new file mode 100644 index 00000000..9d7d585a Binary files /dev/null and b/dsLightRag/Routes/__pycache__/VideoRetalkRoute.cpython-310.pyc differ diff --git a/dsLightRag/Routes/__pycache__/ttsRoute.cpython-310.pyc b/dsLightRag/Routes/__pycache__/ttsRoute.cpython-310.pyc new file mode 100644 index 00000000..3a0e2f68 Binary files /dev/null and b/dsLightRag/Routes/__pycache__/ttsRoute.cpython-310.pyc differ diff --git a/dsLightRag/Routes/ttsRoute.py b/dsLightRag/Routes/ttsRoute.py new file mode 100644 index 00000000..17a2bc46 --- /dev/null +++ b/dsLightRag/Routes/ttsRoute.py @@ -0,0 +1,145 @@ +import logging +import uuid +from typing import Optional + +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel + +from Util.GengerateAudio import ByteDanceTTS + +# 创建声音生成路由 +router = APIRouter(prefix="/api/tts", tags=["声音生成"]) + +# 配置日志 +logger = logging.getLogger(__name__) + +# 初始化TTS实例 +tts_instance = ByteDanceTTS() + +class TextToSpeechRequest(BaseModel): + """文本转语音请求参数""" + text: str + voice_type: Optional[str] = None + speed_ratio: Optional[float] = 1.0 + volume_ratio: Optional[float] = 1.0 + pitch_ratio: Optional[float] = 1.0 + encoding: Optional[str] = "mp3" + + +class TextToSpeechResponse(BaseModel): + """文本转语音响应""" + success: bool + message: str + audio_url: Optional[str] = None + audio_size: Optional[float] = None + audio_format: Optional[str] = None + request_id: Optional[str] = None + + +@router.get("/voices/categories") +async def get_voice_categories(): + """ + 获取所有音色分类接口 + 返回所有可用的音色分类列表 + """ + try: + categories = tts_instance.get_all_categories() + return { + "success": True, + "data": categories, + "message": "获取音色分类成功" + } + except Exception as e: + logger.error(f"获取音色分类错误: {e}") + raise HTTPException( + status_code=500, + detail=f"获取音色分类失败: {str(e)}" + ) + + +@router.get("/voices/by-category/{category}") +async def get_voices_by_category(category: str): + """ + 根据分类获取音色列表接口 + Args: + category: 音色分类名称 + 返回指定分类下的所有音色列表 + """ + try: + voices = tts_instance.get_voices_by_category(category) + if not voices: + return { + "success": False, + "message": f"未找到分类 '{category}' 下的音色" + } + + return { + "success": True, + "data": voices, + "message": f"获取分类 '{category}' 下的音色成功" + } + except Exception as e: + logger.error(f"获取分类 '{category}' 下的音色错误: {e}") + raise HTTPException( + status_code=500, + detail=f"获取分类 '{category}' 下的音色失败: {str(e)}" + ) + + +@router.get("/voices/all") +async def get_all_voices(): + """ + 获取所有音色分类和音色列表接口 + 返回所有音色分类和每个分类下的音色列表 + """ + try: + all_voices = tts_instance.get_all_voices() + return { + "success": True, + "data": all_voices, + "message": "获取所有音色分类和列表成功" + } + except Exception as e: + logger.error(f"获取所有音色分类和列表错误: {e}") + raise HTTPException( + status_code=500, + detail=f"获取所有音色分类和列表失败: {str(e)}" + ) + + +@router.post("/generate", response_model=TextToSpeechResponse) +async def generate_audio(request: TextToSpeechRequest): + """ + 文本转语音接口 + 根据输入文本和语音参数生成音频文件 + """ + try: + # 调用TTS工具生成音频 + audio_url = tts_instance.generate_audio( + text=request.text, + voice_type=request.voice_type, + speed_ratio=request.speed_ratio, + volume_ratio=request.volume_ratio, + pitch_ratio=request.pitch_ratio, + encoding=request.encoding + ) + + if audio_url: + return TextToSpeechResponse( + success=True, + message="音频生成成功", + audio_url=audio_url, + audio_format=request.encoding, + request_id=str(uuid.uuid4()) + ) + else: + return TextToSpeechResponse( + success=False, + message="音频生成失败" + ) + except Exception as e: + logger.error(f"文本转语音接口错误: {e}") + raise HTTPException( + status_code=500, + detail=f"音频生成失败: {str(e)}" + ) \ No newline at end of file diff --git a/dsLightRag/Start.py b/dsLightRag/Start.py index 71532839..340a9f2f 100644 --- a/dsLightRag/Start.py +++ b/dsLightRag/Start.py @@ -27,7 +27,8 @@ from Routes.TeachingModel.tasks.BackgroundTasks import train_document_task from Routes.XueBanRoute import router as xueban_router from Routes.ZuoWen import router as zuowen_router from Routes.RecognizeEduQuestion import router as ocr_router - +from Routes.VideoRetalkRoute import router as videoRetalk_router +from Routes.ttsRoute import router as tts_router # 控制日志输出 logger = logging.getLogger('lightrag') logger.setLevel(logging.INFO) @@ -80,6 +81,10 @@ app.include_router(mj_router) # Midjourney路由 app.include_router(qwen_image_router) # Qwen Image 路由 app.include_router(ocr_router) # 教育场景识别 +app.include_router(videoRetalk_router) # 视频复读 +app.include_router(tts_router) # 文本转语音 + + # Teaching Model 相关路由 # 登录相关(不用登录) app.include_router(login_router, prefix="/api/login", tags=["login"]) diff --git a/dsLightRag/Test/GengerateAudio.py b/dsLightRag/Test/GengerateAudio.py deleted file mode 100644 index 8285f645..00000000 --- a/dsLightRag/Test/GengerateAudio.py +++ /dev/null @@ -1,73 +0,0 @@ -#coding=utf-8 - -''' -requires Python 3.6 or later -pip install requests -''' -import base64 -import json -import uuid -import requests - -from Config.Config import HS_APP_ID, HS_ACCESS_TOKEN, HS_CLUSTER_ID, HS_VOICE_TYPE_QINCANG - -# 填写平台申请的appid, access_token以及cluster -appid = HS_APP_ID -access_token= HS_ACCESS_TOKEN -cluster = HS_CLUSTER_ID - -voice_type = HS_VOICE_TYPE_QINCANG -host = "openspeech.bytedance.com" -api_url = f"https://{host}/api/v1/tts" - -header = {"Authorization": f"Bearer;{access_token}"} - -request_json = { - "app": { - "appid": appid, - "token": "access_token", - "cluster": cluster - }, - "user": { - "uid": "388808087185088" - }, - "audio": { - "voice_type": voice_type, - "encoding": "mp3", - "speed_ratio": 1.0, - "volume_ratio": 1.0, - "pitch_ratio": 1.0, - }, - "request": { - "reqid": str(uuid.uuid4()), - "text": """ - 君不见,黄河之水天上来,奔流到海不复回。 - 君不见,高堂明镜悲白发,朝如青丝暮成雪。 - 人生得意须尽欢,莫使金樽空对月。 - 天生我材必有用,千金散尽还复来。 - 烹羊宰牛且为乐,会须一饮三百杯。 - 岑夫子,丹丘生,将进酒,杯莫停。 - 与君歌一曲,请君为我倾耳听。 - 钟鼓馔玉不足贵,但愿长醉不复醒。 - 古来圣贤皆寂寞,惟有饮者留其名。 - 陈王昔时宴平乐,斗酒十千恣欢谑。 - 主人何为言少钱,径须沽取对君酌。 - 五花马,千金裘,呼儿将出换美酒,与尔同销万古愁。 - """, - "text_type": "plain", - "operation": "query", - "with_frontend": 1, - "frontend_type": "unitTson" - } -} - -if __name__ == '__main__': - try: - resp = requests.post(api_url, json.dumps(request_json), headers=header) - #print(f"resp body: \n{resp.json()}") - if "data" in resp.json(): - data = resp.json()["data"] - file_to_save = open("test_submit.mp3", "wb") - file_to_save.write(base64.b64decode(data)) - except Exception as e: - e.with_traceback() diff --git a/dsLightRag/Util/GengerateAudio.py b/dsLightRag/Util/GengerateAudio.py new file mode 100644 index 00000000..0fe73504 --- /dev/null +++ b/dsLightRag/Util/GengerateAudio.py @@ -0,0 +1,284 @@ +#coding=utf-8 + +''' +字节跳动语音合成API封装类 +requires Python 3.6 or later +pip install requests +''' +import base64 +import json +import uuid +import requests +from typing import Optional, Dict, Any +from pathlib import Path + +from Config.Config import HS_APP_ID, HS_ACCESS_TOKEN, HS_CLUSTER_ID, HS_VOICE_TYPE_QINCANG + + +# 在ByteDanceTTS类中添加以下音色分类字典 + +class ByteDanceTTS: + """ + 字节跳动语音合成API封装类 + 提供文本转语音功能 + """ + + # 音色分类字典 + TTS_VOICES = { + "通用场景": { + "zh_female_xiaoxue_moon_bigtts": "小雪(女声,温柔亲切)", + "zh_male_xiaofeng_common": "小峰(男声,沉稳大气)", + "zh_female_xiaoxin_common": "小新(女声,自然流畅)", + "zh_male_xiaoyu_common": "小鱼(男声,年轻活力)" + }, + "有声阅读": { + "zh_female_xiaoxue_moon_bigtts": "小雪(女声,温柔亲切)", + "zh_female_xiaoxin_common": "小新(女声,自然流畅)", + "zh_female_xiaomei_moon_bigtts": "小美(女声,甜美温柔)", + "zh_female_xiaoli_moon_bigtts": "小丽(女声,清晰标准)" + }, + "智能助手": { + "zh_female_xiaoxue_moon_bigtts": "小雪(女声,温柔亲切)", + "zh_male_xiaofeng_common": "小峰(男声,沉稳大气)", + "zh_female_xiaoxin_common": "小新(女声,自然流畅)", + "zh_male_xiaoyu_common": "小鱼(男声,年轻活力)" + }, + "视频配音": { + "zh_male_xiaofeng_common": "小峰(男声,沉稳大气)", + "zh_female_xiaomei_moon_bigtts": "小美(女声,甜美温柔)", + "zh_female_xiaoli_moon_bigtts": "小丽(女声,清晰标准)", + "zh_male_xiaoyu_common": "小鱼(男声,年轻活力)" + }, + "特色音色": { + "zh_female_xiaoxue_moon_bigtts": "小雪(女声,温柔亲切)", + "zh_female_xiaomei_moon_bigtts": "小美(女声,甜美温柔)" + }, + "广告配音": { + "zh_male_xiaofeng_common": "小峰(男声,沉稳大气)", + "zh_female_xiaoli_moon_bigtts": "小丽(女声,清晰标准)" + }, + "新闻播报": { + "zh_female_xiaoli_moon_bigtts": "小丽(女声,清晰标准)", + "zh_male_xiaofeng_common": "小峰(男声,沉稳大气)" + }, + "教育场景": { + "zh_female_xiaoxin_common": "小新(女声,自然流畅)", + "zh_male_xiaoyu_common": "小鱼(男声,年轻活力)" + } + } + + def __init__(self, + app_id: Optional[str] = None, + access_token: Optional[str] = None, + cluster_id: Optional[str] = None, + voice_type: Optional[str] = None): + """ + 初始化语音合成类 + + Args: + app_id: 应用ID,默认为Config中的HS_APP_ID + access_token: 访问令牌,默认为Config中的HS_ACCESS_TOKEN + cluster_id: 集群ID,默认为Config中的HS_CLUSTER_ID + voice_type: 声音类型,默认为Config中的HS_VOICE_TYPE_QINCANG + """ + self.app_id = app_id or HS_APP_ID + self.access_token = access_token or HS_ACCESS_TOKEN + self.cluster_id = cluster_id or HS_CLUSTER_ID + self.voice_type = voice_type or HS_VOICE_TYPE_QINCANG + + self.host = "openspeech.bytedance.com" + self.api_url = f"https://{self.host}/api/v1/tts" + self.header = {"Authorization": f"Bearer;{self.access_token}"} + + def generate_audio(self, + text: str, + output_path: Optional[str] = None, + voice_type: Optional[str] = None, + encoding: str = "mp3", + speed_ratio: float = 1.0, + volume_ratio: float = 1.0, + pitch_ratio: float = 1.0, + text_type: str = "plain", + operation: str = "query") -> Optional[bytes]: + """ + 生成语音音频 + + Args: + text: 要转换的文本内容 + output_path: 输出文件路径,如果提供则保存为文件 + voice_type: 声音类型,覆盖初始化设置 + encoding: 音频编码格式,默认mp3 + speed_ratio: 语速比例,默认1.0 + volume_ratio: 音量比例,默认1.0 + pitch_ratio: 音调比例,默认1.0 + text_type: 文本类型,默认plain + operation: 操作类型,默认query + + Returns: + bytes: 音频二进制数据,失败返回None + """ + # 构建请求JSON + request_json = { + "app": { + "appid": self.app_id, + "token": "access_token", + "cluster": self.cluster_id + }, + "user": { + "uid": str(uuid.uuid4()) # 使用随机用户ID + }, + "audio": { + "voice_type": voice_type or self.voice_type, + "encoding": encoding, + "speed_ratio": speed_ratio, + "volume_ratio": volume_ratio, + "pitch_ratio": pitch_ratio, + }, + "request": { + "reqid": str(uuid.uuid4()), + "text": text, + "text_type": text_type, + "operation": operation, + "with_frontend": 1, + "frontend_type": "unitTson" + } + } + + try: + # 发送请求 + resp = requests.post(self.api_url, json.dumps(request_json), headers=self.header) + resp.raise_for_status() + + resp_data = resp.json() + + if "data" in resp_data: + audio_data = base64.b64decode(resp_data["data"]) + + # 如果提供了输出路径,保存文件 + if output_path: + self.save_audio(audio_data, output_path) + + return audio_data + else: + print(f"API响应中未包含音频数据: {resp_data}") + return None + + except requests.exceptions.RequestException as e: + print(f"请求失败: {e}") + return None + except Exception as e: + print(f"生成音频失败: {e}") + return None + + def save_audio(self, audio_data: bytes, output_path: str) -> bool: + """ + 保存音频数据到文件 + + Args: + audio_data: 音频二进制数据 + output_path: 输出文件路径 + + Returns: + bool: 保存是否成功 + """ + try: + # 确保目录存在 + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, "wb") as f: + f.write(audio_data) + + print(f"音频已保存到: {output_path}") + return True + + except Exception as e: + print(f"保存音频失败: {e}") + return False + + def get_audio_info(self, audio_data: bytes) -> Dict[str, Any]: + """ + 获取音频信息 + + Args: + audio_data: 音频二进制数据 + + Returns: + Dict: 包含音频大小和格式的信息 + """ + return { + "size_bytes": len(audio_data), + "size_kb": len(audio_data) / 1024, + "format": "mp3" # 目前固定为mp3格式 + } + + def get_voices_by_category(self, category: str) -> Dict[str, str]: + """ + 根据分类获取音色列表 + + Args: + category: 分类名称 + + Returns: + Dict: 音色字典,key为voice_type,value为音色描述 + """ + return self.TTS_VOICES.get(category, {}) + + def get_all_categories(self) -> list: + """ + 获取所有音色分类 + + Returns: + list: 分类名称列表 + """ + return list(self.TTS_VOICES.keys()) + + def get_all_voices(self) -> Dict[str, Dict[str, str]]: + """ + 获取所有音色分类和音色列表 + + Returns: + Dict: 所有音色分类和音色列表 + """ + return self.TTS_VOICES + + +def main(): + """示例用法""" + # 创建语音合成实例 + tts = ByteDanceTTS() + + # 要转换的文本 + text = """ + 君不见,黄河之水天上来,奔流到海不复回。 + 君不见,高堂明镜悲白发,朝如青丝暮成雪。 + 人生得意须尽欢,莫使金樽空对月。 + 天生我材必有用,千金散尽还复来。 + 烹羊宰牛且为乐,会须一饮三百杯。 + 岑夫子,丹丘生,将进酒,杯莫停。 + 与君歌一曲,请君为我倾耳听。 + 钟鼓馔玉不足贵,但愿长醉不复醒。 + 古来圣贤皆寂寞,惟有饮者留其名。 + 陈王昔时宴平乐,斗酒十千恣欢谑。 + 主人何为言少钱,径须沽取对君酌。 + 五花马,千金裘,呼儿将出换美酒,与尔同销万古愁。 + """ + + # 生成音频并保存 + audio_data = tts.generate_audio( + text=text, + output_path="test_submit.mp3", + voice_type=HS_VOICE_TYPE_QINCANG, + speed_ratio=1.0, + volume_ratio=1.0 + ) + + if audio_data: + # 获取音频信息 + info = tts.get_audio_info(audio_data) + print(f"音频生成成功,大小: {info['size_kb']:.2f} KB") + else: + print("音频生成失败") + + +if __name__ == '__main__': + main() diff --git a/dsLightRag/Test/VideoRetalk.py b/dsLightRag/Util/VideoRetalk.py similarity index 100% rename from dsLightRag/Test/VideoRetalk.py rename to dsLightRag/Util/VideoRetalk.py diff --git a/dsLightRag/Util/__pycache__/GengerateAudio.cpython-310.pyc b/dsLightRag/Util/__pycache__/GengerateAudio.cpython-310.pyc new file mode 100644 index 00000000..cb7f31b0 Binary files /dev/null and b/dsLightRag/Util/__pycache__/GengerateAudio.cpython-310.pyc differ diff --git a/dsLightRag/Util/__pycache__/VideoRetalk.cpython-310.pyc b/dsLightRag/Util/__pycache__/VideoRetalk.cpython-310.pyc new file mode 100644 index 00000000..f01c0387 Binary files /dev/null and b/dsLightRag/Util/__pycache__/VideoRetalk.cpython-310.pyc differ diff --git a/dsLightRag/static/text-to-speech.html b/dsLightRag/static/text-to-speech.html new file mode 100644 index 00000000..fa97a7d0 --- /dev/null +++ b/dsLightRag/static/text-to-speech.html @@ -0,0 +1,539 @@ + + +
+ + +选择音色,输入文本,生成高质量语音
+正在生成语音,请稍候...
+暂无生成结果
+