dsProject/dsLightRag/Util/GengerateAudio.py

#coding=utf-8

'''
字节跳动语音合成API封装类
requires Python 3.6 or later
pip install requests
'''
import base64
import json
import uuid
import requests
from typing import Optional, Dict, Any
from pathlib import Path

from Config.Config import HS_APP_ID, HS_ACCESS_TOKEN, HS_CLUSTER_ID, HS_VOICE_TYPE_QINCANG


# The voice catalogue (grouped by usage scenario) is defined as ByteDanceTTS.TTS_VOICES below.

class ByteDanceTTS:
    """
    Wrapper around the ByteDance speech-synthesis (text-to-speech) HTTP API.

    An instance holds the credentials, the default voice and the request
    timeout. ``generate_audio`` posts one synthesis request and returns the
    decoded audio bytes; the remaining methods are small helpers for saving
    audio and browsing the built-in voice catalogue.
    """

    # Timeout (seconds) applied to the HTTP request when none is configured.
    # Without a timeout ``requests`` waits forever on a stalled connection.
    DEFAULT_TIMEOUT = 30.0

    # Voice catalogue: category name -> {voice_type id: display name}.
    TTS_VOICES = {
        "通用场景": {
            "BV700_V2_streaming": "灿灿 2.0",
            "BV705_streaming": "炀炀",
            "BV701_V2_streaming": "擎苍 2.0",
            "BV001_V2_streaming": "通用女声 2.0",
            "BV700_streaming": "灿灿",
            "BV406_V2_streaming": "超自然音色-梓梓2.0",
            "BV406_streaming": "超自然音色-梓梓",
            "BV407_V2_streaming": "超自然音色-燃燃2.0",
            "BV407_streaming": "超自然音色-燃燃",
            "BV001_streaming": "通用女声（12种情感）",
            "BV002_streaming": "通用男声"
        },
        "有声阅读": {
            "BV701_streaming": "擎苍",
            "BV123_streaming": "阳光青年",
            "BV120_streaming": "反卷青年",
            "BV119_streaming": "通用赘婿",
            "BV115_streaming": "古风少御",
            "BV107_streaming": "霸气青叔",
            "BV100_streaming": "质朴青年",
            "BV104_streaming": "温柔淑女",
            "BV004_streaming": "开朗青年",
            "BV113_streaming": "甜宠少御",
            "BV102_streaming": "儒雅青年"
        },
        "智能助手": {
            "BV405_streaming": "甜美小源",
            "BV007_streaming": "亲切女声",
            "BV009_streaming": "知性女声",
            "BV419_streaming": "诚诚",
            "BV415_streaming": "童童",
            "BV008_streaming": "亲切男声"
        },
        "视频配音": {
            "BV408_streaming": "译制片男声",
            "BV426_streaming": "懒小羊",
            "BV428_streaming": "清新文艺女声",
            "BV403_streaming": "鸡汤女声",
            "BV158_streaming": "智慧老者",
            "BV157_streaming": "慈爱姥姥",
            "BR001_streaming": "说唱小哥",
            "BV410_streaming": "活力解说男",
            "BV411_streaming": "影视解说小帅",
            "BV437_streaming": "解说小帅-多情感",
            "BV412_streaming": "影视解说小美",
            "BV159_streaming": "纨绔青年",
            "BV418_streaming": "直播一姐",
            "BV142_streaming": "沉稳解说男",
            "BV143_streaming": "潇洒青年",
            "BV056_streaming": "阳光男声",
            "BV005_streaming": "活泼女声",
            "BV064_streaming": "小萝莉"
        },
        "特色音色": {
            "BV051_streaming": "奶气萌娃",
            "BV063_streaming": "动漫海绵",
            "BV417_streaming": "动漫海星",
            "BV050_streaming": "动漫小新",
            "BV061_streaming": "天才童声"
        },
        "广告配音": {
            "BV401_streaming": "促销男声",
            "BV402_streaming": "促销女声",
            "BV006_streaming": "磁性男声"
        },
        "新闻播报": {
            "BV011_streaming": "新闻女声",
            "BV012_streaming": "新闻男声"
        },
        "教育场景": {
            "BV034_streaming": "知性姐姐-双语",
            "BV033_streaming": "温柔小哥"
        }
    }

    def __init__(self,
                 app_id: Optional[str] = None,
                 access_token: Optional[str] = None,
                 cluster_id: Optional[str] = None,
                 voice_type: Optional[str] = None,
                 timeout: Optional[float] = None):
        """
        Initialise the synthesis client.

        Args:
            app_id: Application ID; falls back to ``HS_APP_ID`` from Config.
            access_token: Access token; falls back to ``HS_ACCESS_TOKEN``.
            cluster_id: Cluster ID; falls back to ``HS_CLUSTER_ID``.
            voice_type: Default voice; falls back to
                ``HS_VOICE_TYPE_QINCANG``.
            timeout: HTTP timeout in seconds; falls back to
                ``DEFAULT_TIMEOUT`` when omitted.
        """
        self.app_id = app_id or HS_APP_ID
        self.access_token = access_token or HS_ACCESS_TOKEN
        self.cluster_id = cluster_id or HS_CLUSTER_ID
        self.voice_type = voice_type or HS_VOICE_TYPE_QINCANG
        self.timeout = self.DEFAULT_TIMEOUT if timeout is None else timeout

        self.host = "openspeech.bytedance.com"
        self.api_url = f"https://{self.host}/api/v1/tts"
        # NOTE: "Bearer" and the token are joined with a semicolon, not a
        # space; this is kept exactly as the original code sends it.
        self.header = {"Authorization": f"Bearer;{self.access_token}"}

    def generate_audio(self,
                       text: str,
                       output_path: Optional[str] = None,
                       voice_type: Optional[str] = None,
                       encoding: str = "mp3",
                       speed_ratio: float = 1.0,
                       volume_ratio: float = 1.0,
                       pitch_ratio: float = 1.0,
                       text_type: str = "plain",
                       operation: str = "query") -> Optional[bytes]:
        """
        Synthesize speech for ``text`` and return the audio bytes.

        Args:
            text: Text to convert. Empty or whitespace-only text is rejected
                locally without contacting the service.
            output_path: If given, the audio is also written to this path.
            voice_type: Voice to use; overrides the instance default.
            encoding: Audio encoding requested from the service.
            speed_ratio: Speaking-rate multiplier.
            volume_ratio: Volume multiplier.
            pitch_ratio: Pitch multiplier.
            text_type: Text type sent to the service.
            operation: Operation type sent to the service.

        Returns:
            The decoded audio bytes, or ``None`` on any failure (empty text,
            HTTP/network error, timeout, or a response without audio data).
        """
        # Nothing to synthesize: avoid a pointless network round-trip.
        if not text or not str(text).strip():
            print("文本内容为空，无法生成音频")
            return None

        # Build the request body.
        request_json = {
            "app": {
                "appid": self.app_id,
                # Literal placeholder, kept exactly as in the original
                # request; the real credential travels in the Authorization
                # header. NOTE(review): confirm the service ignores this field.
                "token": "access_token",
                "cluster": self.cluster_id
            },
            "user": {
                "uid": str(uuid.uuid4())  # random user id per request
            },
            "audio": {
                "voice_type": voice_type or self.voice_type,
                "encoding": encoding,
                "speed_ratio": speed_ratio,
                "volume_ratio": volume_ratio,
                "pitch_ratio": pitch_ratio,
            },
            "request": {
                "reqid": str(uuid.uuid4()),
                "text": text,
                "text_type": text_type,
                "operation": operation,
                "with_frontend": 1,
                "frontend_type": "unitTson"
            }
        }

        try:
            # Send the request; the timeout keeps a stalled connection from
            # blocking the caller indefinitely.
            resp = requests.post(
                self.api_url,
                data=json.dumps(request_json),
                headers=self.header,
                timeout=self.timeout,
            )
            resp.raise_for_status()

            resp_data = resp.json()

            # The audio arrives base64-encoded under "data"; a missing, null
            # or empty value means the service returned no audio.
            encoded_audio = resp_data.get("data") if isinstance(resp_data, dict) else None
            if not encoded_audio:
                print(f"API响应中未包含音频数据: {resp_data}")
                return None

            audio_data = base64.b64decode(encoded_audio)

            # Persist to disk when a path was supplied. A save failure is
            # reported by save_audio but the bytes are still returned.
            if output_path:
                self.save_audio(audio_data, output_path)

            return audio_data

        except requests.exceptions.RequestException as e:
            print(f"请求失败: {e}")
            return None
        except Exception as e:
            # Boundary handler: this method reports failure via None.
            print(f"生成音频失败: {e}")
            return None

    def save_audio(self, audio_data: bytes, output_path: str) -> bool:
        """
        Write audio bytes to a file, creating parent directories as needed.

        Args:
            audio_data: Raw audio bytes.
            output_path: Destination file path.

        Returns:
            ``True`` if the file was written, ``False`` if any error occurred.
        """
        try:
            # Make sure the target directory exists.
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, "wb") as f:
                f.write(audio_data)

            print(f"音频已保存到: {output_path}")
            return True

        except Exception as e:
            print(f"保存音频失败: {e}")
            return False

    def get_audio_info(self, audio_data: bytes, encoding: str = "mp3") -> Dict[str, Any]:
        """
        Describe a chunk of audio data.

        Args:
            audio_data: Raw audio bytes.
            encoding: Format label to report; defaults to ``"mp3"``, the
                default encoding of ``generate_audio``. The bytes are not
                inspected to detect the format.

        Returns:
            A dict with ``size_bytes``, ``size_kb`` and ``format``.
        """
        size_bytes = len(audio_data)
        return {
            "size_bytes": size_bytes,
            "size_kb": size_bytes / 1024,
            "format": encoding
        }

    def get_voices_by_category(self, category: str) -> Dict[str, str]:
        """
        Return the voices of one category.

        Args:
            category: Category name (a key of ``TTS_VOICES``).

        Returns:
            A new dict mapping voice_type id to display name; empty when the
            category is unknown. A copy is returned so callers cannot mutate
            the shared class-level catalogue.
        """
        return dict(self.TTS_VOICES.get(category, {}))

    def get_all_categories(self) -> list:
        """
        Return the names of all voice categories.

        Returns:
            A list of category names.
        """
        return list(self.TTS_VOICES.keys())

    def get_all_voices(self) -> Dict[str, Dict[str, str]]:
        """
        Return the complete voice catalogue.

        Returns:
            A new dict mapping each category name to a new dict of
            voice_type id -> display name (copies, so the shared class-level
            catalogue stays intact).
        """
        return {name: dict(voices) for name, voices in self.TTS_VOICES.items()}


def main():
    """Demo: synthesize a classical poem to ``test_submit.mp3`` and report its size."""
    # Client configured entirely from the Config defaults.
    synthesizer = ByteDanceTTS()

    # Text to be spoken.
    poem = """
    君不见，黄河之水天上来，奔流到海不复回。
    君不见，高堂明镜悲白发，朝如青丝暮成雪。
    人生得意须尽欢，莫使金樽空对月。
    天生我材必有用，千金散尽还复来。
    烹羊宰牛且为乐，会须一饮三百杯。
    岑夫子，丹丘生，将进酒，杯莫停。
    与君歌一曲，请君为我倾耳听。
    钟鼓馔玉不足贵，但愿长醉不复醒。
    古来圣贤皆寂寞，惟有饮者留其名。
    陈王昔时宴平乐，斗酒十千恣欢谑。
    主人何为言少钱，径须沽取对君酌。
    五花马，千金裘，呼儿将出换美酒，与尔同销万古愁。
    """

    # Synthesize and write the result to disk in one call.
    synthesis_options = {
        "text": poem,
        "output_path": "test_submit.mp3",
        "voice_type": HS_VOICE_TYPE_QINCANG,
        "speed_ratio": 1.0,
        "volume_ratio": 1.0,
    }
    audio_bytes = synthesizer.generate_audio(**synthesis_options)

    # Guard clause: nothing usable came back.
    if not audio_bytes:
        print("音频生成失败")
        return

    # Report the size of the generated audio.
    details = synthesizer.get_audio_info(audio_bytes)
    print(f"音频生成成功，大小: {details['size_kb']:.2f} KB")


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()