Dify TTS deployment: GPT-SoVITS

Model provider to use: OpenAI-API-compatible
Reason: it can connect directly to a locally deployed TTS service.

References:
1. GPT-SoVITS official README
https://github.com/RVC-Boss/GPT-SoVITS/tree/main
2. GPT-sovits-2-OpenAI
https://github.com/RedwindA/GPT-sovits-2-OpenAI/tree/main

Two deployment approaches:
1. Run GPT-SoVITS's api_v2.py as the backend, with GPT-sovits-2-OpenAI as the OpenAI compatibility layer. The app.py from GPT-sovits-2-OpenAI has been modified below so that it matches the api_v2.py interface.
from flask import Flask, request, send_file, jsonify
import requests
from pydub import AudioSegment
import io
import os
import json
import yaml

app = Flask(__name__)

# Get API_KEY from environment variable
API_KEY = os.environ.get('API_KEY')

# Get BACKEND_URL from environment variable or use default
BACKEND_URL = os.environ.get('BACKEND_URL', 'http://127.0.0.1:9880')

# Load YAML configuration file
def load_voice_config():
    try:
        with open('config.yaml', 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
        voices = config.get('voices', {})
        voice_mapping = {
            voice: voice_data['models']
            for voice, voice_data in voices.items()
        }
        refer_mapping = {
            voice: voice_data['refer']
            for voice, voice_data in voices.items()
        }
        return voice_mapping, refer_mapping
    except Exception as e:
        print(f"Error loading config.yaml: {e}")
        return {}, {}

# Replace original environment variable configuration
VOICE_MAPPING, REFER_MAPPING = load_voice_config()

# Get other parameters from environment variables or use default values
TEXT_LANGUAGE = os.environ.get('TEXT_LANGUAGE', 'zh')
TOP_K = int(os.environ.get('TOP_K', 15))
TOP_P = float(os.environ.get('TOP_P', 1))
TEMPERATURE = float(os.environ.get('TEMPERATURE', 0.45))
SPEED = float(os.environ.get('SPEED', 0.95))

# Print all parameters in one line for debugging
print(f"BACKEND_URL: {BACKEND_URL}, TEXT_LANGUAGE: {TEXT_LANGUAGE}, TOP_K: {TOP_K}, TOP_P: {TOP_P}, "
      f"TEMPERATURE: {TEMPERATURE}, SPEED: {SPEED}, VOICE_MAPPING: {VOICE_MAPPING}")

@app.route('/v1/audio/speech', methods=['POST'])
def convert_tts():
    # Check API key if it's set in environment
    if API_KEY:
        auth_header = request.headers.get('Authorization')
        if not auth_header:
            return "Missing Authorization header", 401
        # Check if header starts with "Bearer "
        if not auth_header.startswith('Bearer '):
            return "Invalid Authorization header format", 401
        # Extract and verify API key
        provided_key = auth_header.split(' ')[1]
        if provided_key != API_KEY:
            return "Invalid API key", 401

    # Extract 'input' and 'voice' fields from the OpenAI-style request
    openai_data = request.json
    text = openai_data.get('input')
    voice = openai_data.get('voice')

    # Get model paths and reference data from the mappings for the requested voice
    voice_config = VOICE_MAPPING.get(voice)
    refer_config = REFER_MAPPING.get(voice)
    if not voice_config:
        return f"Voice '{voice}' is not supported", 400

    gpt_model_path = voice_config.get('gpt_model_path')
    sovits_model_path = voice_config.get('sovits_model_path')
    refer_wav_path = refer_config.get('refer_wav_path')
    prompt_text = refer_config.get('prompt_text')
    if not gpt_model_path or not sovits_model_path:
        return f"Model paths for voice '{voice}' are missing", 500
    if not refer_wav_path or not prompt_text:
        return f"Refer config for voice '{voice}' is missing", 500

    # Step 1: Set the GPT model in the backend
    set_gpt_response = requests.get(f"{BACKEND_URL}/set_gpt_weights", params={"weights_path": gpt_model_path})
    if set_gpt_response.status_code != 200:
        return f"Backend failed to set GPT model: {set_gpt_response.text}", set_gpt_response.status_code

    # Step 2: Set the Sovits model in the backend
    set_sovits_response = requests.get(f"{BACKEND_URL}/set_sovits_weights", params={"weights_path": sovits_model_path})
    if set_sovits_response.status_code != 200:
        return f"Backend failed to set Sovits model: {set_sovits_response.text}", set_sovits_response.status_code

    # Step 3: Send text-to-speech request to the backend
    backend_payload = {
        "text": text,
        "text_lang": TEXT_LANGUAGE,
        "ref_audio_path": refer_wav_path,
        "prompt_text": prompt_text,
        "prompt_lang": TEXT_LANGUAGE,
        "top_k": TOP_K,
        "top_p": TOP_P,
        "temperature": TEMPERATURE,
        "speed_factor": SPEED,
        "text_split_method": "cut5",
        "batch_size": 1,
        "media_type": "wav"
    }
    backend_response = requests.post(f"{BACKEND_URL}/tts", json=backend_payload)

    # Check if the backend response is successful
    if backend_response.status_code != 200:
        return f"Backend service error: {backend_response.text}", backend_response.status_code

    # Step 4: Convert returned WAV file to MP3
    wav_audio = io.BytesIO(backend_response.content)
    audio = AudioSegment.from_wav(wav_audio)
    mp3_audio = io.BytesIO()
    audio.export(mp3_audio, format="mp3")
    mp3_audio.seek(0)

    # Return MP3 file
    return send_file(mp3_audio, mimetype='audio/mp3', as_attachment=True, download_name='speech.mp3')

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
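To bring up approach 1, start api_v2.py first, then the compatibility layer. A minimal launch sketch, assuming the script above is saved as app.py (filename is an assumption) and api_v2.py runs with its defaults, which match the BACKEND_URL default of port 9880 above:

python api_v2.py
API_KEY=your-api-key BACKEND_URL=http://127.0.0.1:9880 python app.py

API_KEY and BACKEND_URL are the environment variables read at the top of app.py; the layer itself listens on port 5000 and exposes POST /v1/audio/speech.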
2. The second approach merges the backend and the compatibility layer into a single service, avoiding the need to run two services at once.
import os
import sys
import traceback
import signal
import argparse
import subprocess
import wave
from typing import Generator
from io import BytesIO

import numpy as np
import soundfile as sf
import yaml
from fastapi import FastAPI, Response, HTTPException, Header
from fastapi.responses import StreamingResponse, JSONResponse
import uvicorn
from pydantic import BaseModel

# Add project paths
now_dir = os.getcwd()
sys.path.append(now_dir)
sys.path.append("%s/GPT_SoVITS" % (now_dir))

from tools.i18n.i18n import I18nAuto
from GPT_SoVITS.TTS_infer_pack.TTS import TTS, TTS_Config
from GPT_SoVITS.TTS_infer_pack.text_segmentation_method import get_method_names as get_cut_method_names

# Initialization
i18n = I18nAuto()
cut_method_names = get_cut_method_names()

# Command-line argument parsing
parser = argparse.ArgumentParser(description="GPT-SoVITS api with OpenAI compatibility")
parser.add_argument("-c", "--tts_config", type=str, default="GPT_SoVITS/configs/tts_infer.yaml", help="path to the tts_infer config")
parser.add_argument("-a", "--bind_addr", type=str, default="0.0.0.0", help="default: 0.0.0.0")
parser.add_argument("-p", "--port", type=int, default=9880, help="default: 9880")
parser.add_argument("--api_key", type=str, default=None, help="API key for authentication")
parser.add_argument("--config_path", type=str, default="config.yaml", help="Path to voice config YAML file")
args = parser.parse_args()

config_path = args.tts_config
port = args.port
host = args.bind_addr
API_KEY = args.api_key
CONFIG_PATH = args.config_path

# Load voice configuration
def load_voice_config():
    try:
        with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
        voices = config.get('voices', {})
        voice_mapping = {
            voice: voice_data.get('models', {})
            for voice, voice_data in voices.items()
        }
        refer_mapping = {
            voice: voice_data.get('refer', {})
            for voice, voice_data in voices.items()
        }
        return voice_mapping, refer_mapping
    except Exception as e:
        print(f"Error loading config.yaml: {e}")
        return {}, {}

VOICE_MAPPING, REFER_MAPPING = load_voice_config()

# Default parameters
TEXT_LANGUAGE = "zh"
TOP_K = 15
TOP_P = 1.0
TEMPERATURE = 0.45
SPEED = 0.95

print(f"Voice mapping loaded: {list(VOICE_MAPPING.keys())}")

# Initialize TTS
if config_path in [None, ""]:
    config_path = "GPT_SoVITS/configs/tts_infer.yaml"

tts_config = TTS_Config(config_path)
print(tts_config)
tts_pipeline = TTS(tts_config)

# Create the FastAPI app
APP = FastAPI(title="GPT-SoVITS TTS API", description="Text-to-Speech with OpenAI compatibility")

# Request models
class TTS_Request(BaseModel):
    text: str = None
    text_lang: str = None
    ref_audio_path: str = None
    aux_ref_audio_paths: list = None
    prompt_lang: str = None
    prompt_text: str = ""
    top_k: int = 5
    top_p: float = 1
    temperature: float = 1
    text_split_method: str = "cut5"
    batch_size: int = 1
    batch_threshold: float = 0.75
    split_bucket: bool = True
    speed_factor: float = 1.0
    fragment_interval: float = 0.3
    seed: int = -1
    media_type: str = "wav"
    streaming_mode: bool = True
    parallel_infer: bool = True
    repetition_penalty: float = 1.35
    sample_steps: int = 32
    super_sampling: bool = False

class OpenAITTSRequest(BaseModel):
    input: str
    voice: str
    model: str = "tts-1"

# Audio packing helpers
def pack_ogg(io_buffer: BytesIO, data: np.ndarray, rate: int):
    with sf.SoundFile(io_buffer, mode="w", samplerate=rate, channels=1, format="ogg") as audio_file:
        audio_file.write(data)
    return io_buffer

def pack_raw(io_buffer: BytesIO, data: np.ndarray, rate: int):
    io_buffer.write(data.tobytes())
    return io_buffer

def pack_wav(io_buffer: BytesIO, data: np.ndarray, rate: int):
    io_buffer = BytesIO()
    sf.write(io_buffer, data, rate, format="wav")
    return io_buffer

def pack_aac(io_buffer: BytesIO, data: np.ndarray, rate: int):
    process = subprocess.Popen(
        [
            "ffmpeg",
            "-f", "s16le",
            "-ar", str(rate),
            "-ac", "1",
            "-i", "pipe:0",
            "-c:a", "aac",
            "-b:a", "192k",
            "-vn",
            "-f", "adts",
            "pipe:1",
        ],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, _ = process.communicate(input=data.tobytes())
    io_buffer.write(out)
    return io_buffer

def pack_audio(io_buffer: BytesIO, data: np.ndarray, rate: int, media_type: str):
    if media_type == "ogg":
        io_buffer = pack_ogg(io_buffer, data, rate)
    elif media_type == "aac":
        io_buffer = pack_aac(io_buffer, data, rate)
    elif media_type == "wav":
        io_buffer = pack_wav(io_buffer, data, rate)
    else:
        io_buffer = pack_raw(io_buffer, data, rate)
    io_buffer.seek(0)
    return io_buffer

def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=32000):
    wav_buf = BytesIO()
    with wave.open(wav_buf, "wb") as vfout:
        vfout.setnchannels(channels)
        vfout.setsampwidth(sample_width)
        vfout.setframerate(sample_rate)
        vfout.writeframes(frame_input)
    wav_buf.seek(0)
    return wav_buf.read()

# Control command handling
def handle_control(command: str):
    if command == "restart":
        os.execl(sys.executable, sys.executable, *sys.argv)
    elif command == "exit":
        os.kill(os.getpid(), signal.SIGTERM)
        exit(0)

# Parameter checks
def check_params(req: dict):
    text: str = req.get("text", "")
    text_lang: str = req.get("text_lang", "")
    ref_audio_path: str = req.get("ref_audio_path", "")
    streaming_mode: bool = req.get("streaming_mode", False)
    media_type: str = req.get("media_type", "wav")
    prompt_lang: str = req.get("prompt_lang", "")
    text_split_method: str = req.get("text_split_method", "cut5")
    if ref_audio_path in [None, ""]:
        return JSONResponse(status_code=400, content={"message": "ref_audio_path is required"})
    if text in [None, ""]:
        return JSONResponse(status_code=400, content={"message": "text is required"})
    if text_lang in [None, ""]:
        return JSONResponse(status_code=400, content={"message": "text_lang is required"})
    elif text_lang.lower() not in tts_config.languages:
        return JSONResponse(
            status_code=400,
            content={"message": f"text_lang: {text_lang} is not supported in version {tts_config.version}"},
        )
    if prompt_lang in [None, ""]:
        return JSONResponse(status_code=400, content={"message": "prompt_lang is required"})
    elif prompt_lang.lower() not in tts_config.languages:
        return JSONResponse(
            status_code=400,
            content={"message": f"prompt_lang: {prompt_lang} is not supported in version {tts_config.version}"},
        )
    if media_type not in ["wav", "raw", "ogg", "aac"]:
        return JSONResponse(status_code=400, content={"message": f"media_type: {media_type} is not supported"})
    elif media_type == "ogg" and not streaming_mode:
        return JSONResponse(status_code=400, content={"message": "ogg format is not supported in non-streaming mode"})
    if text_split_method not in cut_method_names:
        return JSONResponse(status_code=400, content={"message": f"text_split_method:{text_split_method} is not supported"})
    return None

# API key verification
async def verify_api_key(authorization: str = Header(None)):
    if API_KEY:
        if not authorization:
            raise HTTPException(status_code=401, detail="Missing Authorization header")
        if not authorization.startswith("Bearer "):
            raise HTTPException(status_code=401, detail="Invalid Authorization header format")
        provided_key = authorization.split(" ")[1]
        if provided_key != API_KEY:
            raise HTTPException(status_code=401, detail="Invalid API key")

# Core TTS handler
async def tts_handle(req: dict):
    streaming_mode = req.get("streaming_mode", False)
    return_fragment = req.get("return_fragment", False)
    media_type = req.get("media_type", "wav")

    check_res = check_params(req)
    if check_res is not None:
        return check_res

    if streaming_mode or return_fragment:
        req["return_fragment"] = True

    try:
        tts_generator = tts_pipeline.run(req)
        if streaming_mode:
            def streaming_generator(tts_generator: Generator, media_type: str):
                is_first_chunk = True
                for sr, chunk in tts_generator:
                    if is_first_chunk and media_type == "wav":
                        yield wave_header_chunk(sample_rate=sr)
                        media_type = "raw"
                        is_first_chunk = False
                    yield pack_audio(BytesIO(), chunk, sr, media_type).getvalue()

            return StreamingResponse(
                streaming_generator(tts_generator, media_type),
                media_type=f"audio/{media_type}",
            )
        else:
            sr, audio_data = next(tts_generator)
            audio_data = pack_audio(BytesIO(), audio_data, sr, media_type).getvalue()
            return Response(audio_data, media_type=f"audio/{media_type}")
    except Exception as e:
        return JSONResponse(status_code=400, content={"message": "tts failed", "Exception": str(e)})

# ==================== Route definitions ====================

# Control endpoint
@APP.get("/control")
async def control(command: str = None):
    if command is None:
        return JSONResponse(status_code=400, content={"message": "command is required"})
    handle_control(command)

# Native TTS GET endpoint
@APP.get("/tts")
async def tts_get_endpoint(
    text: str = None,
    text_lang: str = None,
    ref_audio_path: str = None,
    aux_ref_audio_paths: list = None,
    prompt_lang: str = None,
    prompt_text: str = "",
    top_k: int = 5,
    top_p: float = 1,
    temperature: float = 1,
    text_split_method: str = "cut0",
    batch_size: int = 1,
    batch_threshold: float = 0.75,
    split_bucket: bool = True,
    speed_factor: float = 1.0,
    fragment_interval: float = 0.3,
    seed: int = -1,
    media_type: str = "wav",
    streaming_mode: bool = False,
    parallel_infer: bool = True,
    repetition_penalty: float = 1.35,
    sample_steps: int = 32,
    super_sampling: bool = False,
):
    req = {
        "text": text,
        "text_lang": text_lang.lower() if text_lang else "",
        "ref_audio_path": ref_audio_path,
        "aux_ref_audio_paths": aux_ref_audio_paths,
        "prompt_text": prompt_text,
        "prompt_lang": prompt_lang.lower() if prompt_lang else "",
        "top_k": top_k,
        "top_p": top_p,
        "temperature": temperature,
        "text_split_method": text_split_method,
        "batch_size": int(batch_size),
        "batch_threshold": float(batch_threshold),
        "speed_factor": float(speed_factor),
        "split_bucket": split_bucket,
        "fragment_interval": fragment_interval,
        "seed": seed,
        "media_type": media_type,
        "streaming_mode": streaming_mode,
        "parallel_infer": parallel_infer,
        "repetition_penalty": float(repetition_penalty),
        "sample_steps": int(sample_steps),
        "super_sampling": super_sampling,
    }
    return await tts_handle(req)

# Native TTS POST endpoint
@APP.post("/tts")
async def tts_post_endpoint(request: TTS_Request):
    req = request.dict()
    return await tts_handle(req)

# OpenAI-compatible endpoint
@APP.post("/v1/audio/speech")
async def openai_tts_endpoint(request: OpenAITTSRequest, authorization: str = Header(None)):
    # Verify API key
    await verify_api_key(authorization)

    text = request.input
    voice = request.voice

    # Look up the voice configuration
    voice_config = VOICE_MAPPING.get(voice)
    refer_config = REFER_MAPPING.get(voice)
    if not voice_config:
        raise HTTPException(status_code=400, detail=f"Voice '{voice}' is not supported")

    gpt_model_path = voice_config.get('gpt_model_path')
    sovits_model_path = voice_config.get('sovits_model_path')
    refer_wav_path = refer_config.get('refer_wav_path')
    prompt_text = refer_config.get('prompt_text')
    if not gpt_model_path or not sovits_model_path:
        raise HTTPException(status_code=500, detail=f"Model paths for voice '{voice}' are missing")
    if not refer_wav_path or not prompt_text:
        raise HTTPException(status_code=500, detail=f"Refer config for voice '{voice}' is missing")

    try:
        # Set the GPT model
        if gpt_model_path:
            tts_pipeline.init_t2s_weights(gpt_model_path)
        # Set the Sovits model
        if sovits_model_path:
            tts_pipeline.init_vits_weights(sovits_model_path)

        # Prepare the TTS request
        tts_req = {
            "text": text,
            "text_lang": TEXT_LANGUAGE,
            "ref_audio_path": refer_wav_path,
            "prompt_text": prompt_text,
            "prompt_lang": TEXT_LANGUAGE,
            "top_k": TOP_K,
            "top_p": TOP_P,
            "temperature": TEMPERATURE,
            "speed_factor": SPEED,
            "text_split_method": "cut5",
            "batch_size": 1,
            "media_type": "wav",
            "streaming_mode": False
        }

        # Run TTS
        tts_generator = tts_pipeline.run(tts_req)
        sr, audio_data = next(tts_generator)

        # Convert to MP3 (MP3 output via soundfile requires libsndfile >= 1.1.0)
        mp3_buffer = BytesIO()
        with sf.SoundFile(mp3_buffer, mode='w', format='mp3', samplerate=sr, channels=1) as f:
            f.write(audio_data)
        mp3_buffer.seek(0)

        return Response(mp3_buffer.getvalue(), media_type="audio/mpeg")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"TTS generation failed: {str(e)}")

# Set reference audio
@APP.get("/set_refer_audio")
async def set_refer_audio(refer_audio_path: str = None):
    try:
        tts_pipeline.set_ref_audio(refer_audio_path)
    except Exception as e:
        return JSONResponse(status_code=400, content={"message": "set refer audio failed", "Exception": str(e)})
    return JSONResponse(status_code=200, content={"message": "success"})

# Set GPT weights
@APP.get("/set_gpt_weights")
async def set_gpt_weights(weights_path: str = None):
    try:
        if weights_path in ["", None]:
            return JSONResponse(status_code=400, content={"message": "gpt weight path is required"})
        tts_pipeline.init_t2s_weights(weights_path)
    except Exception as e:
        return JSONResponse(status_code=400, content={"message": "change gpt weight failed", "Exception": str(e)})
    return JSONResponse(status_code=200, content={"message": "success"})

# Set Sovits weights
@APP.get("/set_sovits_weights")
async def set_sovits_weights(weights_path: str = None):
    try:
        if weights_path in ["", None]:
            return JSONResponse(status_code=400, content={"message": "sovits weight path is required"})
        tts_pipeline.init_vits_weights(weights_path)
    except Exception as e:
        return JSONResponse(status_code=400, content={"message": "change sovits weight failed", "Exception": str(e)})
    return JSONResponse(status_code=200, content={"message": "success"})

# Health check
@APP.get("/health")
async def health_check():
    return {"status": "healthy", "service": "GPT-SoVITS TTS API"}

# Main entry point
if __name__ == "__main__":
    try:
        if host == "None":
            host = None
        print(f"Starting server on {host}:{port}")
        print(f"Available voices: {list(VOICE_MAPPING.keys())}")
        uvicorn.run(app=APP, host=host, port=port, workers=1)
    except Exception:
        traceback.print_exc()
        os.kill(os.getpid(), signal.SIGTERM)
        exit(0)
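A minimal launch sketch for the merged service; the filename api_openai.py is an assumption (save the script in the GPT-SoVITS project root so the sys.path lines and the relative model paths resolve):

python api_openai.py -a 0.0.0.0 -p 9880 --api_key your-api-key --config_path config.yaml

All flags mirror the argparse definitions above; omit --api_key to disable authentication.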
Both approaches require deploying a config.yaml; see GPT-sovits-2-OpenAI for details.
voices:
  alloy:
    models:
      gpt_model_path: "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
      sovits_model_path: "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"
    refer:
      refer_wav_path: "path to the reference audio"
      prompt_text: "transcript of the reference audio"
  echo:
    models:
      gpt_model_path: "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s1bert25hz-5kh-longer-epoch=12-step=369668.ckpt"
      sovits_model_path: "GPT_SoVITS/pretrained_models/gsv-v2final-pretrained/s2G2333k.pth"
    refer:
      refer_wav_path: "path to the reference audio"
      prompt_text: "transcript of the reference audio"
A client for testing:
from openai import OpenAI

# Initialize the client, pointing at our local service.
# Port 5000 for approach 1 (Flask layer), 9880 for approach 2 (merged service).
client = OpenAI(
    base_url="http://xxxx:5000/v1",  # points at the service
    api_key="your-api-key"           # required if the server has API_KEY set
)

def generate_speech(text, voice="alloy", output_file="speech.mp3"):
    """Generate speech through the OpenAI-format API (compatible with the latest SDK)."""
    try:
        # Use a streaming-capable response
        response = client.audio.speech.create(
            model="tts-1",
            voice=voice,
            input=text,
        )
        # Write the audio bytes directly to a file
        with open(output_file, 'wb') as f:
            for chunk in response.iter_bytes():
                f.write(chunk)
        print(f"Speech file saved: {output_file}")
        return True
    except Exception as e:
        print(f"Speech generation failed: {e}")
        return False

# Usage example
if __name__ == "__main__":
    # Chinese sample text, since the server's TEXT_LANGUAGE defaults to zh
    generate_speech(
        text="你好,这是一个使用最新OpenAI库的文本转语音测试。",
        voice="alloy",
        output_file="latest_speech.mp3"
    )
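For a quick sanity check without the SDK, the same endpoint can be exercised with curl (host, port, and key are placeholders; use port 9880 for approach 2):

curl -X POST http://your_ip:5000/v1/audio/speech \
  -H "Authorization: Bearer your-api-key" \
  -H "Content-Type: application/json" \
  -d '{"model": "tts-1", "voice": "alloy", "input": "你好"}' \
  --output speech.mp3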
Connecting to Dify
The model name can be anything, and the API key can be anything (unless the server was started with an API_KEY, in which case it must match). For the API endpoint URL, enter:
http://your_ip:5000/
Note: do not include v1 here; the path is already concatenated on the backend.
After completing the steps above, click Done to finish.
