菏泽建设网站如何做好品牌宣传
字节火山引擎-大模型声音复刻,流式语音合成接口
-
参考文档:火山引擎-大模型声音复刻文档
-
官网给出的示例代码有 bug,下面的代码已做修正。
创建应用
声音复刻大模型页面查看应用,获取接口调用需要的参数
注意:调用 TTS 接口时需要以下三个参数:
- APP ID => 获取到对应的应用
- Access Token => 鉴权需要
- 声音ID => 语音合成需要
HTTP 合成音频接口示例代码(operation 为 query,一次请求返回完整音频)
package mainimport ("bytes""encoding/base64""encoding/json""errors""fmt""github.com/gin-gonic/gin""github.com/google/uuid""io/ioutil""net/http""time"
)var (app_id = "8678693223"clusterId = "volcano_icl"SpeakId = "S_v7xollyj1"BearerToken = "-50OZ81pPKpn8pRZEgNrxd0wCELJJMIN"resource_id = "volc.tts_async.emotion"
)
var durationTime time.Duration// TTSServResponse response from backend services
type TTSServResponse struct {ReqID string `json:"reqid"`Code int `json:"code"`Message string `json:"Message"`Operation string `json:"operation"`Sequence int `json:"sequence"`Data string `json:"data"`
}func httpPost(url string, headers map[string]string, body []byte, timeout time.Duration) ([]byte, error) {client := &http.Client{Timeout: timeout,}req, err := http.NewRequest(http.MethodPost, url, bytes.NewBuffer(body))if err != nil {return nil, err}for key, value := range headers {req.Header.Set(key, value)}resp, err := client.Do(req)if err != nil {return nil, err}defer resp.Body.Close()retBody, err := ioutil.ReadAll(resp.Body)if err != nil {return nil, err}return retBody, err
}func synthesis(text string) ([]byte, error) {// 记录合成开始时间startTime := time.Now()reqID := uuid.NewString()params := make(map[string]map[string]interface{})params["app"] = make(map[string]interface{})params["app"]["appid"] = app_idparams["app"]["token"] = "access_token"params["app"]["cluster"] = clusterIdparams["user"] = make(map[string]interface{})params["user"]["uid"] = "uid"params["audio"] = make(map[string]interface{})params["audio"]["voice_type"] = SpeakIdparams["audio"]["encoding"] = "wav"params["audio"]["speed_ratio"] = 1.0params["audio"]["volume_ratio"] = 1.0params["audio"]["pitch_ratio"] = 1.0params["request"] = make(map[string]interface{})params["request"]["reqid"] = reqIDparams["request"]["text"] = textparams["request"]["text_type"] = "plain"params["request"]["operation"] = "query"headers := make(map[string]string)headers["Content-Type"] = "application/json"headers["Authorization"] = fmt.Sprintf("Bearer;%s", BearerToken)url := "https://openspeech.bytedance.com/api/v1/tts"timeo := 30 * time.SecondbodyStr, _ := json.Marshal(params)synResp, err := httpPost(url, headers, []byte(bodyStr), timeo)if err != nil {fmt.Printf("http post fail [err:%s]\n", err.Error())return nil, err}fmt.Printf("resp body:%s\n", synResp)var respJSON TTSServResponseerr = json.Unmarshal(synResp, &respJSON)if err != nil {fmt.Printf("unmarshal response fail [err:%s]\n", err.Error())return nil, err}codeMessages := map[int]string{3001: "无效的请求,请检查参数",3003: "并发超限,请稍后重试",3005: "后端服务忙,请稍后重试",3006: "服务中断,请检查参数",3010: "文本长度超限,请检查文本长度",3011: "无效文本,请检查文本内容",3030: "处理超时,请重试或检查文本",3031: "处理错误,后端出现异常,请重试",3032: "等待获取音频超时,请重试",3040: "后端链路连接错误,请重试",3050: "音色不存在,请检查voice_type代号",}code := respJSON.Codeif code != 3000 {fmt.Printf("code fail [code:%d]\n", code)message, exists := codeMessages[respJSON.Code]if !exists {message = "未知错误,请重试"}return nil, errors.New(message)}audio, _ := base64.StdEncoding.DecodeString(respJSON.Data)// 记录合成结束时间endTime := time.Now()durationTime = endTime.Sub(startTime)// 
打印合成时间fmt.Printf("音频合成时间: %s\n", durationTime)return audio, nil
}// Handle TTS synthesis via Gin
func handleTTS(c *gin.Context) {var input map[string]stringif err := c.ShouldBindJSON(&input); err != nil {c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid input"})return}text, exists := input["text"]if !exists || text == "" {c.JSON(http.StatusBadRequest, gin.H{"error": "Text is required"})return}audio, err := synthesis(text)if err != nil {c.JSON(http.StatusInternalServerError, gin.H{"error": "Synthesis failed", "details": err.Error()})return}// Return audio in responsec.Data(http.StatusOK, "audio/wav", audio)
}func main() {r := gin.Default()// POST request to synthesize text to speechr.POST("/synthesize", handleTTS)// Start the serverr.Run(":8080")
}
WebSocket 流式合成音频接口示例代码
package mainimport ("bytes""compress/gzip""encoding/binary""encoding/json""errors""fmt""github.com/gin-gonic/gin""github.com/gorilla/websocket""github.com/satori/go.uuid""io/ioutil""net/http""net/url""time"
)var (enumMessageType = map[byte]string{11: "audio-only server response",12: "frontend server response",15: "error message from server",}enumMessageTypeSpecificFlags = map[byte]string{0: "no sequence number",1: "sequence number > 0",2: "last message from server (seq < 0)",3: "sequence number < 0",}enumMessageSerializationMethods = map[byte]string{0: "no serialization",1: "JSON",15: "custom type",}enumMessageCompression = map[byte]string{0: "no compression",1: "gzip",15: "custom compression method",}
)const (optQuery string = "query"optSubmit string = "submit"
)var duration time.Duration
var addr = "openspeech.bytedance.com"
var ZijieWs_Url = url.URL{Scheme: "wss", Host: addr, Path: "/api/v1/tts/ws_binary"}// 保存appid和token的映射关系
var appTokenMap = map[string]string{"8678693223": "-50OZ81pPKpn8pRZEgNrxd0wCELJJMIN1ww", //胡桃"8724961923": "Ut0tYdEdwVvHgSzeV9gdXEN8EGXSlqBesdd", // 郭德纲"5046524282": "xRcD5NYAo0BMkMajm9JT3XAigKtRUBCOddd", // 钟离
}type synResp struct {Audio []byteIsLast bool
}// version: b0001 (4 bits)
// header size: b0001 (4 bits)
// message type: b0001 (Full client request) (4bits)
// message type specific flags: b0000 (none) (4bits)
// message serialization method: b0001 (JSON) (4 bits)
// message compression: b0001 (gzip) (4bits)
// reserved data: 0x00 (1 byte)
var defaultHeader = []byte{0x11, 0x10, 0x11, 0x00}func setupInput(text, voiceType, appid, opt string) []byte {var err errorreqID := uuid.Must(uuid.NewV4(), err).String()params := make(map[string]map[string]interface{})params["app"] = make(map[string]interface{})//平台上查看具体appidparams["app"]["appid"] = appidparams["app"]["token"] = "access_token"//平台上查看具体集群名称params["app"]["cluster"] = "volcano_icl"params["user"] = make(map[string]interface{})params["user"]["uid"] = "uid"params["audio"] = make(map[string]interface{})params["audio"]["voice_type"] = voiceTypeparams["audio"]["encoding"] = "mp3" // 设为 MP3 格式params["audio"]["sample_rate"] = 24000 // 设为 24kHz 采样率params["audio"]["speed_ratio"] = 1.0params["audio"]["volume_ratio"] = 1.0params["audio"]["pitch_ratio"] = 1.0params["request"] = make(map[string]interface{})params["request"]["reqid"] = reqIDparams["request"]["text"] = textparams["request"]["text_type"] = "plain"params["request"]["operation"] = optresStr, _ := json.Marshal(params)return resStr
}func gzipCompress(input []byte) []byte {var b bytes.Bufferw := gzip.NewWriter(&b)w.Write(input)w.Close()return b.Bytes()
}func gzipDecompress(input []byte) []byte {b := bytes.NewBuffer(input)r, _ := gzip.NewReader(b)out, _ := ioutil.ReadAll(r)r.Close()return out
}func parseResponse(res []byte) (resp synResp, err error) {protoVersion := res[0] >> 4headSize := res[0] & 0x0fmessageType := res[1] >> 4messageTypeSpecificFlags := res[1] & 0x0fserializationMethod := res[2] >> 4messageCompression := res[2] & 0x0freserve := res[3]headerExtensions := res[4 : headSize*4]payload := res[headSize*4:]fmt.Printf(" Protocol version: %x - version %d\n",protoVersion, protoVersion)fmt.Printf(" Header size: %x - %d bytes\n",headSize, headSize*4)fmt.Printf(" Message type: %x - %s\n", messageType,enumMessageType[messageType])fmt.Printf(" Message type specific flags: %x - %s\n", messageTypeSpecificFlags,enumMessageTypeSpecificFlags[messageTypeSpecificFlags])fmt.Printf("Message serialization method: %x - %s\n",serializationMethod, enumMessageSerializationMethods[serializationMethod])fmt.Printf(" Message compression: %x - %s\n",messageCompression, enumMessageCompression[messageCompression])fmt.Printf(" Reserved: %d\n", reserve)if headSize != 1 {fmt.Printf(" Header extensions: %s\n",headerExtensions)}// audio-only server responseif messageType == 0xb {// no sequence number as ACKif messageTypeSpecificFlags == 0 {fmt.Println(" Payload size: 0")} else {sequenceNumber := int32(binary.BigEndian.Uint32(payload[0:4]))payloadSize := int32(binary.BigEndian.Uint32(payload[4:8]))payload = payload[8:]resp.Audio = append(resp.Audio, payload...)fmt.Printf(" Sequence number: %d\n",sequenceNumber)fmt.Printf(" Payload size: %d\n", payloadSize)if sequenceNumber < 0 {resp.IsLast = true}}} else if messageType == 0xf {code := int32(binary.BigEndian.Uint32(payload[0:4]))errMsg := payload[8:]if messageCompression == 1 {errMsg = gzipDecompress(errMsg)}fmt.Printf(" Error code: %d\n", code)fmt.Printf(" Error msg: %s\n", string(errMsg))err = errors.New(string(errMsg))return} else if messageType == 0xc {var msgSize int32msgSize = int32(binary.BigEndian.Uint32(payload[0:4]))fmt.Println(msgSize)payload = payload[4:]if messageCompression == 1 {payload = 
gzipDecompress(payload)}fmt.Printf(" Frontend message: %s\n", string(payload))} else {fmt.Printf(" wrong message type:%d\n", messageType)err = errors.New("wrong message type")return}return
}// 流式合成
func DouBaoAudioStreamSynth(text, voiceType, appid string) ([]byte, time.Duration, error) {// 记录合成开始时间startTime := time.Now()// 从appTokenMap获取对应的tokentoken, exists := appTokenMap[appid]if !exists {return nil, 0, errors.New("invalid appid")}//鉴权使用var header = http.Header{"Authorization": []string{fmt.Sprintf("Bearer;%s", token)}}input := setupInput(text, voiceType, appid, optSubmit)input = gzipCompress(input)payloadSize := len(input)payloadArr := make([]byte, 4)binary.BigEndian.PutUint32(payloadArr, uint32(payloadSize))clientRequest := append(defaultHeader, payloadArr...)clientRequest = append(clientRequest, input...)// websocket连接到字节服务器c, _, err := websocket.DefaultDialer.Dial(ZijieWs_Url.String(), header)if err != nil {return nil, 0, err}defer c.Close()//c.WriteMessage(websocket.TextMessage, []byte("已连接至字节服务器..."))// 连接成功后提醒fmt.Println("Successfully connected to the byte server.")err = c.WriteMessage(websocket.BinaryMessage, clientRequest)if err != nil {return nil, 0, err}var audio []bytefor {_, message, err := c.ReadMessage()if err != nil {break}resp, err := parseResponse(message)if err != nil {break}audio = append(audio, resp.Audio...)if resp.IsLast {break}}// 记录合成结束时间endTime := time.Now()duration = endTime.Sub(startTime)// 打印合成时间fmt.Printf("音频合成时间: %s\n", duration)return audio, duration, nil
}// WebSocket流式合成
//
// WsStreamSynth2 upgrades the request to a WebSocket and then serves
// synthesis requests on it until the client disconnects or a step fails.
//
// The "appid" and "voiceType" query parameters select the application and the
// voice. Each client message must be a JSON object of the form
// {"text": "..."}. For each request the handler sends status text messages,
// the audio as one binary message, and the synthesis duration.
func WsStreamSynth2(c *gin.Context) {
	// NOTE(review): CheckOrigin accepts every origin, as in the original
	// sample. Restrict it before exposing this endpoint to browsers.
	upgrader := websocket.Upgrader{
		CheckOrigin: func(r *http.Request) bool { return true },
	}
	conn, err := upgrader.Upgrade(c.Writer, c.Request, nil)
	if err != nil {
		// Upgrade has already replied to the client with an HTTP error, so
		// nothing more may be written to the response here.
		return
	}
	defer conn.Close()

	appid := c.Query("appid")
	voiceType := c.Query("voiceType")

	sendText := func(msg string) error {
		return conn.WriteMessage(websocket.TextMessage, []byte(msg))
	}

	if err := sendText("连接已建立"); err != nil {
		return
	}

	type requestData struct {
		Text string `json:"text"`
	}

	for {
		var req requestData
		if err := conn.ReadJSON(&req); err != nil {
			// Best effort: the peer may already be gone.
			_ = sendText("Invalid JSON format")
			return
		}

		// Reject empty text locally instead of spending a backend round trip
		// on it and closing the connection when that call fails.
		if req.Text == "" {
			if err := sendText("Text is required"); err != nil {
				return
			}
			continue
		}

		if err := sendText("开始处理合成音频"); err != nil {
			return
		}

		audio, elapsed, err := DouBaoAudioStreamSynth(req.Text, voiceType, appid)
		if err != nil {
			_ = sendText("生成音频失败")
			return
		}

		// Send the audio as binary data, then the timing and a ready notice. Any
		// failed write means the connection is unusable, so stop serving.
		if err := conn.WriteMessage(websocket.BinaryMessage, audio); err != nil {
			return
		}
		if err := sendText("本次合成时间: " + elapsed.String()); err != nil {
			return
		}
		if err := sendText("继续监听..."); err != nil {
			return
		}
	}
}

// main registers the WebSocket route and serves on port 8081.
func main() {
	// Initialise the Gin router.
	r := gin.Default()

	// WebSocket request handling.
	r.GET("/WsStreamSynth/ws", WsStreamSynth2)

	// Run blocks until the server stops; report why instead of exiting
	// silently.
	if err := r.Run(":8081"); err != nil {
		fmt.Printf("server stopped [err:%s]\n", err.Error())
	}
}