
Building a Voice Assistant with the Azure OpenAI Realtime Model

Microsoft Azure now offers the OpenAI Realtime family of multimodal models. Since I had already been using Azure, I deployed and tested the model there rather than going through OpenAI directly.

Before multimodal models appeared, voice input and output had to be stitched together from speech-to-text (STT), a large language model, and text-to-speech (TTS). Compared with a Realtime model, that cascaded approach has several drawbacks (a rough sketch of the cascaded flow follows the list):

  • Latency

        STT and LLM processing run as separate steps, and each step adds delay.

  • Error accumulation

        STT recognition errors feed directly into the LLM's input and may be amplified downstream.

  • Context continuity

        After transcription, non-verbal cues such as intonation and pauses are lost, which hurts the model's grasp of context.

  • Implementation complexity

        STT + LLM + TTS means integrating several independent modules, so debugging and maintenance cost more.

  • Cost and resource usage

        A multi-stage pipeline has to deploy several models, so compute and API-call costs are higher.

  • Naturalness of the interaction

        TTS output can lack emotional variation, so it still falls short of talking to a real person.

  • Multilingual support

        Separate STT/TTS modules may support different languages unevenly and need to be trained per language.

Seen against that list, a multimodal Realtime model genuinely removes many of these problems and makes the overall experience noticeably better.
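To make the latency and error-propagation points concrete, here is a rough conceptual sketch of the cascaded flow; speechToText, chatCompletion and textToSpeech are hypothetical placeholders, not a real API. Each stage has to finish before the next one starts, so the delays add up and any recognition error from the first stage is carried through the rest.

// Conceptual sketch only; the three helpers are hypothetical stand-ins
// for whatever STT, LLM and TTS services are actually integrated.
async function cascadedTurn(audioInput) {
    const transcript = await speechToText(audioInput);   // step 1: STT latency; recognition errors start here
    const replyText  = await chatCompletion(transcript); // step 2: LLM latency; errors from step 1 propagate
    const replyAudio = await textToSpeech(replyText);    // step 3: TTS latency; the user's intonation is already lost
    return replyAudio;                                    // total delay = sum of all three steps
}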

Azure currently exposes two APIs for talking to the model over voice: WebRTC and WebSocket. Starting from the official WebRTC demo, I built a voice assistant that sets instructions and registers tools (functions), and that shows how a tool's function gets triggered; the same mechanism can be used for RAG and other capabilities.

Before the page can run, it needs the parameters of the model deployed on Azure, which are available as soon as the deployment is created (a minimal sketch of how they are used follows the list):

  1. SESSIONS_URL
  2. API_KEY
  3. DEPLOYMENT
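As a quick orientation, here is a minimal sketch of where these values go (the full demo below does the same thing inside StartSession): the page POSTs to SESSIONS_URL with the api-key header and the deployment name, and the response contains an ephemeral client secret that is later used as the Bearer token for the WebRTC SDP exchange. The placeholder values are assumptions to be replaced with your own.

// Minimal sketch: exchange the API key for an ephemeral client secret.
// The three constants are placeholders; in production this request belongs in a backend service.
const SESSIONS_URL = "https://<your-resource>.openai.azure.com/openai/realtimeapi/sessions?api-version=2025-04-01-preview";
const API_KEY = "<your-api-key>";
const DEPLOYMENT = "<your-deployment-name>";

const response = await fetch(SESSIONS_URL, {
    method: "POST",
    headers: { "api-key": API_KEY, "Content-Type": "application/json" },
    body: JSON.stringify({ model: DEPLOYMENT, voice: "sage" })
});
const session = await response.json();
const ephemeralKey = session.client_secret?.value;   // authenticates the WebRTC connection later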

The WebRTC event exchange with the server runs over a DataChannel, and the most important message is session.update. The prompt is handed to the model through its instructions parameter, and its tools parameter configures the available functions; note that tools does not support MCP servers yet. A stripped-down session.update example follows, and after it the complete demo page.
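A minimal session.update event looks roughly like this (only the instructions and a single function tool are shown; the full demo below also configures server-side turn detection):

// Minimal session.update sketch: set the prompt and register one function tool.
const sessionUpdate = {
    type: "session.update",
    session: {
        instructions: "...your system prompt here...",
        tools: [{
            type: "function",
            name: "createMeeting",
            description: "Create a meeting with a given start time, duration and title.",
            parameters: {
                type: "object",
                properties: {
                    start_time: { type: "string" },
                    duration: { type: "integer" },
                    title: { type: "string" }
                },
                required: ["start_time", "duration", "title"]
            }
        }],
        tool_choice: "auto"
    }
};
dataChannel.send(JSON.stringify(sessionUpdate));   // dataChannel is the WebRTC DataChannel opened to the model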

<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Azure OpenAI Realtime Session</title>
</head>
<body>
    <h1>Azure OpenAI Realtime Session</h1>
    <p>WARNING: Don't use this code sample in production with the API key hardcoded. Use a protected backend service to call the sessions API and generate the ephemeral key. Then return the ephemeral key to the client.</p>
    <button onclick="StartSession()">点击开始聊天</button>

    <!-- Log container for API messages -->
    <div id="logContainer"></div>

    <script>
        // Make sure the WebRTC URL region matches the region of your Azure OpenAI resource.
        // For example, if your Azure OpenAI resource is in the swedencentral region,
        // the WebRTC URL should be https://swedencentral.realtimeapi-preview.ai.azure.com/v1/realtimertc.
        // If your Azure OpenAI resource is in the eastus2 region, the WebRTC URL should be https://eastus2.realtimeapi-preview.ai.azure.com/v1/realtimertc.
        const WEBRTC_URL = "https://eastus2.realtimeapi-preview.ai.azure.com/v1/realtimertc"

        // The SESSIONS_URL includes the Azure OpenAI resource URL,
        // deployment name, the /realtime/sessions path, and the API version.
        // The Azure OpenAI resource region isn't part of the SESSIONS_URL.
        const SESSIONS_URL = "https://xxxxxxxxxx.openai.azure.com/openai/realtimeapi/sessions?api-version=2025-04-01-preview"

        // The API key of the Azure OpenAI resource.
        const API_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";

        // The deployment name might not be the same as the model name.
        const DEPLOYMENT = "xxxxxxxxxxxxxxx"
        const VOICE = "sage"

        async function StartSession() {
            try {
                // WARNING: Don't use this code sample in production
                // with the API key hardcoded.
                // Use a protected backend service to call the
                // sessions API and generate the ephemeral key.
                // Then return the ephemeral key to the client.
                const response = await fetch(SESSIONS_URL, {
                    method: "POST",
                    headers: {
                        //"Authorization": `Bearer ${ACCESS_TOKEN}`,
                        "api-key": API_KEY,
                        "Content-Type": "application/json"
                    },
                    body: JSON.stringify({
                        model: DEPLOYMENT,
                        voice: VOICE
                    })
                });

                if (!response.ok) {
                    throw new Error(`API request failed`);
                }

                const data = await response.json();
                const sessionId = data.id;
                const ephemeralKey = data.client_secret?.value;
                console.error("Ephemeral key:", ephemeralKey);

                // Mask the ephemeral key in the log message.
                logMessage("Ephemeral Key Received: " + "***");
                logMessage("WebRTC Session Id = " + sessionId);

                // Set up the WebRTC connection using the ephemeral key.
                init(ephemeralKey);
            } catch (error) {
                console.error("Error fetching ephemeral key:", error);
                logMessage("Error fetching ephemeral key: " + error.message);
            }
        }

        function createMeeting(args) {
            alert("触发了我配置的工具函数 【createMeeting】!您预约的会议[" + args.title + "]在: " + args.start_time + " 召开,持续时长:" + args.duration + "分钟");
            return true;
        }

        async function init(ephemeralKey) {
            let peerConnection = new RTCPeerConnection();

            // Set up to play remote audio from the model.
            const audioElement = document.createElement('audio');
            audioElement.autoplay = true;
            document.body.appendChild(audioElement);

            peerConnection.ontrack = (event) => {
                audioElement.srcObject = event.streams[0];
            };

            // Set up data channel for sending and receiving events
            const clientMedia = await navigator.mediaDevices.getUserMedia({ audio: true });
            const audioTrack = clientMedia.getAudioTracks()[0];
            peerConnection.addTrack(audioTrack);

            const dataChannel = peerConnection.createDataChannel('realtime-channel');

            dataChannel.addEventListener('open', () => {
                logMessage('Data channel is open');
                updateSession(dataChannel);
                responseCreate(dataChannel);
                //conversationCreate(dataChannel);
            });

            dataChannel.addEventListener('message', (event) => {
                const realtimeEvent = JSON.parse(event.data);
                console.log(realtimeEvent);
                logMessage("Received server event: " + JSON.stringify(realtimeEvent, null, 2));
                if (realtimeEvent.type === "session.update") {
                    const instructions = realtimeEvent.session.instructions;
                    logMessage("Instructions: " + instructions);
                } else if (realtimeEvent.type === "session.error") {
                    logMessage("Error: " + realtimeEvent.error.message);
                } else if (realtimeEvent.type === "session.end") {
                    logMessage("Session ended.");
                } else if (realtimeEvent.type === "response.output_item.done") {
                    const item = realtimeEvent.item;
                    if (item.type == "function_call") {
                        try {
                            const args = JSON.parse(item.arguments);
                            if (item.name == "createMeeting") {
                                let result = createMeeting(args);
                                if (result) {
                                    conversationCreate(dataChannel, "到这边说明会议已经创建成功了!请进行相应回复");
                                    responseCreate(dataChannel);
                                }
                            }
                        } catch (e) {
                            logMessage("createMeeting error: " + e);
                            return;
                        }
                    }
                }
            });

            dataChannel.addEventListener('close', () => {
                logMessage('Data channel is closed');
            });

            // Start the session using the Session Description Protocol (SDP)
            const offer = await peerConnection.createOffer();
            await peerConnection.setLocalDescription(offer);

            const sdpResponse = await fetch(`${WEBRTC_URL}?model=${DEPLOYMENT}`, {
                method: "POST",
                body: offer.sdp,
                headers: {
                    Authorization: `Bearer ${ephemeralKey}`,
                    "Content-Type": "application/sdp",
                },
            });

            const answer = { type: "answer", sdp: await sdpResponse.text() };
            await peerConnection.setRemoteDescription(answer);

            const button = document.createElement('button');
            button.innerText = 'Close Session';
            button.onclick = stopSession;
            document.body.appendChild(button);

            // Send a client event to update the session
            function updateSession(dataChannel) {
                const event = {
                    type: "session.update",
                    session: {
                        instructions: "你是一名视频会议助理,请用亲切的语气为客户提供服务。你的开场招呼语是:我是您的会议助理小爱!请问有什么能够帮助您的么?\n##注意:\n#你目前只能提供创建会议的服务,并且需要客户提供开始时间、持续时长、会议名称。\n#在正式创建会议前,请再确认一下客户提供的信息是否修改,不需要则开始执行createMeeting函数。",
                        "turn_detection": {
                            "type": "server_vad",
                            "threshold": 0.5,
                            "prefix_padding_ms": 300,
                            "silence_duration_ms": 200,
                            "create_response": true
                        },
                        tools: [
                            /*
                            {
                                "type": "mcp",
                                "server_label": "dmcp",
                                "server_url": "https://dmcp-server.deno.dev/sse",
                                "require_approval": "never"
                            },
                            */
                            {
                                "type": "function",
                                "name": "queryKnowledgeBase",
                                "description": "查询内部知识库,获取与用户问题相关的信息。",
                                "parameters": {
                                    "type": "object",
                                    "properties": {
                                        "question": {
                                            "type": "string",
                                            "description": "用户的问题"
                                        }
                                    },
                                    "required": ["question"]
                                }
                            },
                            {
                                "type": "function",
                                "name": "createMeeting",
                                "description": "创建一场指定开始时间、会议时长以及名称的会议,用于后续开视频会议。",
                                "parameters": {
                                    "type": "object",
                                    "properties": {
                                        "start_time": {
                                            "type": "string",
                                            "description": "会议的开始时间,格式为:(年-月-日 小时:分钟:00) 例:2025-09-09 15:33:00"
                                        },
                                        "duration": {
                                            "type": "integer",
                                            "description": "会议的时长,格式为:数字(分钟) 例:120"
                                        },
                                        "title": {
                                            "type": "string",
                                            "description": "会议名称,默认名称:我的会议"
                                        }
                                    },
                                    "required": ["start_time", "duration", "title"]
                                }
                            }
                        ],
                        tool_choice: "auto"
                    }
                };
                dataChannel.send(JSON.stringify(event));
                logMessage("Sent client event: " + JSON.stringify(event, null, 2));
            }

            function conversationCreate(dataChannel, str) {
                const event = {
                    type: "conversation.item.create",
                    item: {
                        type: "message",
                        role: "user",
                        content: [{ type: "input_text", text: str }],
                    }
                };
                dataChannel.send(JSON.stringify(event));
                logMessage("Sent client event: " + JSON.stringify(event, null, 2));
            }

            function responseCreate(dataChannel) {
                const event = {
                    type: "response.create"
                };
                dataChannel.send(JSON.stringify(event));
                logMessage("Sent client event: " + JSON.stringify(event, null, 2));
            }

            function stopSession() {
                if (dataChannel) dataChannel.close();
                if (peerConnection) peerConnection.close();
                peerConnection = null;
                logMessage("Session closed.");
            }
        }

        function logMessage(message) {
            const logContainer = document.getElementById("logContainer");
            const p = document.createElement("p");
            p.textContent = message;
            logContainer.appendChild(p);
            window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });
        }
    </script>
</body>
</html>
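One note on the function-call handling in the demo: it reports the result of createMeeting back to the model by injecting a plain user text message via conversationCreate. The Realtime event protocol also defines a dedicated item type for tool results; as far as I know the result can instead be returned as a function_call_output item that references the call_id from the function_call event, roughly as sketched below (treat this as an assumption and check the current API reference before relying on it):

// Hedged sketch: return a tool result as a function_call_output item
// instead of a synthetic user message. callId comes from the
// response.output_item.done event that carried the function_call item.
function sendFunctionResult(dataChannel, callId, result) {
    dataChannel.send(JSON.stringify({
        type: "conversation.item.create",
        item: {
            type: "function_call_output",
            call_id: callId,
            output: JSON.stringify(result)
        }
    }));
    // Ask the model to continue with a new response that uses the tool output.
    dataChannel.send(JSON.stringify({ type: "response.create" }));
}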
