模型协同构建智能流程体
1.前言
在人工智能技术向纵深发展的当下,单一模型已难以满足复杂场景的智能化需求。通过创新设计的异构模型协同架构,能够有机整合不同功能模块的技术优势,构建具备动态任务解析能力的智能流程体。例如医疗领域整合医学影像分析、电子病历解析、病理语音记录处理,可以实现"问诊-检查-诊断-方案"全流程智能化。
2.实践
在我们日常的工作中其实每一个完整的工作场景都可以被分解为不同的片段或具体的单一“动作”,将每一个片段进行功能化后再流程化的组合,就完成了一个工作场景。我们可以通过设计不同的提示词来引导大模型进行对话,来完成关键信息的输入、输出。然后通过辅助一些特定的模型将整体的流程场景化。
本文通过整合语音交互(ASR/TTS)、语义理解(NLP)、智能对话(LLM)、语音通话等核心组件,构建一个来访人员登记预约的流程体示例。此前的例子都是用Python实现的,有同事反馈希望能提供Java版本,因此本方案选择Java作为主要实现语言。
1)语音采集识别模型集成
package com.XXX;
import javax.sound.sampled.*;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import com.alibaba.dashscope.audio.asr.recognition.Recognition;
import com.alibaba.dashscope.audio.asr.recognition.RecognitionParam;
import com.google.gson.*;public class Recognize3 {// 录音参数配置private static final int SAMPLE_RATE = 16000; // 16kHz采样率private static final int SAMPLE_SIZE = 16; // 16bit采样位数private static final int CHANNELS = 1; // 单声道private static final boolean SIGNED = true; // 有符号采样private static final boolean BIG_ENDIAN = false;// 小端模式private static final String MODEL = "paraformer-realtime-v2";private static final String APIKEY = "XXXXXXX";public static String steamRecognize(){try {// 1. 录制音频byte[] audioData = recordAudio(10); // 录制10秒// 2. 保存为PCM文件saveAsPCM(audioData, "recording.pcm");System.out.println("录音完成");} catch (Exception e) {e.printStackTrace();}// 创建Recognition实例Recognition recognizer = new Recognition();// 创建RecognitionParamRecognitionParam param =RecognitionParam.builder()// 若没有将API Key配置到环境变量中,需将下面这行代码注释放开,并将apiKey替换为自己的API Key.apiKey(APIKEY).model(MODEL).format("pcm").sampleRate(16000)// “language_hints”只支持paraformer-v2和paraformer-realtime-v2模型.parameter("language_hints", new String[]{"zh"}).build();// 记录原始响应String jsonResult = recognizer.call(param, new File("recording.pcm"));return parseRecognizerResult(jsonResult);}/*** 执行音频录制* @param seconds 录音时长(秒)*/private static byte[] recordAudio(int seconds)throws LineUnavailableException, InterruptedException {AudioFormat format = new AudioFormat(SAMPLE_RATE, SAMPLE_SIZE, CHANNELS, SIGNED, BIG_ENDIAN);TargetDataLine line = AudioSystem.getTargetDataLine(format);line.open(format);line.start();ByteArrayOutputStream out = new ByteArrayOutputStream();byte[] buffer = new byte[4096];System.out.println("开始录音...");long endTime = System.currentTimeMillis() + seconds * 1000;while (System.currentTimeMillis() < endTime) {int bytesRead = line.read(buffer, 0, buffer.length);if (bytesRead > 0) {out.write(buffer, 0, bytesRead);}}line.stop();line.close();System.out.println("录音结束");return out.toByteArray();}/*** 保存为原始PCM文件*/private static void saveAsPCM(byte[] audioData, String filename)throws IOException {File targetFile = new 
File(filename);// 检查并删除已存在的文件if (targetFile.exists()) {if (!targetFile.delete()) {throw new IOException("Failed to delete existing file: " + filename);}}try (FileOutputStream fos = new FileOutputStream(targetFile)) {fos.write(audioData);}}public static String parseRecognizerResult(String jsonResult) {//System.out.println(jsonResult);String result = "转换失败";try {// 空响应检查if (jsonResult == null) {return result;}if (jsonResult.isEmpty()) {return result;}// 解析JSON结构JsonObject jsonObject = JsonParser.parseString(jsonResult).getAsJsonObject();// 提取sentences对象if (!jsonObject.has("sentences")) {return result;}JsonObject sentences = jsonObject.get("sentences").getAsJsonArray().get(0).getAsJsonObject();// 提取text内容if (!sentences.has("text")) {return result;}return sentences.get("text").getAsString();} catch (Exception e) {e.printStackTrace();}return result;}}
2)语音合成模型集成
package com.XXX;
// DashScope SDK 版本需要不低于 2.19.0
import com.alibaba.dashscope.aigc.multimodalconversation.AudioParameters;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversation;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationParam;
import com.alibaba.dashscope.aigc.multimodalconversation.MultiModalConversationResult;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import io.reactivex.Flowable;
import javax.sound.sampled.*;
import java.util.Base64;public class Speak {private static final String MODEL = "qwen-tts";private static final String APIKEY = "XXXXX";private static final String TEST = "XXXXX!";public static void streamSpeak(String text) throws ApiException, NoApiKeyException, UploadFileException {MultiModalConversation conv = new MultiModalConversation();MultiModalConversationParam param = MultiModalConversationParam.builder().model(MODEL).text(text).voice(AudioParameters.Voice.CHERRY).apiKey(APIKEY).build();Flowable<MultiModalConversationResult> result = conv.streamCall(param);result.blockingForEach(Speak::accept);}private static void accept(MultiModalConversationResult r) {try {// 1. 获取Base64编码的音频数据String base64Data = r.getOutput().getAudio().getData();byte[] audioBytes = Base64.getDecoder().decode(base64Data);// 2. 配置音频格式(根据API返回的音频格式调整)AudioFormat format = new AudioFormat(AudioFormat.Encoding.PCM_SIGNED,24000, // 采样率(需与API返回格式一致)16, // 采样位数1, // 声道数2, // 帧大小(位数/字节数)16000, // 数据传输率false // 是否压缩);// 3. 实时播放音频数据DataLine.Info info = new DataLine.Info(SourceDataLine.class, format);try (SourceDataLine line = (SourceDataLine) AudioSystem.getLine(info)) {if (line != null) {line.open(format);line.start();line.write(audioBytes, 0, audioBytes.length);line.drain();}}} catch (LineUnavailableException e) {e.printStackTrace();}}public static void main(String[] args) {try {streamSpeak(TEST);} catch (ApiException | NoApiKeyException | UploadFileException e) {System.out.println(e.getMessage());}System.exit(0);}
}
3)对话大模型集成
package com.XXXX;
import java.util.ArrayList;
import java.util.List;
import com.alibaba.dashscope.aigc.generation.Generation;
import com.alibaba.dashscope.aigc.generation.GenerationParam;
import com.alibaba.dashscope.aigc.generation.GenerationResult;
import com.alibaba.dashscope.common.Message;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.InputRequiredException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import java.util.Scanner;public class Deepseek {private static final String MODEL = "deepseek-r1";private static final String APIKEY = "XXXXXXXXX";private static final String TEST = "XXXXXXXXXX";public static GenerationParam createGenerationParam(List<Message> messages) {return GenerationParam.builder()// 若没有配置环境变量,请用阿里云百炼API Key将下行替换为:.apiKey("sk-xxx").apiKey(APIKEY).model(MODEL).messages(messages).resultFormat(GenerationParam.ResultFormat.MESSAGE).build();}public static GenerationResult callGenerationWithMessages(GenerationParam param) throws ApiException, NoApiKeyException, InputRequiredException {Generation gen = new Generation();return gen.call(param);}public static Message createMessage(Role role, String content) {return Message.builder().role(role.getValue()).content(content).build();}public static String deepSeek(List<Message> messages,String roleString,Role role) throws NoApiKeyException, InputRequiredException {messages.add(Deepseek.createMessage(role, roleString));GenerationParam param = createGenerationParam(messages);GenerationResult result = callGenerationWithMessages(param);messages.add(result.getOutput().getChoices().get(0).getMessage());roleString = result.getOutput().getChoices().get(0).getMessage().getContent();//System.out.println(messages.toString());return roleString;}public static void main(String[] args) {try {List<Message> messages = new ArrayList<>();messages.add(createMessage(Role.SYSTEM, TEST));for (int i = 0; i < 3;i++) {Scanner scanner = new Scanner(System.in);System.out.print("请输入:");String userInput = scanner.nextLine();if ("exit".equalsIgnoreCase(userInput)) {break;}deepSeek(messages,userInput,Role.USER);}} catch (ApiException | NoApiKeyException | InputRequiredException e) {e.printStackTrace();}System.exit(0);}
}
4)语音通知集成
package com.XXXXX;
import com.aliyun.auth.credentials.Credential;
import com.aliyun.auth.credentials.provider.StaticCredentialProvider;
import com.aliyun.core.http.HttpClient;
import com.aliyun.core.http.HttpMethod;
import com.aliyun.core.http.ProxyOptions;
import com.aliyun.httpcomponent.httpclient.ApacheAsyncHttpClientBuilder;
import com.aliyun.sdk.service.dyvmsapi20170525.models.*;
import com.aliyun.sdk.service.dyvmsapi20170525.*;
import com.google.gson.Gson;
import darabonba.core.RequestConfiguration;
import darabonba.core.client.ClientOverrideConfiguration;
import darabonba.core.utils.CommonUtil;
import darabonba.core.TeaPair;//import javax.net.ssl.KeyManager;
//import javax.net.ssl.X509TrustManager;
import java.net.InetSocketAddress;
import java.time.Duration;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.io.*;
public class Call {public static void callPhone(String phone,String name) throws Exception {// HttpClient Configuration/*HttpClient httpClient = new ApacheAsyncHttpClientBuilder().connectionTimeout(Duration.ofSeconds(10)) // Set the connection timeout time, the default is 10 seconds.responseTimeout(Duration.ofSeconds(10)) // Set the response timeout time, the default is 20 seconds.maxConnections(128) // Set the connection pool size.maxIdleTimeOut(Duration.ofSeconds(50)) // Set the connection pool timeout, the default is 30 seconds// Configure the proxy.proxy(new ProxyOptions(ProxyOptions.Type.HTTP, new InetSocketAddress("<your-proxy-hostname>", 9001)).setCredentials("<your-proxy-username>", "<your-proxy-password>"))// If it is an https connection, you need to configure the certificate, or ignore the certificate(.ignoreSSL(true)).x509TrustManagers(new X509TrustManager[]{}).keyManagers(new KeyManager[]{}).ignoreSSL(false).build();*/// Configure Credentials authentication information, including ak, secret, tokenStaticCredentialProvider provider = StaticCredentialProvider.create(Credential.builder()// Please ensure that the environment variables ALIBABA_CLOUD_ACCESS_KEY_ID and ALIBABA_CLOUD_ACCESS_KEY_SECRET are set..accessKeyId("XXXXXX").accessKeySecret("XXXXXX")//.securityToken(System.getenv("ALIBABA_CLOUD_SECURITY_TOKEN")) // use STS token.build());// Configure the ClientAsyncClient client = AsyncClient.builder().region("cn-beijing") // Region ID//.httpClient(httpClient) // Use the configured HttpClient, otherwise use the default HttpClient (Apache HttpClient).credentialsProvider(provider)//.serviceConfiguration(Configuration.create()) // Service-level configuration// Client-level configuration rewrite, can set Endpoint, Http request parameters, etc..overrideConfiguration(ClientOverrideConfiguration.create()// Endpoint 请参考 https://api.aliyun.com/product/Dyvmsapi.setEndpointOverride("dyvmsapi.aliyuncs.com")//.setConnectTimeout(Duration.ofSeconds(30))).build();// 
Parameter settings for API requestSingleCallByTtsRequest singleCallByTtsRequest = SingleCallByTtsRequest.builder().ttsParam("{\"name\":\""+name+"\"}").ttsCode("XXXXXXXX").calledNumber(phone)// Request-level configuration rewrite, can set Http request parameters, etc.// .requestConfiguration(RequestConfiguration.create().setHttpHeaders(new HttpHeaders())).build();// Asynchronously get the return value of the API requestCompletableFuture<SingleCallByTtsResponse> response = client.singleCallByTts(singleCallByTtsRequest);// Synchronously get the return value of the API requestSingleCallByTtsResponse resp = response.get();System.out.println(new Gson().toJson(resp));// Asynchronous processing of return values/*response.thenAccept(resp -> {System.out.println(new Gson().toJson(resp));}).exceptionally(throwable -> { // Handling exceptionsSystem.out.println(throwable.getMessage());return null;});*/// Finally, close the clientclient.close();}}
5)核心prompt工程
这个prompt目的是通过大模型对话引导访客完成关键信息的输出和采集
// System prompt for the dialogue model. It tells the assistant to collect five items (visitor name, mobile number, visit purpose, appointment time to the hour, host name) and lists the conversation rules.
String PROMPT = "你是一个专业的访客预约助手,需要帮助用户完成以下信息收集:\n"+"1. 访客姓名\n"+"2. 联系方式(手机号)\n"+"3. 访问目的\n"+"4. 预约时间(精确到小时)\n"+"5. 被访人姓名\n"+"要求:\n"+"1.使用中文口语化交流\n"+"每次只问一个问题\n"+"对用户输入进行验证和澄清\n"+"最后确认所有信息";
这个prompt目的是通过分析录入的多轮对话进行关键信息的抽取以便于和代码的结合
// Extraction prompt: embeds the value of `text` and asks for JSON output with the fields name, purpose, visit_time, phone and host.
// NOTE(review): the field list, the embedded text and the trailing rules are concatenated without line breaks between them.
String prompt ="你是一个专业的信息抽取助手,请按以下步骤处理对话内容:\n" +"1. 仔细阅读并理解以下对话内容\n" +"2. 提取其中关键的实体、事件和属性信息\n" +"3. 按JSON格式输出结果,包含以下字段:" +" - 姓名 (name)" +" - 访问目的 (purpose)" +" - 时间 (visit_time)" +" - 电话 (phone)" +" - 被访人 (host)" +"对话内容:" + text +"请确保:" +"- 字段值为null时显示为null\n" +"- 时间统一用ISO8601格式\n" +"- 排除无关的闲聊内容";
6)整合代码
package com.XXXXXX;
import com.alibaba.dashscope.common.Message;
import com.alibaba.dashscope.common.Role;
import com.alibaba.dashscope.exception.ApiException;
import com.alibaba.dashscope.exception.InputRequiredException;
import com.alibaba.dashscope.exception.NoApiKeyException;
import com.alibaba.dashscope.exception.UploadFileException;
import java.sql.*;
import java.sql.Connection;
import java.time.LocalDateTime;
import java.util.*;public class VoiceBookingSystem {private static final String PROMPT = "你是一个专业的访客预约助手,需要帮助用户完成以下信息收集:\n"+"1. 访客姓名\n"+"2. 联系方式(手机号)\n"+"3. 访问目的\n"+"4. 预约时间(精确到小时)\n"+"5. 被访人姓名\n"+"要求:\n"+"1.使用中文口语化交流\n"+"每次只问一个问题\n"+"对用户输入进行验证和澄清\n"+"最后确认所有信息";private static final String DB_PATH = "booking.db";// 数据库连接private Connection conn;public VoiceBookingSystem() throws Exception {initDB();}private void initDB() throws SQLException {conn = DriverManager.getConnection("jdbc:sqlite:" + DB_PATH);try (Statement stmt = conn.createStatement()) {stmt.execute("CREATE TABLE IF NOT EXISTS bookings (" +"id INTEGER PRIMARY KEY AUTOINCREMENT," +"name TEXT, phone TEXT, purpose TEXT," +"visit_time TEXT, host TEXT, created_at TIMESTAMP)");}}private Map<String, String> extractInfo(String text) {String prompt ="你是一个专业的信息抽取助手,请按以下步骤处理对话内容:\n" +"1. 仔细阅读并理解以下对话内容\n" +"2. 提取其中关键的实体、事件和属性信息\n" +"3. 按JSON格式输出结果,包含以下字段:" +" - 姓名 (name)" +" - 访问目的 (purpose)" +" - 时间 (visit_time)" +" - 电话 (phone)" +" - 被访人 (host)" +"对话内容:" + text +"请确保:" +"- 字段值为null时显示为null\n" +"- 时间统一用ISO8601格式\n" +"- 排除无关的闲聊内容";List<Message> messages = new ArrayList<>();String system = null;try {system = Deepseek.deepSeek(messages,prompt, Role.USER);} catch (NoApiKeyException | InputRequiredException e) {System.out.println(e.getMessage());}//System.out.println(system);return MapStringParser.parseStringToMap(system);}private void confirmBooking(Map<String, String> info) {// 数据库保存逻辑String sql = "INSERT INTO bookings (name, phone, purpose, visit_time, host, created_at) " +"VALUES (?, ?, ?, ?, ?, ?)";System.out.println(info.get("name"));System.out.println(info.get("phone"));System.out.println(info.get("purpose"));System.out.println(info.get("visit_time"));System.out.println(info.get("host"));try (PreparedStatement pstmt = conn.prepareStatement(sql)) {pstmt.setString(1, info.get("name"));pstmt.setString(2, info.get("phone"));pstmt.setString(3, info.get("purpose"));pstmt.setString(4, 
info.get("visit_time"));pstmt.setString(5, info.get("host"));pstmt.setTimestamp(6, Timestamp.valueOf(LocalDateTime.now()));pstmt.executeUpdate();} catch (SQLException e) {System.out.println(e.getMessage());}try {//预约成功给预约人和被访人打电话通知Call.callPhone(info.get("phone"),info.get("name"));} catch (Exception e) {System.out.println(e.getMessage());}}private void run() {StringBuffer list = new StringBuffer("SYSTEM:您好!我是语音访客预约助手,请说出您的预约信息。");try {Speak.streamSpeak("您好!我是语音访客预约助手,请说出您的预约信息。");System.out.println("您好!我是语音访客预约助手,请说出您的预约信息。");} catch (ApiException | NoApiKeyException | UploadFileException e) {System.out.println(e.getMessage());}List<Message> messages = new ArrayList<>();messages.add(Deepseek.createMessage(Role.SYSTEM, PROMPT));while (true) {String user = null;user = Recognize3.steamRecognize();System.out.println("USER:"+user);list.append("USER:").append(user);String system = null;try {system = Deepseek.deepSeek(messages, user, Role.USER);} catch (NoApiKeyException | InputRequiredException e) {System.out.println(e.getMessage());}System.out.println("SYSTEM:"+system);list.append("SYSTEM:").append(system);try {Speak.streamSpeak(system);} catch (ApiException | NoApiKeyException | UploadFileException e) {System.out.println(e.getMessage());}// 提取信息Map<String, String> info = extractInfo(list.toString());System.out.println(info);if (info.size() >= 5) {confirmBooking(info);break;}}}public static void main(String[] args) throws Exception {VoiceBookingSystem vb = new VoiceBookingSystem();vb.run();}}
7)pom文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.lihao</groupId>
    <artifactId>zhinengjiaohu</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <!-- Example pom.xml -->
    <dependencies>
        <!-- https://mvnrepository.com/artifact/com.alibaba/dashscope-sdk-java -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>dashscope-sdk-java</artifactId>
            <version>2.20.0</version>
        </dependency>
        <dependency>
            <groupId>com.aliyun</groupId>
            <artifactId>alibabacloud-dyvmsapi20170525</artifactId>
            <version>2.0.0</version>
        </dependency>
        <dependency>
            <groupId>com.squareup.okhttp3</groupId>
            <artifactId>okhttp</artifactId>
            <version>4.9.3</version>
        </dependency>
        <dependency>
            <groupId>org.xerial</groupId>
            <artifactId>sqlite-jdbc</artifactId>
            <version>3.36.0.3</version>
        </dependency>
    </dependencies>
</project>
8)工具类
package com.XXXX;import java.util.HashMap;
import java.util.Map;
import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Lenient parser that turns loosely formatted {@code key: value} text, such
 * as a JSON-like reply from a language model, into a flat string map.
 */
public class MapStringParser {
    /** Block comments that open and close on the same line. */
    private static final Pattern BLOCK_COMMENT = Pattern.compile("/\\*.*?\\*/");
    /** Runs of line breaks, which separate pairs just like commas do. */
    private static final Pattern LINE_BREAKS = Pattern.compile("[\\r\\n]+");
    /** Remaining ASCII control characters. */
    private static final Pattern CONTROL_CHARS = Pattern.compile("[\\u0000-\\u001F]");
    private static final Pattern PAIR_SEPARATOR = Pattern.compile("\\s*,\\s*");
    private static final Pattern KEY_VALUE_SEPARATOR = Pattern.compile("\\s*:\\s*");

    /**
     * Parses {@code key: value} pairs separated by commas and/or line breaks.
     *
     * <p>Quotes, backticks, braces, block comments and the word {@code json}
     * are stripped first, so a fenced JSON object with flat string values can
     * be read as well. Keys are lower-cased. A pair is split at its first
     * colon only, so values such as times may contain further colons. Pairs
     * whose value is empty or the literal {@code null} are skipped, as are
     * fragments without a colon.
     *
     * <p>Known limitations: values containing commas are split, and the text
     * {@code json} is removed wherever it occurs, including inside values.
     *
     * @param input the text to parse; may be null
     * @return a new mutable map of the parsed pairs; empty for null or blank input
     */
    public static Map<String, String> parseStringToMap(String input) {
        Map<String, String> resultMap = new HashMap<>();
        if (input == null || input.trim().isEmpty()) {
            return resultMap;
        }

        String cleaned = BLOCK_COMMENT.matcher(input).replaceAll("");
        // Turn line breaks into separators before control characters are
        // dropped; otherwise "a: 1\nb: 2" collapses into the single pair
        // a = "1b: 2".
        cleaned = LINE_BREAKS.matcher(cleaned).replaceAll(",");
        cleaned = cleaned
                .replace("`", "")
                .replace("'", "")
                .replace("\"", "")
                .replace("}", "")
                .replace("{", "")
                .replace("json", "");
        cleaned = CONTROL_CHARS.matcher(cleaned).replaceAll("");
        cleaned = cleaned.replace("<script>", "");

        for (String pair : PAIR_SEPARATOR.split(cleaned.trim())) {
            String[] keyValue = KEY_VALUE_SEPARATOR.split(pair, 2);
            if (keyValue.length != 2) {
                continue;
            }
            // Locale.ROOT keeps key casing independent of the default locale.
            String key = keyValue[0].trim().toLowerCase(Locale.ROOT);
            String value = keyValue[1].trim();
            if (!value.isEmpty() && !value.equals("null")) {
                resultMap.put(key, value);
            }
        }
        return resultMap;
    }

    /** Manual smoke test that prints the pairs parsed from a sample line. */
    public static void main(String[] args) {
        String input = "name: 张三 , phone:13812345678, purpose:拜访, visit_time:2023-10-25 14:30, host:李四";
        Map<String, String> result = parseStringToMap(input);
        result.forEach((key, value) -> System.out.println(key + " : " + value));
    }
}
3.流程演示
4.存在的问题
整体流程算是磕磕绊绊下来了,虽然验证了整体路径,但是还是存在很多问题:
1.提示词很关键,需要不断的结合大模型去磨合去调整,才能更加精准。包括引导对话的提示词,还有关键信息提取的提示词。
2.对话模型的理解随机性比较大,有时候流程很完美,有时候流程不尽如人意,比如上面的流程中模型已经回复结束了,又突然意识到预约的时间不够精准。
3.语音识别模型的效率和准确度关系到整体的体验。
4.语音合成模型的效率和自然度关系到整体的体验。
5.语音拨打这一块倒是已经很成熟,但是号码被标记成了骚扰电话。
6.前置流程可以增加人脸图片的抓取和识别,这样要是存在预约记录的可以直接调取与访客确认修改。