在Jetson部署AI语音家居助手(二):语音激活+语音转文字
前言
在jetson上搭建一个AI语音家居助手,主要需要四个部分的工作:
- 1️⃣获取麦克风输入,语音激活功能+语音转文字功能
- 2️⃣部署大语言模型理解转换后的文字输入 (本地推理或云推理)
- 3️⃣提取大模型推理文字结果后,执行文字转语音功能,播放推理结果
- 4️⃣根据推理结果,选择动作执行(控制家居设备)
本文主要完成1️⃣的搭建,2️⃣的搭建可以参考笔者之前的文章:
https://blog.csdn.net/Vingnir/article/details/149119735?spm=1001.2014.3001.5502
该部分功能的参考源码放在github仓库:
硬件构成
该部分主要由三个硬件构成:
Jetson主控- Orin NX 16g super,其他Orin系列产品也可。
麦克风- 四通道,单通道和双通道也可以,用于获取用户语音输入。
扬声器-USB接口或者其他接口都行,可以播放声音就行。
本人使用硬件如下:
主控:https://www.seeedstudio.com/reComputer-Super-J4012-p-6443.html
麦克风:https://www.seeedstudio.com/ReSpeaker-4-Mic-Array-for-Raspberry-Pi.html
运行:首先运行编译好的wav2text程序:
pasuspender -- sudo ./wav2text
然后运行record_lite
:
pasuspender -- sudo ./record_lite
运行的时候增加pasuspender
选项是因为需要挂起PulseAudio,以便我们的程序后端可以直接访问ALSA设备,而不会被PulseAudio占用,避免音频设备冲突问题。
运行结果如下:
record_lite
进程检测到声音能量大于阈值后开始录音,没有输入超过mute的时间阈值后结束录音:
wav2text
完成语音转文字的请求,输出语音转文字结果,并进行语音激活检测,激活后默认播放activate.mp3,可以替换为你想触发的激活响应语音:
whisper-server
完成语音转文字的推理请求,并把推理结果发送给wav2text
:
总结
至此,我们完成了AI语音家居助手搭建的第二个模块。结合本地部署的大语言模型或云端的大语言模型,就能实现一个雏形;剩下的工作就是action部分,并把大模型的输出整理后转为语音输出,增加互动性。
附录
这里把主要的源码贴一下,如果访问不了github可以直接从这里复制:
------------respeaker.cpp------------⬇
#include <alsa/asoundlib.h>
#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <thread>

#define SAMPLE_RATE 44100
#define BIT_PER_SAMPLE 16
#define CHANNELS 2
#define FORMAT SND_PCM_FORMAT_S16_LE
#define RECORD_MS 10000  // maximum length of one recorded segment, in ms
#define SLIENCE_MS 2000  // silence duration that marks the end of speech, in ms
#define DEVICE_NAME "plughw:2,0"  // change to your microphone device; bare hw:2,0 has poor compatibility

typedef int16_t bit_per;

#include "voice_gate.hpp"

std::ofstream wavFile;
size_t totalBytes = 0;  // bytes of PCM payload written to the current wav file
unsigned int sample_rate = SAMPLE_RATE;
int dir;
snd_pcm_format_t format;
unsigned int channels = 4;

// Write the 44-byte canonical PCM WAV header. The RIFF file-size and data-size
// fields are left as 0 placeholders; finalize_wav() patches them when the
// segment is finished.
void write_wav_header(std::ofstream &file, int sample_rate, int num_channels, int bits_per_sample) {
    char header[44] = {0};
    int16_t audio_format = 1; // PCM
    int32_t byte_rate = sample_rate * num_channels * bits_per_sample / 8;
    int16_t block_align = num_channels * bits_per_sample / 8;
    // RIFF chunk
    memcpy(header, "RIFF", 4);
    *(int32_t*)(header + 4) = 0; // placeholder for file size
    memcpy(header + 8, "WAVE", 4);
    // fmt subchunk
    memcpy(header + 12, "fmt ", 4);
    *(int32_t*)(header + 16) = 16; // Subchunk1Size for PCM
    *(int16_t*)(header + 20) = audio_format;
    *(int16_t*)(header + 22) = num_channels;
    *(int32_t*)(header + 24) = sample_rate;
    *(int32_t*)(header + 28) = byte_rate;
    *(int16_t*)(header + 32) = block_align;
    *(int16_t*)(header + 34) = bits_per_sample;
    // data subchunk
    memcpy(header + 36, "data", 4);
    *(int32_t*)(header + 40) = 0; // placeholder for data size
    file.write(header, 44);
}

// Patch the RIFF file-size and data-size placeholders once recording ends.
// Fix: write the fields through int32_t — the old code wrote 4 bytes straight
// out of a size_t, which only worked by accident on 64-bit little-endian hosts.
void finalize_wav(std::ofstream &file, size_t dataBytes) {
    int32_t fileSize = static_cast<int32_t>(36 + dataBytes);
    int32_t dataSize = static_cast<int32_t>(dataBytes);
    file.seekp(4, std::ios::beg);
    file.write(reinterpret_cast<char*>(&fileSize), 4);
    file.seekp(40, std::ios::beg);
    file.write(reinterpret_cast<char*>(&dataSize), 4);
}

int main() {
    snd_pcm_t *pcm_handle;
    snd_pcm_hw_params_t *params;
    int dir;
    int rc = snd_pcm_open(&pcm_handle, DEVICE_NAME, SND_PCM_STREAM_CAPTURE, 0);
    if (rc < 0) {
        std::cerr << "Unable to open PCM device: " << snd_strerror(rc) << std::endl;
        return 1;
    }

    // Allocate the hw-params structure and log the device defaults.
    snd_pcm_hw_params_malloc(&params);
    snd_pcm_hw_params_any(pcm_handle, params);
    snd_pcm_hw_params_get_rate(params, &sample_rate, &dir);
    std::cout << "采样率: " << sample_rate << " Hz" << std::endl;
    snd_pcm_hw_params_get_channels(params, &channels);
    std::cout << "通道数: " << channels << std::endl;
    snd_pcm_hw_params_get_format(params, &format);
    std::cout << "格式: " << snd_pcm_format_name(format)
              << " (" << snd_pcm_format_description(format) << ")" << std::endl;

    // Configure the PCM handle with our own capture parameters.
    snd_pcm_hw_params_set_access(pcm_handle, params, SND_PCM_ACCESS_RW_INTERLEAVED);
    snd_pcm_hw_params_set_format(pcm_handle, params, FORMAT);
    snd_pcm_hw_params_set_channels(pcm_handle, params, CHANNELS);
    snd_pcm_hw_params_set_rate(pcm_handle, params, sample_rate, 0);
    snd_pcm_hw_params(pcm_handle, params);
    snd_pcm_hw_params_free(params);
    snd_pcm_prepare(pcm_handle);

    const int buffer_l_frames = 1024;
    bit_per* buffer_l = new bit_per[buffer_l_frames * CHANNELS](); // zero-initialized capture buffer

    std::string filename = "respeaker_output.wav";

    // Energy gate deciding when a speech segment starts/stops.
    // Fix: pass SLIENCE_MS here — the old code passed RECORD_MS (10 s) as the
    // silence threshold, so segments never ended after 2 s of silence as the
    // macro comment intends.
    VoiceGate gate(/*energyThresh*/ 500,            // per-frame energy threshold
                   /*silenceMs   */ SLIENCE_MS,
                   /*sampleRate  */ SAMPLE_RATE,
                   /*frameSamples*/ buffer_l_frames * CHANNELS);

    size_t total_samples = SAMPLE_RATE * RECORD_MS / 1000; // hard cap, in frames
    size_t recorded_samples = 0;

    while (true) {
        rc = snd_pcm_readi(pcm_handle, buffer_l, buffer_l_frames);
        if (rc < 0) continue;
        if (rc > buffer_l_frames) {
            // fix: report the limit actually compared against (buffer_l_frames)
            std::cerr << "rc = " << rc << " exceeds buffer_frames = " << buffer_l_frames << std::endl;
            continue;
        }
        if (rc > 0) gate.update(buffer_l, rc * CHANNELS);

        bool enable = false;
        if (gate.startSegment()) {
            std::cout << "Start recording segment...٩(๑>◡<๑)۶" << std::endl;
            if (wavFile.is_open()) wavFile.close();
            wavFile.open(filename, std::ios::binary);
            write_wav_header(wavFile, SAMPLE_RATE, CHANNELS, BIT_PER_SAMPLE); // 16-bit PCM
            wavFile.write(reinterpret_cast<char*>(buffer_l), rc * CHANNELS * sizeof(bit_per));
            recorded_samples += rc;
            // fix: include the channel count — the first chunk used to be
            // accounted as rc * sizeof(bit_per), shrinking the data-size field.
            totalBytes += rc * CHANNELS * sizeof(bit_per);

            // Keep capturing until the hard length cap or the gate reports
            // that the speaker has been silent long enough.
            while ((recorded_samples < total_samples) && !gate.stopSegment()) {
                if (enable) {
                    rc = snd_pcm_readi(pcm_handle, buffer_l, buffer_l_frames);
                }
                enable = true; // first pass reuses the chunk that triggered the gate
                // fix: check for errors BEFORE feeding rc to the gate / file —
                // the old order used a negative rc as a sample count.
                if (rc == -EPIPE) {
                    std::cerr << "Overrun occurred" << std::endl;
                    snd_pcm_prepare(pcm_handle);
                    continue;
                } else if (rc < 0) {
                    std::cerr << "Error reading: " << snd_strerror(rc) << std::endl;
                    break;
                }
                gate.update(buffer_l, rc * CHANNELS);
                wavFile.write(reinterpret_cast<char*>(buffer_l), rc * CHANNELS * sizeof(bit_per));
                recorded_samples += rc;
                totalBytes += rc * CHANNELS * sizeof(bit_per);
            }
            std::cout << "Recording is over..(>^ω^<)" << std::endl;
            recorded_samples = 0;
            finalize_wav(wavFile, totalBytes);
            wavFile.close();
            totalBytes = 0;
        }
    }

    // NOTE: unreachable after the infinite loop; kept for symmetry/cleanliness.
    snd_pcm_drain(pcm_handle);
    snd_pcm_close(pcm_handle);
    delete[] buffer_l;
    std::cout << "Saved to " << filename << std::endl;
    return 0;
}
------------voice_gate.hpp------------⬇
#ifndef VOICE_GATE_HPP
#define VOICE_GATE_HPP

#include <stdio.h>
#include <cstdint>
#include <cstdlib>
#include <cmath>
#include <chrono>   // fix: header used std::chrono without including it

typedef int16_t bit_per;

// Energy-based voice activity gate.
//
// Feed interleaved PCM frames via update(); the gate switches from Standby to
// Recording when the mean absolute amplitude of a frame exceeds the threshold,
// and back to Standby after SILENCE_LIMIT_FRAMES consecutive quiet frames.
// startSegment() latches (and clears) the "speech just started" edge;
// stopSegment() reports whether the last segment has ended.
class VoiceGate {
public:
    VoiceGate(double energyThresh,  // e.g. 800 — per-frame mean-abs threshold
              int silenceMs,        // e.g. 1000 — silence that ends a segment
              int sampleRate,       // e.g. 16000
              int frameSamples)     // samples per update() call (all channels)
        : THRESH(energyThresh),
          SILENCE_LIMIT_FRAMES((silenceMs * sampleRate) / (1000 * frameSamples)),
          samplesPerFrame(frameSamples) {}

    // Consume one frame of `len` interleaved samples and advance the state machine.
    void update(const bit_per* data, int len) {
        double e = rmsEnergy(data, len);
        if (state == Standby) {
            if (e > THRESH) { // speech starts
                state = Recording;
                silenceCtr = 0;
                started = true;
                stopped = false;
            }
        } else { // Recording
            if (e > THRESH) {
                re_mute = true; // still speech; re-arm the silence timer
                silenceCtr = 0;
            } else {
                if (re_mute) { // fix: was `if (re_mute = true)` — assignment, always taken
                    start_time = std::chrono::steady_clock::now(); // silence began here
                    re_mute = false;
                }
                ++silenceCtr;
                if (silenceCtr > SILENCE_LIMIT_FRAMES) { // speech ended
                    state = Standby;
                    started = false;
                    stopped = true;
                }
            }
        }
    }

    bool isRecording() const { return state == Recording; }
    // One-shot: true exactly once per detected segment start.
    bool startSegment() { bool f = started; started = false; return f; }
    // Level-triggered: true once the current segment has ended.
    bool stopSegment() { bool f = stopped; return f; }

private:
    enum { Standby, Recording } state = Standby;
    const double THRESH; // fix: was float, silently narrowing the ctor's double
    const int SILENCE_LIMIT_FRAMES;
    const int samplesPerFrame;
    int silenceCtr = 0;
    bool started = false;
    bool stopped = false;
    bool re_mute = true;
    // fix: use a portable steady_clock time_point instead of the
    // libstdc++-internal std::chrono::_V2 spelling.
    std::chrono::steady_clock::time_point start_time{};

    // Mean absolute amplitude of the frame (despite the name, not a true RMS).
    // Fix: guards len <= 0 (short/failed reads) and drops the old range check
    // of an int16_t against ±2^31, which could never fire.
    double rmsEnergy(const bit_per* buf, int len) const {
        if (len <= 0) return 0.0;
        if (len > samplesPerFrame) len = samplesPerFrame; // clamp to one frame
        double sum = 0.0;
        for (int i = 0; i < len; ++i) {
            sum += std::abs(static_cast<int>(buf[i]));
        }
        return sum / len;
    }
};
#endif
------------shared_state.h------------⬇
// shared_state.h
#ifndef SHARED_STATE_H
#define SHARED_STATE_H

// Handshake states exchanged between the recorder and the transcriber
// processes. The raw enum value is written through a named pipe with
// write(&state, sizeof(state)), so the declaration order (and therefore the
// underlying values) must never be rearranged.
enum class VoiceState {
    RECORDED,   // a wav segment is ready for transcription
    RECORDING,  // capture in progress
    DECODE,     // transcription request in flight
    PLAYING,    // activation audio is being played
    RESTART,    // transcriber finished; recorder may start again
    INIT        // initial / idle state
};

#endif
------------wav2text.cpp------------⬇
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string>
#include <vector>      // fix: std::vector was used without its header
#include <fcntl.h>
#include <sys/stat.h>  // for mkfifo
#include <sys/types.h> // for mode_t
#include <chrono>
#include <thread>
#include <curl/curl.h>
#include <nlohmann/json.hpp>
#include "shared_state.h"

using json = nlohmann::json;

// libcurl write callback: append the response body to the std::string at userp.
static size_t WriteCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    ((std::string*)userp)->append((char*)contents, size * nmemb);
    return size * nmemb;
}

// POST the wav file as multipart form data to the local whisper-server
// /inference endpoint and return the raw JSON reply (empty on failure).
std::string transcribe_audio(const std::string& filepath) {
    CURL *curl;
    CURLcode res;
    std::string readBuffer;
    curl_mime *form = nullptr;
    curl_mimepart *field = nullptr;

    curl_global_init(CURL_GLOBAL_ALL);
    curl = curl_easy_init();
    if (curl) {
        form = curl_mime_init(curl);
        // attach the WAV file
        field = curl_mime_addpart(form);
        curl_mime_name(field, "file");
        curl_mime_filedata(field, filepath.c_str());
        // request response_format = json
        field = curl_mime_addpart(form);
        curl_mime_name(field, "response_format");
        curl_mime_data(field, "json", CURL_ZERO_TERMINATED);

        curl_easy_setopt(curl, CURLOPT_URL, "http://127.0.0.1:8080/inference");
        curl_easy_setopt(curl, CURLOPT_MIMEPOST, form);
        // collect the reply into readBuffer
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &readBuffer);

        res = curl_easy_perform(curl);
        if (res != CURLE_OK)
            std::cerr << "curl_easy_perform() failed: " << curl_easy_strerror(res) << std::endl;

        curl_mime_free(form);
        curl_easy_cleanup(curl);
    }
    curl_global_cleanup();
    return readBuffer;
}

// True when any of `keywords` occurs as a substring of `text`.
bool contains_keywords(const std::string& text, const std::vector<std::string>& keywords) {
    for (const auto& keyword : keywords) {
        if (text.find(keyword) != std::string::npos) {
            return true; // keyword found
        }
    }
    return false; // no keyword found
}

int main() {
    mkfifo("/tmp/state_pipe2", 0666); // create the reply pipe (recorder creates /tmp/state_pipe)
    int pipe_fd = open("/tmp/state_pipe", O_RDONLY | O_NONBLOCK);
    int pipe_ba = open("/tmp/state_pipe2", O_WRONLY | O_NONBLOCK);
    VoiceState state = VoiceState::INIT;
    VoiceState state_ba = VoiceState::INIT;

    while (1) {
        // fix: check the read() result — the old code ignored it, so on EAGAIN
        // it acted on a stale state value and busy-spun at 100% CPU.
        ssize_t n = read(pipe_fd, &state, sizeof(state));
        if (n != (ssize_t)sizeof(state)) {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
            continue;
        }
        if (state == VoiceState::RECORDED) {
            printf("breakpoint\n");
            state_ba = VoiceState::DECODE;
            write(pipe_ba, &state_ba, sizeof(state_ba));

            std::string result_json = transcribe_audio("respeaker_output.wav");
            std::cout << "Whisper 推理结果:\n" << result_json << std::endl;

            // fix: json::parse / j["text"] throw on a failed request or a reply
            // without "text", crashing the loop — parse defensively instead.
            std::string text;
            try {
                nlohmann::json j = nlohmann::json::parse(result_json);
                text = j.value("text", "");
            } catch (const nlohmann::json::exception& e) {
                std::cerr << "Failed to parse whisper reply: " << e.what() << std::endl;
            }

            // wake words to look for in the transcript
            std::vector<std::string> keywords = {"hello", "Hello", "super", "Super"};
            //delete respeaker_output.wav
            if (contains_keywords(text, keywords)) {
                std::cout << "One or more keywords matched!" << std::endl;
                const std::string device = "plughw:0,3"; // card 0, device 3
                const std::string file = "activate.mp3";
                std::string cmd = "mpg123 -a " + device + " \"" + file + "\"";
                std::cout << "Executing: " << cmd << std::endl;
                int ret = system(cmd.c_str());
                if (ret != 0) {
                    std::cerr << "mpg123 playback failed!" << std::endl;
                }
                state_ba = VoiceState::RESTART;
                write(pipe_ba, &state_ba, sizeof(state_ba));
            } else {
                std::cout << "No keywords matched." << std::endl;
                state_ba = VoiceState::RESTART;
                write(pipe_ba, &state_ba, sizeof(state_ba));
            }
            state = VoiceState::INIT; // fix: never re-trigger on a stale value
        }
    }
    close(pipe_fd);
    close(pipe_ba);
    return 0;
}