Jetson 音频/语音处理:Whisper 语音识别与 TTS
2026/7/4 8:16:57 网站建设 项目流程

Jetson 音频/语音处理:Whisper 语音识别与 TTS

1. Jetson 音频硬件

# List the audio devices
arecord -l   # capture devices
aplay -l     # playback devices

# USB microphone (recommended): record 5 s of 16 kHz mono S16_LE, then play it back
arecord -D plughw:1,0 -f S16_LE -r 16000 -c 1 -d 5 test.wav
aplay test.wav

# Install the audio tools
sudo apt install -y alsa-utils pulseaudio portaudio19-dev
pip3 install pyaudio sounddevice

2. Whisper 语音识别

2.1 安装

# Build whisper.cpp (C++ implementation, better performance)
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp
make -j$(nproc)

# Download a model
bash ./models/download-ggml-model.sh base.en

# Install a Python Whisper package
pip3 install openai-whisper
# or faster-whisper (recommended)
pip3 install faster-whisper

2.2 实时语音识别

#!/usr/bin/env python3
"""whisper_realtime.py - real-time speech recognition."""
import contextlib
import queue
import threading
import time

import numpy as np
import pyaudio
from faster_whisper import WhisperModel

# Number of frames requested from the microphone per read call.
FRAMES_PER_BUFFER = 1024


class RealtimeWhisper:
    """Microphone speech recognizer built on faster-whisper.

    ``start()`` launches two daemon threads: one records fixed-length audio
    chunks into ``audio_queue`` and one transcribes them and prints each
    non-empty segment. ``recognize_once()`` is a blocking alternative that
    records a single chunk and returns its text.

    Args:
        model_size: Model size name passed to ``WhisperModel``.
        device: Inference device passed to ``WhisperModel``.
        compute_type: Optional compute type. When omitted, ``"float16"`` is
            used on ``"cuda"`` and ``"int8"`` on any other device, because
            float16 is rejected on devices that do not support it.
    """

    def __init__(self, model_size="base", device="cuda", compute_type=None):
        if compute_type is None:
            compute_type = "float16" if device == "cuda" else "int8"
        self.model = WhisperModel(
            model_size, device=device, compute_type=compute_type
        )
        self.audio_queue = queue.Queue()
        self.sample_rate = 16000
        self.chunk_duration = 3  # transcribe once every 3 seconds
        self.running = False
        self.record_thread = None
        self.transcribe_thread = None

    def start(self):
        """Start the background recording and transcription threads."""
        if self.running:
            return  # already started; do not spawn a second pair of threads
        self.running = True

        self.record_thread = threading.Thread(
            target=self._record_loop, daemon=True
        )
        self.record_thread.start()

        self.transcribe_thread = threading.Thread(
            target=self._transcribe_loop, daemon=True
        )
        self.transcribe_thread.start()

    @contextlib.contextmanager
    def _input_stream(self):
        """Open a mono float32 input stream and always release it on exit."""
        pa = pyaudio.PyAudio()
        try:
            stream = pa.open(
                format=pyaudio.paFloat32,
                channels=1,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=FRAMES_PER_BUFFER,
            )
            try:
                yield stream
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            pa.terminate()

    def _read_chunk(self, stream, num_samples, keep_going=None):
        """Read from ``stream`` until at least ``num_samples`` are collected.

        Args:
            stream: An open PyAudio input stream producing float32 samples.
            num_samples: Minimum number of samples to collect.
            keep_going: Optional zero-argument callable checked before every
                read; when it returns false the read is abandoned.

        Returns:
            A float32 ``numpy`` array, or ``None`` if ``keep_going`` asked to
            stop before the chunk was complete.
        """
        blocks = []
        collected = 0
        while True:
            if keep_going is not None and not keep_going():
                return None
            # Dropping overflowed input is preferable to killing the thread.
            raw = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
            block = np.frombuffer(raw, dtype=np.float32)
            blocks.append(block)
            collected += block.size
            if collected >= num_samples:
                # One concatenation instead of growing a per-sample list.
                return np.concatenate(blocks)

    def _record_loop(self):
        """Record chunks into ``audio_queue`` while ``running`` is true."""
        chunk_size = int(self.sample_rate * self.chunk_duration)
        with self._input_stream() as stream:
            while self.running:
                chunk = self._read_chunk(
                    stream, chunk_size, keep_going=lambda: self.running
                )
                if chunk is not None:
                    self.audio_queue.put(chunk)

    def _transcribe(self, audio):
        """Run the model on ``audio`` and return its segment iterable."""
        segments, _info = self.model.transcribe(
            audio, beam_size=5, language="zh", vad_filter=True
        )
        return segments

    def _transcribe_loop(self):
        """Transcribe queued chunks and print every non-empty segment."""
        while self.running:
            try:
                audio = self.audio_queue.get(timeout=0.5)
            except queue.Empty:
                continue
            for segment in self._transcribe(audio):
                text = segment.text.strip()
                if text:
                    print(f"[{segment.start:.1f}s-{segment.end:.1f}s]{text}")

    def recognize_once(self, duration=None):
        """Record one chunk from the microphone and return its transcript.

        This call blocks and opens its own input stream, so it is meant to be
        used instead of ``start()``, not while the background threads run.

        Args:
            duration: Seconds of audio to record; defaults to
                ``chunk_duration``.

        Returns:
            The stripped text of all recognized segments joined together
            (an empty string when nothing was recognized).
        """
        seconds = self.chunk_duration if duration is None else duration
        with self._input_stream() as stream:
            audio = self._read_chunk(stream, int(self.sample_rate * seconds))
        return "".join(seg.text.strip() for seg in self._transcribe(audio))

    def stop(self):
        """Stop the background threads and wait briefly for them to finish."""
        self.running = False
        # Joining lets the record thread close the audio device before exit;
        # daemon threads would otherwise be killed without running cleanup.
        for worker in (self.record_thread, self.transcribe_thread):
            if worker is not None and worker is not threading.current_thread():
                worker.join(timeout=5)
        self.record_thread = None
        self.transcribe_thread = None


if __name__ == "__main__":
    whisper = RealtimeWhisper(model_size="base", device="cuda")
    whisper.start()
    try:
        while True:
            time.sleep(0.5)  # idle without spinning a CPU core
    except KeyboardInterrupt:
        pass
    finally:
        whisper.stop()

2.3 Whisper TensorRT 加速

#!/usr/bin/env python3
"""whisper_trt.py - GPU-accelerated Whisper inference.

NOTE: despite the file and function names, no TensorRT engine is built here;
acceleration comes from faster-whisper's CTranslate2 backend running in FP16.
"""
import tensorrt as trt
import numpy as np


def convert_whisper_to_trt(whisper_model_path, trt_engine_path):
    """Load a Whisper model for FP16 inference on CUDA.

    whisper.cpp already supports CUDA acceleration; this helper instead uses
    the CTranslate2 backend of faster-whisper.

    Args:
        whisper_model_path: Model identifier handed to ``WhisperModel``.
            Falls back to ``"base"`` when empty or ``None``.
            NOTE(review): presumably this must be a faster-whisper size name
            or a converted CTranslate2 model directory - confirm with callers.
        trt_engine_path: Accepted for interface compatibility only; nothing
            is written to this path because no TensorRT engine is produced.

    Returns:
        The loaded ``WhisperModel`` instance.
    """
    from faster_whisper import WhisperModel

    model = WhisperModel(
        whisper_model_path or "base",
        device="cuda",
        compute_type="float16",  # FP16 inference
        cpu_threads=4,
    )
    return model


# Performance comparison (Orin NX 16GB):
# ┌─────────────┬──────────┬──────────┐
# │ Model       │ FP32     │ FP16     │
# ├─────────────┼──────────┼──────────┤
# │ tiny        │ 15x      │ 25x      │
# │ base        │ 8x       │ 15x      │
# │ small       │ 3x       │ 6x       │
# │ medium      │ 1x       │ 2.5x     │
# └─────────────┴──────────┴──────────┘
# * Values are real-time factors (>1x means faster than real time)

3. TTS 语音合成

#!/usr/bin/env python3
"""tts_jetson.py - speech synthesis."""
import contextlib
import os
import subprocess
import tempfile

from TTS.api import TTS


class JetsonTTS:
    """Text-to-speech helper for Jetson.

    Args:
        model_name: Name of the TTS model to load.
        device: Device the model is moved to (defaults to ``"cuda"``).
    """

    def __init__(
        self,
        model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST",
        device="cuda",
    ):
        self.tts = TTS(model_name).to(device)

    def synthesize(self, text, output_path="output.wav"):
        """Synthesize ``text`` into a WAV file at ``output_path``."""
        self.tts.tts_to_file(text=text, file_path=output_path)
        print(f"语音已保存:{output_path}")

    def speak(self, text):
        """Synthesize ``text`` and play it immediately through ``aplay``.

        The audio goes to a unique temporary file rather than a fixed path
        under /tmp, so concurrent callers cannot overwrite each other's
        output, and the file is removed once playback has finished.
        """
        fd, wav_path = tempfile.mkstemp(prefix="tts_output_", suffix=".wav")
        os.close(fd)  # the TTS library reopens the path itself
        try:
            self.synthesize(text, wav_path)
            # Playback stays best-effort: a failing aplay does not raise.
            subprocess.run(["aplay", wav_path], check=False)
        finally:
            with contextlib.suppress(FileNotFoundError):
                os.remove(wav_path)


if __name__ == "__main__":
    tts = JetsonTTS()
    tts.speak("你好,我是 Jetson 语音助手")

4. 语音唤醒词检测

#!/usr/bin/env python3
"""wake_word.py - wake-word detection."""
import struct

import pvporcupine
import pyaudio

# Built-in Porcupine keyword used when no custom keyword files are given.
DEFAULT_KEYWORD = "porcupine"
DEFAULT_SENSITIVITY = 0.5


class WakeWordDetector:
    """Wake-word detector backed by Porcupine.

    Args:
        keyword_paths: Optional list of custom keyword file paths. When
            omitted, the built-in ``"porcupine"`` keyword is used, because
            Porcupine needs either keywords or keyword paths to be set.
        sensitivities: Optional per-keyword sensitivities. Defaults to 0.5
            for every keyword so the list length always matches.
        access_key: Picovoice access key; the default is a placeholder that
            must be replaced with a real key.
    """

    def __init__(
        self,
        keyword_paths=None,
        sensitivities=None,
        access_key="YOUR_ACCESS_KEY",
    ):
        if keyword_paths:
            selection = {"keyword_paths": list(keyword_paths)}
            keyword_count = len(selection["keyword_paths"])
        else:
            selection = {"keywords": [DEFAULT_KEYWORD]}
            keyword_count = 1

        if not sensitivities:
            sensitivities = [DEFAULT_SENSITIVITY] * keyword_count

        self.porcupine = pvporcupine.create(
            access_key=access_key,
            sensitivities=sensitivities,
            **selection,
        )

        self.pa = pyaudio.PyAudio()
        try:
            self.stream = self.pa.open(
                rate=self.porcupine.sample_rate,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                frames_per_buffer=self.porcupine.frame_length,
            )
        except Exception:
            # Do not leak the engine or PortAudio if the device cannot open.
            self.pa.terminate()
            self.porcupine.delete()
            raise

    def listen(self):
        """Block until a wake word is heard and return its keyword index."""
        print("等待唤醒词...")
        frame_length = self.porcupine.frame_length
        # Pre-compiled equivalent of the format "h" * frame_length.
        frame_format = struct.Struct(f"{frame_length}h")

        while True:
            # Dropping overflowed input is preferable to aborting the loop.
            raw = self.stream.read(frame_length, exception_on_overflow=False)
            pcm = frame_format.unpack_from(raw)

            keyword_index = self.porcupine.process(pcm)
            if keyword_index >= 0:
                print(f"唤醒词检测到!索引:{keyword_index}")
                return keyword_index

    def cleanup(self):
        """Close the audio stream and release PortAudio and Porcupine."""
        self.stream.close()
        self.pa.terminate()
        self.porcupine.delete()

5. 完整语音助手

#!/usr/bin/env python3
"""voice_assistant.py - Jetson voice assistant."""
import queue
import threading
from datetime import datetime


class VoiceAssistant:
    """Voice assistant: wake word -> speech recognition -> reply -> TTS.

    Every collaborator can be injected; any that is omitted is built with its
    default constructor from the sibling script that defines it
    (``wake_word.py``, ``whisper_realtime.py``, ``tts_jetson.py``).

    Args:
        wake_detector: Object exposing ``listen()``.
        whisper: Object exposing ``recognize_once()`` that returns the
            recognized text.
        tts: Object exposing ``speak(text)``.
    """

    def __init__(self, wake_detector=None, whisper=None, tts=None):
        # Imports are deferred so this module can be imported (and
        # process_command exercised) without audio hardware or a GPU.
        if wake_detector is None:
            from wake_word import WakeWordDetector

            wake_detector = WakeWordDetector()
        if whisper is None:
            from whisper_realtime import RealtimeWhisper

            whisper = RealtimeWhisper(model_size="base")
        if tts is None:
            from tts_jetson import JetsonTTS

            tts = JetsonTTS()

        self.wake_detector = wake_detector
        self.whisper = whisper
        self.tts = tts
        self.command_queue = queue.Queue()

    def run(self):
        """Loop forever: wait for the wake word, listen, answer, speak."""
        print("语音助手已启动,等待唤醒词...")

        while True:
            # Wait for the wake word
            self.wake_detector.listen()
            print("已唤醒,请说话...")

            # Speech recognition
            text = self.whisper.recognize_once()
            print(f"识别结果:{text}")

            # Handle the command
            response = self.process_command(text)
            print(f"回复:{response}")

            # Speak the reply
            self.tts.speak(response)

    def process_command(self, text):
        """Map recognized text to a reply string.

        Args:
            text: Recognized utterance; ``None`` or an empty string is
                treated as "nothing understood".

        Returns:
            The reply to speak.
        """
        text = (text or "").lower()

        if "天气" in text:
            return "今天天气晴朗,温度 25 度"
        if "时间" in text:
            return f"现在时间是{datetime.now().strftime('%H:%M')}"
        if "拍照" in text:
            return "已拍照保存"
        return "抱歉,我没有听懂"


if __name__ == "__main__":
    assistant = VoiceAssistant()
    try:
        assistant.run()
    except KeyboardInterrupt:
        pass
    finally:
        # Release the microphone and the wake-word engine on exit.
        assistant.wake_detector.cleanup()

总结

| 功能 | 方案 | 延迟 |
| --- | --- | --- |
| 语音识别 | faster-whisper (base) | <1s |
| 语音合成 | TTS (tacotron2) | <2s |
| 唤醒词 | Porcupine | <100ms |
| 实时转写 | whisper.cpp + VAD | <3s |

核心要点:

  1. faster-whisper:相比 OpenAI Whisper 最高可快约 4 倍,并支持 FP16 推理
  2. GPU 加速:Whisper 和 TTS 都可以用 GPU 推理
  3. VAD 过滤:语音活动检测减少无效推理
  4. Porcupine:低功耗唤醒词检测,适合常开场景

需要专业的网站建设服务?

联系我们获取免费的网站建设咨询和方案报价,让我们帮助您实现业务目标

立即咨询