语音LLM应用
实验04-多模态图片比较-语音对话
实验准备:
- 确保已接入火山引擎豆包AI以及讯飞AI(参考实验01、实验02)
- 寻找图片,作为实验素材。图片导入分为相对路径和绝对路径两种方式,相对路径默认设置为 AI_online_voice/assets/sample.jpg(功能包中已添加默认的相对路径图片;可替换该相对路径图片,但文件名需保持为 sample.jpg)。
实验步骤:(确保语音模块已连接)
cd AI_online_voice  # 进入主目录
python examples/04_voice_image_comparison.py  # 运行示例程序
- 进入程序后根据终端提示,先输入 y,进入图片选择;可通过语音选择绝对路径或相对路径:绝对路径需手动输入图片路径,相对路径默认设置为 assets/sample.jpg。
终端运行示例:
图片设置:

图文对比分析:

# -*- coding: utf-8 -*-
"""
04_voice_image_comparison.py
实验04:图片比较 - 语音输入
- 参考实验03:语音选择路径(绝对/相对),相对路径默认 assets/sample.jpg
- 选择图片一与图片二;录音文本与两图一起提交给豆包进行比较分析
指令:
- i1:选择图片一(语音选择绝对/相对路径)
- i2:选择图片二(语音选择绝对/相对路径)
- r [秒数]:录音并提交到豆包进行两图分析(默认5秒)
- p:回放最近一次录音
- h:帮助
- q:退出
"""
import os
import sys
import json
import base64
import wave
from typing import Optional
import importlib.util
# Directory containing this script, and its parent directory, which is treated
# as the project root for imports and for resolving relative image paths.
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(CURRENT_DIR)
# Make the project root importable so the `utils` and `config` imports below
# resolve regardless of the directory the script is launched from.
sys.path.append(PROJECT_ROOT)
from utils.audio_processor import AudioProcessor
import config
# Dynamically load the experiment-03 module to reuse the client classes it
# defines inline. Its file name starts with a digit, so it is not a valid
# identifier for a regular `import` statement and is loaded by path instead.
EXP03_PATH = os.path.join(PROJECT_ROOT, "examples", "03_voice_image_dialogue.py")
spec = importlib.util.spec_from_file_location("exp03", EXP03_PATH)
exp03 = importlib.util.module_from_spec(spec)
# Executes the experiment-03 module body; any top-level code in it runs here.
spec.loader.exec_module(exp03)
# Re-export the two client classes under local names.
DoubaoAPIClient = exp03.DoubaoAPIClient
XunfeiRealtimeSpeechClient = exp03.XunfeiRealtimeSpeechClient
class VoiceImageComparisonApp:
    """Interactive console app: pick two images, record a spoken prompt, compare.

    Workflow driven by :meth:`run`:

    * ``i1`` / ``i2`` select image one / two (the path type is chosen by voice,
      with a typed fallback);
    * ``r [seconds]`` records audio, transcribes it with the speech client and
      sends the transcript together with the selected images to the Doubao
      client;
    * ``p`` replays the most recent recording, ``h`` prints help, ``q`` quits.

    The collaborators (``AudioProcessor``, ``XunfeiRealtimeSpeechClient``,
    ``DoubaoAPIClient``) are defined outside this class; only the methods
    called on them here are relied upon.
    """

    # Recording length (seconds) used when ``r`` is given without a valid value.
    DEFAULT_RECORD_SECONDS = 5
    # Image used when the "relative path" option is chosen (under PROJECT_ROOT).
    DEFAULT_RELATIVE_IMAGE = "assets/sample.jpg"
    # Lower-cased file extensions accepted as images.
    SUPPORTED_EXTENSIONS = (".jpg", ".jpeg", ".png")

    def __init__(self):
        """Create the audio, speech-recognition and Doubao helpers; no image is selected yet."""
        self.processor = AudioProcessor()
        self.asr = XunfeiRealtimeSpeechClient()
        self.doubao = DoubaoAPIClient()
        # Most recent raw recording (used by `p`) and its WAV conversion.
        self.last_audio: Optional[str] = None
        self.last_wav: Optional[str] = None
        # Absolute paths of the two images to compare; None until selected.
        self.image_path1: Optional[str] = None
        self.image_path2: Optional[str] = None

    def _resolve_path(self, p: str, is_absolute: bool = False) -> Optional[str]:
        """Turn user input into an absolute path.

        Args:
            p: Path as typed by the user; ``~`` is expanded.
            is_absolute: When True the path is resolved as-is (relative input is
                then resolved against the current working directory); otherwise
                non-absolute input is resolved against ``PROJECT_ROOT``.

        Returns:
            The absolute path, or None when ``p`` is empty.
        """
        if not p:
            return None
        p = os.path.expanduser(p)
        if os.name != "nt":
            # Tolerate Windows-style separators pasted on POSIX systems.
            p = p.replace("\\", "/")
        if is_absolute or os.path.isabs(p):
            return os.path.abspath(p)
        return os.path.abspath(os.path.join(PROJECT_ROOT, p))

    def print_help(self):
        """Print the list of console commands."""
        print("\n指令帮助:")
        print(" i1 选择图片一(绝对路径手动;相对路径默认 assets/sample.jpg)")
        print(" i2 选择图片二(绝对路径手动;相对路径默认 assets/sample.jpg)")
        print(" r [秒数] 录音并提交两图比较分析(默认 5 秒)")
        print(" p 回放最近一次录音")
        print(" h 查看帮助")
        print(" q 退出\n")

    def _ask_path_type_by_voice(self, label: str) -> Optional[str]:
        """Record 5 seconds and map the transcript to ``"abs"``/``"rel"`` (None if unclear)."""
        print(f"[{label}选择] 录音 5 秒选择路径类型(说:绝对路径 或 相对路径;相对路径默认 assets/sample.jpg)")
        audio_file = self.processor.record(5)
        if not audio_file:
            print("[错误] 路径类型录音失败")
            return ""
        # Fall back to the raw recording when conversion yields nothing.
        wav_path = self.processor.convert_to_wav(audio_file) or audio_file
        selection_text = None
        try:
            selection_text = self.asr.transcribe_audio_ws(wav_path)
        except Exception as e:
            # Recognition is best-effort here; the typed fallback takes over.
            print(f"[识别异常] {e}")
        if selection_text:
            t = selection_text.lower()
            if ("绝对" in t) or ("absolute" in t):
                return "abs"
            if ("相对" in t) or ("relative" in t):
                return "rel"
        return None

    def _select_image(self, which: int):
        """Interactively choose image ``which`` (1 or 2) and store its path.

        The path type is asked by voice first; if it cannot be recognised the
        user types ``abs`` or ``rel`` (anything not starting with ``a`` means
        relative). Absolute paths are typed in; the relative option always uses
        ``assets/sample.jpg``. The selection is kept only if the file exists and
        has a JPG/JPEG/PNG extension; otherwise the previous selection stays.

        Raises:
            ValueError: If ``which`` is neither 1 nor 2.
        """
        if which not in (1, 2):
            # Explicit check instead of `assert`, which is stripped under -O.
            raise ValueError(f"which must be 1 or 2, got {which!r}")
        label = "图片一" if which == 1 else "图片二"

        choice = self._ask_path_type_by_voice(label)
        if choice == "":
            # Recording failed; the error has already been reported.
            return
        try:
            if not choice:
                print("[提示] 未识别到路径类型。请输入:abs(绝对) 或 rel(相对)")
                choice = input("路径类型(abs/rel): ").strip().lower()
            if choice.startswith("a"):
                path_input = input(f"请输入{label}绝对路径: ").strip()
                final_path = self._resolve_path(path_input, is_absolute=True)
            else:
                rel_default = self.DEFAULT_RELATIVE_IMAGE
                print(f"[使用默认相对路径] {rel_default}")
                final_path = self._resolve_path(rel_default, is_absolute=False)
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C cancels this selection instead of crashing.
            print(f"\n[提示] 已取消{label}选择")
            return

        # isfile (not exists) so that a directory is never accepted as an image.
        if not final_path or not os.path.isfile(final_path):
            print(f"[错误] 图像文件不存在: {final_path}")
            print("[示例] 绝对: /home/user/pic.jpg | 相对: assets/sample.jpg")
            return
        ext = os.path.splitext(final_path)[1].lower()
        if ext not in self.SUPPORTED_EXTENSIONS:
            print("[错误] 仅支持 JPG/JPEG/PNG 格式")
            return
        if which == 1:
            self.image_path1 = final_path
        else:
            self.image_path2 = final_path
        print(f"[已设置{label}] {final_path}")

    def _build_image_content(self, text: str) -> list:
        """Build the user-message content: the text part plus each selected image.

        Images are embedded as base64 ``data:`` URLs in ``image_url`` parts, in
        the order image one, image two; unset images are skipped.

        Raises:
            OSError: If a selected image can no longer be read.
        """
        content = [{"type": "text", "text": text}]
        for p in (self.image_path1, self.image_path2):
            if not p:
                continue
            ext = os.path.splitext(p)[1].lower()
            mime = "image/jpeg" if ext in (".jpg", ".jpeg") else "image/png"
            with open(p, "rb") as f:
                b64 = base64.b64encode(f.read()).decode("utf-8")
            content.append({"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}})
        return content

    def handle_record(self, duration_sec: int):
        """Record, transcribe, and submit the transcript with the images to Doubao.

        Args:
            duration_sec: Recording length in seconds; must be positive.

        Every failure (invalid duration, recording, conversion, recognition,
        Doubao request) is reported on the console and ends the command without
        raising, so the command loop keeps running.
        """
        if duration_sec <= 0:
            print(f"[错误] 录音秒数必须为正数: {duration_sec}")
            return
        if not (self.image_path1 and self.image_path2):
            # Still allowed (matches earlier behaviour), but tell the user.
            print("[提示] 尚未同时设置图片一和图片二,本次仅会提交已选择的图片(可用 i1/i2 选择)")

        print(f"[操作] 开始录音 {duration_sec} 秒…")
        audio_file = self.processor.record(duration_sec)
        if not audio_file:
            print("[错误] 录音失败")
            return
        self.last_audio = audio_file
        try:
            with wave.open(audio_file, "rb") as wf:
                print(f"[音频信息] rate={wf.getframerate()}, ch={wf.getnchannels()}, bits={wf.getsampwidth()*8}")
        except Exception:
            # Purely informational: the recording may not be a WAV file yet.
            pass

        wav_path = self.processor.convert_to_wav(audio_file)
        if not wav_path:
            print("[错误] 转换 WAV 失败")
            return
        self.last_wav = wav_path

        print("[识别] 讯飞实时识别…")
        try:
            text = self.asr.transcribe_audio_ws(wav_path)
        except Exception as e:
            # Same guard as in image selection: a recognition error must not
            # terminate the interactive loop.
            print(f"[识别异常] {e}")
            return
        if not text:
            print("[识别失败] 未获取到文本")
            return
        print(f"[识别结果] {text}")

        print("[豆包] 提交两图比较分析…")
        try:
            # Reuse experiment 03's system prompt when that module exposes one.
            sys_prompt = getattr(exp03, "ROOT_CONFIG", None)
            sys_prompt = getattr(sys_prompt, "SYSTEM_PROMPT", None) if sys_prompt else None
            messages = []
            if sys_prompt:
                messages.append({"role": "system", "content": sys_prompt})
            messages.append({"role": "user", "content": self._build_image_content(text)})
            result = self.doubao._make_request(messages)
            if result and result.get("choices"):
                print("[豆包回复]", result["choices"][0]["message"]["content"])
            else:
                print("[豆包回复] None")
        except Exception as e:
            print("[豆包错误]", e)

    def handle_play(self):
        """Replay the most recent recording, or print a hint if there is none."""
        if not self.last_audio:
            print("[提示] 尚无可回放的录音。请先使用 r 指令录音。")
            return
        print("[播放] 回放最近一次录音…")
        self.processor.play(self.last_audio)

    def _parse_record_command(self, parts: list) -> Optional[int]:
        """Return the duration for an ``r`` command, or None if it is not one.

        Accepts ``r``, ``r <seconds>`` and the compact form ``r<seconds>``.
        A missing value yields the default; a non-integer or non-positive value
        prints a hint and also yields the default.
        """
        verb = parts[0]
        if verb == "r":
            raw = parts[1] if len(parts) >= 2 else None
        elif verb.startswith("r") and verb[1:].isdigit():
            raw = verb[1:]
        else:
            # e.g. "rm" or "reset": not a record command.
            return None
        if raw is None:
            return self.DEFAULT_RECORD_SECONDS
        try:
            seconds = int(raw)
        except ValueError:
            seconds = 0
        if seconds <= 0:
            print(f"[提示] 秒数无效,使用默认 {self.DEFAULT_RECORD_SECONDS} 秒")
            return self.DEFAULT_RECORD_SECONDS
        return seconds

    def run(self):
        """Run the interactive console until ``q``, end of input, or Ctrl-C.

        Offers to select both images first, then reads commands in a loop.
        Commands are case-insensitive.
        """
        print("\n=== 04 图片比较(语音选择两图 + 讯飞 + 豆包)实验 ===")
        self.print_help()
        try:
            first = input("是否先选择图片一? (y/n): ").strip().lower()
            if first.startswith("y"):
                self._select_image(1)
            second = input("是否选择图片二? (y/n): ").strip().lower()
            if second.startswith("y"):
                self._select_image(2)
        except (EOFError, KeyboardInterrupt):
            print("\n[退出]")
            return
        except Exception as e:
            # The initial selection is optional: report and go on to the menu.
            print(f"[提示] 初始图片选择失败: {e}")

        while True:
            try:
                cmd = input("请输入指令 (i1/i2/r/p/h/q): ").strip().lower()
            except (EOFError, KeyboardInterrupt):
                print("\n[退出]")
                break
            if not cmd:
                continue
            if cmd == "q":
                print("[退出]")
                break
            if cmd == "h":
                self.print_help()
                continue
            if cmd == "p":
                self.handle_play()
                continue
            if cmd == "i1":
                self._select_image(1)
                continue
            if cmd == "i2":
                self._select_image(2)
                continue
            duration = self._parse_record_command(cmd.split())
            if duration is not None:
                self.handle_record(duration)
                continue
            print("[提示] 未知指令。输入 h 查看帮助。")
if __name__ == "__main__":
    # Start the interactive console only when executed as a script, so that
    # importing this module does not launch the app.
    VoiceImageComparisonApp().run()