语音LLM应用 - 实验04-多模态图片比较-语音对话

实验准备：
确保已接入火山引擎豆包AI以及讯飞AI（参考实验01、实验02）
寻找图片，作为实验素材。图片导入分为相对路径以及绝对路径，相对路径默认设置为AI_online_voice/assets/sample.jpg（功能包中已添加了默认的相对路径图片，可更改相对路径图片，但命名需为sample.jpg）
实验步骤：（确保语音模块已连接）
cd AI_online_voice #进入主目录
python examples/04_voice_image_comparison.py #运行示例程序
进入程序后根据终端提示，先输入y，进入图片选择，可语音选择绝对路径以及相对路径，绝对路径手动输入图片路径，相对路径默认设置为assets/sample.jpg。
终端运行示例：
图片设置：
图文对比分析：
# -*- coding: utf-8 -*-
"""
04_voice_image_comparison.py

实验04：图片比较 - 语音输入
- 参考实验03：语音选择路径（绝对/相对），相对路径默认 assets/sample.jpg
- 选择图片一与图片二；录音文本与两图一起提交给豆包进行比较分析

指令：
- i1：选择图片一（语音选择绝对/相对路径）
- i2：选择图片二（语音选择绝对/相对路径）
- r [秒数]：录音并提交到豆包进行两图分析（默认5秒）
- p：回放最近一次录音
- h：帮助
- q：退出
"""

import os
import sys
import json
import base64
import wave
from typing import Optional
import importlib.util

# Directory containing this script, and its parent directory (the project root).
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(CURRENT_DIR)
# Make the project root importable; presumably this is what lets the
# `utils` and `config` imports below resolve when the script is run directly.
sys.path.append(PROJECT_ROOT)

from utils.audio_processor import AudioProcessor
import config

# Dynamically import the experiment 03 module to reuse the client classes it
# defines inline. Its file name starts with a digit, so it cannot be loaded
# with a regular `import` statement and is loaded by file path instead.
EXP03_PATH = os.path.join(PROJECT_ROOT, "examples", "03_voice_image_dialogue.py")
spec = importlib.util.spec_from_file_location("exp03", EXP03_PATH)
exp03 = importlib.util.module_from_spec(spec)
# Executes experiment 03's module-level code; raises if the file is missing.
spec.loader.exec_module(exp03)

# Re-export the two client classes under short module-level names.
DoubaoAPIClient = exp03.DoubaoAPIClient
XunfeiRealtimeSpeechClient = exp03.XunfeiRealtimeSpeechClient


class VoiceImageComparisonApp:
    """Console app that compares two images based on a spoken question.

    The user selects up to two images (``i1`` / ``i2``) and records a
    question (``r [seconds]``). The recording is transcribed by the speech
    client and the transcript, together with every selected image, is sent to
    the Doubao client as a single user message.

    Attributes:
        processor: ``AudioProcessor`` used to record, convert and play audio.
        asr: Speech-recognition client class reused from experiment 03.
        doubao: Doubao chat client class reused from experiment 03.
        last_audio: Most recent question recording, or ``None``.
        last_wav: WAV conversion of that recording, or ``None``.
        image_path1: Absolute path of image one, or ``None`` if not selected.
        image_path2: Absolute path of image two, or ``None`` if not selected.
    """

    def __init__(self):
        self.processor = AudioProcessor()
        self.asr = XunfeiRealtimeSpeechClient()
        self.doubao = DoubaoAPIClient()
        self.last_audio: Optional[str] = None
        self.last_wav: Optional[str] = None
        self.image_path1: Optional[str] = None
        self.image_path2: Optional[str] = None

    def _resolve_path(self, p: str, is_absolute: bool = False) -> Optional[str]:
        """Normalise a user-supplied image path to an absolute path.

        Args:
            p: Raw path text. ``~`` is expanded and, on non-Windows systems,
                backslashes are converted to forward slashes.
            is_absolute: When true the path is never joined to the project
                root; a relative value is then resolved against the current
                working directory by ``os.path.abspath``.

        Returns:
            The absolute path, or ``None`` when ``p`` is empty.
        """
        if not p:
            return None
        p = os.path.expanduser(p)
        if os.name != "nt":
            p = p.replace("\\", "/")
        if is_absolute or os.path.isabs(p):
            return os.path.abspath(p)
        return os.path.abspath(os.path.join(PROJECT_ROOT, p))

    def print_help(self):
        """Print the list of supported console commands."""
        print("\n指令帮助：")
        print("  i1       选择图片一（绝对路径手动；相对路径默认 assets/sample.jpg）")
        print("  i2       选择图片二（绝对路径手动；相对路径默认 assets/sample.jpg）")
        print("  r [秒数]  录音并提交两图分析（默认 5 秒）")
        print("  p        回放最近一次录音")
        print("  h        查看帮助")
        print("  q        退出\n")

    @staticmethod
    def _prompt(message: str) -> Optional[str]:
        """Read one stripped line from stdin.

        Returns:
            The stripped text, or ``None`` when the line could not be read
            (for example because stdin was closed).
        """
        try:
            return input(message).strip()
        except Exception as e:
            # Input boundary: report the problem instead of silently
            # returning or letting it escape into the command loop.
            print(f"\n[提示] 读取输入失败，操作取消: {e}")
            return None

    def _recognise_path_type(self, wav_path: str) -> Optional[str]:
        """Transcribe ``wav_path`` and map the speech to a path type.

        Returns:
            ``"abs"`` or ``"rel"`` when the transcript mentions an absolute or
            relative path, otherwise ``None`` (including when recognition
            raises or yields no text).
        """
        try:
            spoken = self.asr.transcribe_audio_ws(wav_path)
        except Exception as e:
            print(f"[识别异常] {e}")
            return None
        if not spoken:
            return None
        spoken = spoken.lower()
        if "绝对" in spoken or "absolute" in spoken:
            return "abs"
        if "相对" in spoken or "relative" in spoken:
            return "rel"
        return None

    @staticmethod
    def _parse_record_duration(cmd: str, default: int = 5) -> Optional[int]:
        """Parse an ``r [seconds]`` command.

        Args:
            cmd: The raw command line typed by the user.
            default: Duration used when no valid positive number is given.

        Returns:
            The recording duration in seconds, or ``None`` when ``cmd`` is not
            a record command (its first word is not exactly ``r``).
        """
        parts = cmd.split()
        if not parts or parts[0].lower() != "r":
            return None
        if len(parts) < 2:
            return default
        try:
            seconds = int(parts[1])
        except ValueError:
            seconds = 0
        if seconds <= 0:
            # Covers both non-numeric text and zero/negative values.
            print(f"[提示] 秒数无效，使用默认 {default} 秒")
            return default
        return seconds

    def _select_image(self, which: int):
        """Interactively choose image one or two and store its path.

        The path type is chosen by voice (5 second recording) with a typed
        fallback. An absolute path is typed by the user; the relative choice
        always uses ``assets/sample.jpg`` under the project root. The image is
        stored only if it is an existing JPG/JPEG/PNG file.

        Args:
            which: ``1`` selects image one; any other value selects image two.
        """
        label = "图片一" if which == 1 else "图片二"
        print(f"[选择{label}] 录音 5 秒选择路径类型（说：绝对路径 或 相对路径；相对路径默认 assets/sample.jpg）")
        audio_file = self.processor.record(5)
        if not audio_file:
            print("[错误] 路径类型录音失败")
            return
        # Fall back to the raw recording when conversion yields nothing.
        wav_path = self.processor.convert_to_wav(audio_file) or audio_file

        choice = self._recognise_path_type(wav_path)
        if not choice:
            print("[提示] 未识别到路径类型。请输入：abs(绝对) 或 rel(相对)")
            typed = self._prompt("路径类型(abs/rel): ")
            if typed is None:
                return
            choice = typed.lower()

        # Anything that does not start with "a" is treated as relative.
        if choice.startswith("a"):
            path_input = self._prompt(f"请输入{label}绝对路径: ")
            if path_input is None:
                return
            final_path = self._resolve_path(path_input, is_absolute=True)
        else:
            rel_default = "assets/sample.jpg"
            print(f"[使用默认相对路径] {rel_default}")
            final_path = self._resolve_path(rel_default, is_absolute=False)

        # isfile (not exists) so that a directory is rejected here instead of
        # failing later when the image is opened for encoding.
        if not final_path or not os.path.isfile(final_path):
            print(f"[错误] 图像文件不存在: {final_path}")
            print("[示例] 绝对: /home/user/pic.jpg | 相对: assets/sample.jpg")
            return
        ext = os.path.splitext(final_path)[1].lower()
        if ext not in (".jpg", ".jpeg", ".png"):
            print("[错误] 仅支持 JPG/JPEG/PNG 格式")
            return
        if which == 1:
            self.image_path1 = final_path
        else:
            self.image_path2 = final_path
        print(f"[已设置{label}] {final_path}")

    def _build_image_content(self, text: str) -> list:
        """Build the multimodal ``content`` list for one user message.

        Args:
            text: The question text placed first in the list.

        Returns:
            A list starting with a ``text`` part, followed by one
            ``image_url`` part (base64 data URL) per selected image, in the
            order image one, image two. Unselected images are skipped.

        Raises:
            OSError: If a selected image can no longer be read.
        """
        content = [{"type": "text", "text": text}]
        for p in (self.image_path1, self.image_path2):
            if not p:
                continue
            ext = os.path.splitext(p)[1].lower()
            mime = "image/jpeg" if ext in (".jpg", ".jpeg") else "image/png"
            with open(p, "rb") as f:
                b64 = base64.b64encode(f.read()).decode("utf-8")
            content.append({"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}})
        return content

    def handle_record(self, duration_sec: int):
        """Record a question, transcribe it and ask Doubao to compare the images.

        Every failure (recording, conversion, recognition, Doubao request) is
        reported on the console and ends the operation without raising, so the
        command loop keeps running.

        Args:
            duration_sec: Recording length in seconds.
        """
        print(f"[操作] 开始录音 {duration_sec} 秒…")
        audio_file = self.processor.record(duration_sec)
        if not audio_file:
            print("[错误] 录音失败")
            return
        self.last_audio = audio_file
        try:
            with wave.open(audio_file, "rb") as wf:
                print(f"[音频信息] rate={wf.getframerate()}, ch={wf.getnchannels()}, bits={wf.getsampwidth()*8}")
        except Exception:
            # Best-effort diagnostics only: the raw recording is not
            # guaranteed to be a readable WAV file, and failing to print its
            # parameters must not abort the request.
            pass
        wav_path = self.processor.convert_to_wav(audio_file)
        if not wav_path:
            print("[错误] 转换 WAV 失败")
            return
        self.last_wav = wav_path
        print("[识别] 讯飞实时识别…")
        try:
            text = self.asr.transcribe_audio_ws(wav_path)
        except Exception as e:
            # Same handling as in image selection: a recognition error must
            # not terminate the interactive loop.
            print(f"[识别异常] {e}")
            return
        if not text:
            print("[识别失败] 未获取到文本")
            return
        print(f"[识别结果] {text}")

        selected = [p for p in (self.image_path1, self.image_path2) if p]
        if len(selected) < 2:
            # The request is still sent; the user is only told that it will
            # not contain two images.
            print(f"[提示] 当前仅选择了 {len(selected)} 张图片；可使用 i1/i2 选择后再比较")

        print("[豆包] 提交两图比较分析…")
        try:
            # Optional system prompt taken from experiment 03's ROOT_CONFIG,
            # when that module exposes one.
            sys_prompt = getattr(exp03, "ROOT_CONFIG", None)
            sys_prompt = getattr(sys_prompt, "SYSTEM_PROMPT", None) if sys_prompt else None
            messages = []
            if sys_prompt:
                messages.append({"role": "system", "content": sys_prompt})
            messages.append({"role": "user", "content": self._build_image_content(text)})
            result = self.doubao._make_request(messages)
            if result and result.get("choices"):
                print("[豆包回复]", result["choices"][0]["message"]["content"])
            else:
                print("[豆包回复] None")
        except Exception as e:
            print("[豆包错误]", e)

    def handle_play(self):
        """Play back the most recent question recording, if there is one."""
        if not self.last_audio:
            print("[提示] 尚无可回放的录音。请先使用 r 指令录音。")
            return
        print("[播放] 回放最近一次录音…")
        self.processor.play(self.last_audio)

    def run(self):
        """Run the interactive session until the user quits.

        First offers to select image one and image two, then reads commands
        (``i1``, ``i2``, ``r [seconds]``, ``p``, ``h``, ``q``; case-insensitive)
        in a loop. End-of-input or Ctrl-C at a prompt exits cleanly.
        """
        print("\n=== 04 图片比较（语音选择两图 + 讯飞 + 豆包）实验 ===")
        self.print_help()
        setup_questions = (
            (1, "是否先选择图片一? (y/n): "),
            (2, "是否选择图片二? (y/n): "),
        )
        try:
            for which, question in setup_questions:
                if input(question).strip().lower().startswith("y"):
                    self._select_image(which)
        except (EOFError, KeyboardInterrupt):
            print("\n[退出]")
            return
        except Exception as e:
            # Initial selection is optional; report why it was skipped and
            # continue to the command loop, where i1/i2 can be retried.
            print(f"[提示] 初始图片选择已跳过: {e}")

        actions = {
            "h": self.print_help,
            "p": self.handle_play,
            "i1": lambda: self._select_image(1),
            "i2": lambda: self._select_image(2),
        }
        while True:
            try:
                cmd = input("请输入指令 (i1/i2/r/p/h/q): ").strip()
            except (EOFError, KeyboardInterrupt):
                print("\n[退出]")
                break
            if not cmd:
                continue
            name = cmd.lower()
            if name == "q":
                print("[退出]")
                break
            action = actions.get(name)
            if action is not None:
                action()
                continue
            # Only an exact leading "r" word is a record command; other words
            # that merely start with "r" fall through to the unknown hint.
            duration = self._parse_record_duration(cmd)
            if duration is not None:
                self.handle_record(duration)
                continue
            print("[提示] 未知指令。输入 h 查看帮助。")

if __name__ == "__main__":
    # Script entry point: build the app, then hand control to its command loop.
    app = VoiceImageComparisonApp()
    app.run()