feat: ✨ 语音识别

2026-01-01 21:34:51 +08:00 · 2026-01-01 21:34:51 +08:00 · 35c9b9eb58
commit 35c9b9eb58
parent 48fe2f37ae
3 changed files with 370 additions and 0 deletions
--- a/src/Module/asr/asr.py
+++ b/src/Module/asr/asr.py
@ -0,0 +1,238 @@
+"""
+语音识别模块 - 基于阿里云 DashScope 通义千问 ASR
+支持音频文件识别
+"""
+
+import os
+from typing import Optional
+from pathlib import Path
+from dataclasses import dataclass
+from dotenv import load_dotenv
+
+import dashscope
+from dashscope import MultiModalConversation
+
+from . import asrconfig as config
+
+
+@dataclass
+class ASRResult:
+    """Outcome of a single recognition request."""
+    text: str                       # recognized text
+    success: bool = True            # whether recognition succeeded
+    error: Optional[str] = None     # error message, if any
+    request_id: Optional[str] = None  # request ID
+
+
+class ASR:
+    """
+    Speech recognizer built on DashScope's MultiModalConversation API.
+
+    Usage:
+        asr = ASR()
+        result = asr.recognize("audio.wav")
+        print(result.text)
+    """
+
+    # Extensions accepted for local files; compared lower-cased and
+    # without the leading dot.
+    SUPPORTED_FORMATS = config.SUPPORTED_FORMATS
+
+    def __init__(
+        self,
+        model: Optional[str] = None,
+        language: Optional[str] = None,
+        enable_itn: Optional[bool] = None,
+        context: Optional[str] = None,
+    ):
+        """
+        Initialise the recognizer.
+
+        Args:
+            model: Model name ('qwen3-asr-flash' or
+                'qwen3-asr-flash-filetrans'); a falsy value falls back to
+                config.MODEL.
+            language: Language code ('zh', 'en', ...); None falls back to
+                config.LANGUAGE.
+            enable_itn: Whether to enable ITN; None falls back to
+                config.ENABLE_ITN.
+            context: Context text sent as the system message; defaults
+                to an empty string.
+
+        Raises:
+            ValueError: If DASHSCOPE_API_KEY is not available in the
+                environment after the .env lookup.
+        """
+        self._load_api_key()
+
+        self.model = model or config.MODEL
+        self.language = language if language is not None else config.LANGUAGE
+        self.enable_itn = enable_itn if enable_itn is not None else config.ENABLE_ITN
+        self.context = context or ""
+
+        # This assigns an attribute on the dashscope module itself, so it
+        # is shared by everything in the process using that module.
+        dashscope.base_http_api_url = config.API_URL
+
+    def _load_api_key(self) -> None:
+        """
+        Load a .env file if one is found and check that the API key is set.
+
+        The search starts in this file's directory and walks upwards,
+        examining five directories in total; the first .env found is
+        loaded and the search stops.
+
+        Raises:
+            ValueError: If DASHSCOPE_API_KEY is missing or empty.
+        """
+        current_dir = Path(__file__).parent
+        for _ in range(5):
+            env_path = current_dir / '.env'
+            if env_path.exists():
+                load_dotenv(env_path)
+                break
+            current_dir = current_dir.parent
+
+        api_key = os.environ.get('DASHSCOPE_API_KEY')
+        if not api_key:
+            raise ValueError('未找到 DASHSCOPE_API_KEY')
+
+    def recognize(
+        self,
+        audio_path: str,
+        language: Optional[str] = None,
+        context: Optional[str] = None,
+    ) -> ASRResult:
+        """
+        Recognize an audio file.
+
+        Args:
+            audio_path: Local file path, http(s) URL or file:// URI.
+            language: Per-call override of the configured language.
+            context: Per-call override of the configured context.
+
+        Returns:
+            ASRResult: The recognition result. Exceptions raised while
+            preparing the input or calling the API are not propagated;
+            they are reported as success=False with the message in
+            ``error``.
+        """
+        try:
+            # Validate the input and turn it into a URI for the API.
+            audio_uri = self._prepare_audio_uri(audio_path)
+
+            messages = self._build_messages(audio_uri, context)
+
+            # Only send "language" when one is actually set, so that an
+            # unset language is simply omitted from the request.
+            asr_options = {"enable_itn": self.enable_itn}
+            effective_language = language or self.language
+            if effective_language:
+                asr_options["language"] = effective_language
+
+            response = MultiModalConversation.call(
+                api_key=os.environ.get('DASHSCOPE_API_KEY'),
+                model=self.model,
+                messages=messages,
+                result_format="message",
+                asr_options=asr_options
+            )
+
+            return self._parse_response(response)
+
+        except Exception as e:
+            # Deliberate boundary: callers get a result object, not a raise.
+            return ASRResult(
+                text="",
+                success=False,
+                error=str(e)
+            )
+
+    def _prepare_audio_uri(self, audio_path: str) -> str:
+        """
+        Convert an audio location into the URI passed to the API.
+
+        http(s) URLs and file:// URIs are returned unchanged and are not
+        validated. Any other value is treated as a local path and checked
+        for existence, size and extension.
+
+        Args:
+            audio_path: URL, file:// URI or local file path.
+
+        Returns:
+            str: The unchanged URL/URI, or "file://" followed by the
+            resolved absolute path.
+
+        Raises:
+            FileNotFoundError: If the local path does not exist.
+            ValueError: If the file exceeds config.MAX_FILE_SIZE or its
+                extension is not in SUPPORTED_FORMATS.
+        """
+        # Already a URI: pass through untouched.
+        if audio_path.startswith(('http://', 'https://', 'file://')):
+            return audio_path
+
+        path = Path(audio_path)
+        if not path.exists():
+            raise FileNotFoundError(f"音频文件不存在: {audio_path}")
+
+        file_size = path.stat().st_size
+        if file_size > config.MAX_FILE_SIZE:
+            # Each replacement field stays on one line: splitting a field
+            # across lines in a single-quoted f-string is a SyntaxError
+            # before Python 3.12.
+            raise ValueError(
+                f"文件大小 ({file_size} bytes) 超过限制 "
+                f"({config.MAX_FILE_SIZE} bytes)"
+            )
+
+        # Extension check is case-insensitive and ignores the leading dot.
+        suffix = path.suffix.lower().lstrip('.')
+        if suffix not in self.SUPPORTED_FORMATS:
+            raise ValueError(f"不支持的音频格式: {suffix}")
+
+        abs_path = path.resolve()
+        return f"file://{abs_path}"
+
+    def _build_messages(
+        self, audio_uri: str, context: Optional[str] = None
+    ) -> list:
+        """
+        Build the message list for the API call.
+
+        Args:
+            audio_uri: URI produced by _prepare_audio_uri.
+            context: Context text; a falsy value falls back to
+                self.context.
+
+        Returns:
+            list: A system message carrying the context text followed by
+            a user message carrying the audio URI.
+        """
+        ctx = context or self.context
+
+        messages = [
+            {
+                "role": "system",
+                "content": [{"text": ctx}]
+            },
+            {
+                "role": "user",
+                "content": [{"audio": audio_uri}]
+            }
+        ]
+        return messages
+
+    def _parse_response(self, response) -> ASRResult:
+        """
+        Convert an API response into an ASRResult.
+
+        Args:
+            response: Object returned by MultiModalConversation.call; this
+                method reads its status_code, code, message, request_id
+                and output.choices attributes.
+
+        Returns:
+            ASRResult: success=False with an error message when the status
+            code is not 200 or the payload cannot be read; otherwise
+            success=True with the extracted text.
+        """
+        if response.status_code != 200:
+            return ASRResult(
+                text="",
+                success=False,
+                error=f"API 错误: {response.code} - {response.message}",
+                request_id=response.request_id
+            )
+
+        try:
+            content = response.output.choices[0].message.content
+            # A list payload is read from its first item's "text" key;
+            # anything else is stringified as-is.
+            if isinstance(content, list):
+                text = content[0].get("text", "")
+            else:
+                text = str(content)
+
+            return ASRResult(
+                text=text,
+                success=True,
+                request_id=response.request_id
+            )
+        except Exception as e:
+            return ASRResult(
+                text="",
+                success=False,
+                error=f"解析响应失败: {e}",
+                request_id=getattr(response, 'request_id', None)
+            )
+
+
+# ============================================================
+# 便捷函数
+# ============================================================
+
+def recognize(audio_path: str, **kwargs) -> ASRResult:
+    """
+    Recognize an audio file with a freshly constructed ASR instance.
+
+    Args:
+        audio_path: Path of the audio file.
+        **kwargs: Forwarded to the ASR constructor.
+
+    Returns:
+        ASRResult: The recognition result.
+    """
+    return ASR(**kwargs).recognize(audio_path)
+
+
+def recognize_text(audio_path: str, **kwargs) -> str:
+    """
+    Recognize an audio file and return only the text.
+
+    Args:
+        audio_path: Path of the audio file.
+        **kwargs: Forwarded to the ASR constructor.
+
+    Returns:
+        str: The recognized text, or an empty string on failure.
+    """
+    outcome = recognize(audio_path, **kwargs)
+    if not outcome.success:
+        return ""
+    return outcome.text
--- a/src/Module/asr/asrconfig.py
+++ b/src/Module/asr/asrconfig.py
@ -0,0 +1,49 @@
+"""
+ASR 配置文件
+定义语音识别的默认参数
+"""
+
+# ============================================================
+# Model configuration
+# ============================================================
+
+# Default model
+# qwen3-asr-flash: short audio (<= 5 minutes)
+# qwen3-asr-flash-filetrans: long audio (<= 12 hours)
+MODEL = 'qwen3-asr-flash'
+
+# API URL (Beijing region)
+API_URL = 'https://dashscope.aliyuncs.com/api/v1'
+
+# Singapore region URL (alternative)
+# API_URL = 'https://dashscope-intl.aliyuncs.com/api/v1'
+
+
+# ============================================================
+# Recognition parameters
+# ============================================================
+
+# Language (possible values: 'zh', 'en', 'ja', 'ko', 'de', 'fr', 'ru', 'es', 'it', 'pt', 'ar', ...)
+# None means automatic detection
+LANGUAGE = None
+
+# Whether to enable ITN (Inverse Text Normalization)
+# Converts spoken-form numbers to written form, e.g. "一百二十三" -> "123"
+ENABLE_ITN = False
+
+
+# ============================================================
+# Supported audio formats
+# ============================================================
+
+# File extensions, lower-case and without the leading dot.
+SUPPORTED_FORMATS = [
+    'aac', 'amr', 'avi', 'aiff', 'flac', 'flv',
+    'm4a', 'mkv', 'mp3', 'mpeg', 'ogg', 'opus',
+    'wav', 'webm', 'wma', 'wmv'
+]
+
+# Maximum file size (bytes) - 10 MB
+MAX_FILE_SIZE = 10 * 1024 * 1024
+
+# Maximum audio duration (seconds) - 5 minutes
+# NOTE(review): asr.py as added in this commit does not reference this
+# value - confirm whether a duration check is still planned.
+MAX_DURATION = 5 * 60
--- a/test/asr/test_asr.py
+++ b/test/asr/test_asr.py
@ -0,0 +1,83 @@
+"""
+ASR 语音识别测试
+使用 TTS 生成的音频文件测试识别
+"""
+
+from pathlib import Path
+from src.Module.asr.asr import ASR, recognize, recognize_text
+
+
+def test_recognize_wav():
+    """Recognize a WAV file through an explicitly configured ASR instance.
+
+    Returns:
+        bool: The result's success flag, or False when the audio file is
+        missing and the check is skipped.
+    """
+    rule = "=" * 60
+    print(rule)
+    print("     ASR 语音识别测试")
+    print(rule)
+
+    # Audio file generated by the TTS tests.
+    tts_output = Path(__file__).parent.parent / 'tts' / 'output'
+    wav_path = tts_output / 'stream_test.wav'
+
+    if not wav_path.exists():
+        print(f"[跳过] 音频文件不存在: {wav_path}")
+        print("请先运行 TTS 测试生成音频文件")
+        return False
+
+    print(f"\n音频文件: {wav_path}")
+    size_kb = wav_path.stat().st_size / 1024
+    print(f"文件大小: {size_kb:.1f} KB")
+
+    recognizer = ASR(model='qwen3-asr-flash', language='zh')
+
+    print("\n开始识别...")
+    outcome = recognizer.recognize(str(wav_path))
+
+    print("\n识别结果:")
+    print(f"  成功: {outcome.success}")
+    print(f"  文本: {outcome.text}")
+    # Error and request ID are optional; print them only when present.
+    if outcome.error:
+        print(f"  错误: {outcome.error}")
+    if outcome.request_id:
+        print(f"  请求ID: {outcome.request_id}")
+
+    return outcome.success
+
+
+def test_convenient_function():
+    """Exercise the module-level recognize_text helper.
+
+    Returns:
+        bool: True when non-empty text was recognized; False when the
+        text is empty or the audio file is missing.
+    """
+    print("\n" + "=" * 60)
+    print("便捷函数测试")
+    print("=" * 60)
+
+    tts_output = Path(__file__).parent.parent / 'tts' / 'output'
+    wav_path = tts_output / 'bidirectional_test.wav'
+
+    if not wav_path.exists():
+        print(f"[跳过] 音频文件不存在: {wav_path}")
+        return False
+
+    print(f"\n音频文件: {wav_path.name}")
+
+    # Call the convenience helper directly instead of building an ASR.
+    recognized = recognize_text(str(wav_path), language='zh')
+
+    print(f"识别文本: {recognized}")
+
+    return len(recognized) > 0
+
+
+if __name__ == '__main__':
+    # Run the checks in order, pairing each label with its outcome.
+    results = [
+        ("WAV 文件识别", test_recognize_wav()),
+        ("便捷函数", test_convenient_function()),
+    ]
+
+    print("\n" + "=" * 60)
+    print("测试结果:")
+    for label, passed in results:
+        if passed:
+            verdict = "✓ 通过"
+        else:
+            verdict = "✗ 失败/跳过"
+        print(f"  {label}: {verdict}")