Commit 5644d1af by luoqi

feat(assistant): 语音输入 — 自部署 SenseVoice-small 听写(PII 不出内网)

三层:
- apps/asr-sensevoice:sherpa-onnx + SenseVoice-small int8(纯 CPU,无 torch,镜像 <400MB);
  FastAPI /transcribe:任意浏览器音频(webm/mp4)→ ffmpeg 16k 单声道 → 离线识别(ITN 标点)。
  模型不进镜像/git —— 卷挂载 ${PAC_MODELS_DIR:-../pac-models}/sensevoice(model.int8.onnx + tokens.txt,
  来源 sherpa-onnx releases sense-voice-zh-en-ja-ko-yue-2024-07-17)。内网专用不发布端口(prod)。
- 后端:POST /assistant/transcribe(全局 JWT 鉴权,multipart ≤15MB)→ TranscribeService 转发
  PAC_ASR_URL(provider 单一出口,以后切云 ASR 改这一处)。
- 前端:composer 加 🎤(点击录音/再点结束,红色脉冲态)→ MediaRecorder(webm/opus,Safari mp4)
  → 上传转写 → 文字落输入框可编辑再发;错误/录音中提示在底部状态行。

compose:dev + prod 都加 pac-asr 服务(prod 限 cpus:2/mem:2g,阿里云镜像源构建参数);
pac-service 注入 PAC_ASR_URL=http://pac-asr:8000;deploy 脚本 SERVICES 加 pac-asr。

本地验证:容器直测 3.9s 音频 0.46s 出字"帮我查一下患者孙科的画像和召回计划。";
经后端鉴权链路同样通过;不带 token 的请求被拒绝(错误码 10106)。前后端 tsc 均 0 错误。暂不部署服务器(模型待 scp)。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
parent 4c93c669
# PAC ASR — SenseVoice-small (sherpa-onnx, CPU only, no torch).
# The model is supplied through a mounted volume (SENSEVOICE_DIR); the image itself is meant to stay < 400MB.
# Faster builds on servers in mainland China: --build-arg DEBIAN_MIRROR=mirrors.aliyun.com --build-arg PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple
FROM python:3.11-slim
# Package index used by pip below; defaults to the public PyPI.
ARG PIP_INDEX_URL=https://pypi.org/simple
# Optional Debian mirror host; when empty the stock deb.debian.org sources are kept.
ARG DEBIAN_MIRROR=
# Optionally rewrite the apt sources to the mirror, then install ffmpeg (used by server.py to decode
# uploads) and drop the apt lists so they do not end up in the layer.
RUN if [ -n "$DEBIAN_MIRROR" ]; then \
sed -i "s/deb.debian.org/${DEBIAN_MIRROR}/g" /etc/apt/sources.list.d/debian.sources; \
fi \
&& apt-get update \
&& apt-get install -y --no-install-recommends ffmpeg \
&& rm -rf /var/lib/apt/lists/*
# Python runtime dependencies of server.py (recognizer, array handling, web framework, multipart parsing).
RUN pip install --no-cache-dir -i ${PIP_INDEX_URL} \
"sherpa-onnx>=1.10,<2" numpy fastapi "uvicorn[standard]" python-multipart
WORKDIR /app
# The path is relative to the build context, so the context must be the repository root.
COPY apps/asr-sensevoice/server.py .
EXPOSE 8000
# Serve the FastAPI app on all interfaces inside the container.
CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
"""PAC ASR 服务 — SenseVoice-small(sherpa-onnx,纯 CPU)。
听写转文字:浏览器录音(webm/opus、mp4 等)→ ffmpeg 解码 16k 单声道 → SenseVoice 离线识别。
模型不打进镜像 —— 挂载卷 SENSEVOICE_DIR(默认 /models/sensevoice,需含 model.int8.onnx + tokens.txt)。
内网专用(compose 网络内供 pac-service 调),无鉴权 —— 不要对外发布端口。
"""
import os
import subprocess
import tempfile
import wave
import numpy as np
import sherpa_onnx
from fastapi import FastAPI, File, HTTPException, UploadFile
# Directory holding model.int8.onnx and tokens.txt; overridable through SENSEVOICE_DIR.
MODEL_DIR = os.environ.get("SENSEVOICE_DIR", "/models/sensevoice")
# Thread count handed to the recognizer; overridable through ASR_THREADS.
NUM_THREADS = int(os.environ.get("ASR_THREADS", "2"))
MAX_BYTES = 15 * 1024 * 1024 # upper bound for one dictation clip, ~15MB (enough for a few minutes)
# Built once at import time and shared by every request; a missing model file therefore
# fails at startup rather than on the first request.
recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
model=os.path.join(MODEL_DIR, "model.int8.onnx"),
tokens=os.path.join(MODEL_DIR, "tokens.txt"),
use_itn=True, # inverse text normalization for digits/punctuation (phone numbers, dates read better)
language="auto", # auto-detect the language (zh / en / Cantonese)
num_threads=NUM_THREADS,
)
app = FastAPI()
@app.get("/health")
def health() -> dict:
    """Liveness probe: report that the service is up and which model it serves."""
    status = {"ok": True, "model": "sensevoice-small-int8"}
    return status
@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)) -> dict:
    """Transcribe one uploaded audio clip to text.

    The upload is written to a temporary file, converted by ffmpeg to 16 kHz mono
    16-bit PCM wav, and decoded with the module-level SenseVoice recognizer.

    Args:
        file: multipart upload (field name ``file``) in any container ffmpeg can read.

    Returns:
        ``{"text": <recognized text>, "durationSec": <decoded length in seconds>}``.
        Clips shorter than 0.2 s are treated as an accidental tap and yield empty text.

    Raises:
        HTTPException: 400 for an empty or undecodable upload, 413 when the upload
            exceeds ``MAX_BYTES``, 504 when ffmpeg does not finish within 60 s.
    """
    # Read at most one byte past the limit: enough to detect an oversized upload
    # without copying an arbitrarily large body into memory.
    raw = await file.read(MAX_BYTES + 1)
    if not raw:
        raise HTTPException(400, "empty audio")
    if len(raw) > MAX_BYTES:
        raise HTTPException(413, "audio too large")

    # The suffix comes from the client-supplied filename; only keep it when it is a
    # short ASCII alphanumeric extension so it can never break temp-file creation.
    suffix = os.path.splitext(file.filename or "")[1]
    ext = suffix[1:]
    if not (ext and len(ext) <= 10 and ext.isascii() and ext.isalnum()):
        suffix = ".webm"

    src = wav = None
    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
            f.write(raw)
            src = f.name
        wav = src + ".wav"
        # Any browser container format → 16 kHz mono wav (failure = not audio / corrupt).
        # The sample format is pinned to 16-bit PCM because the samples are read as int16 below.
        try:
            p = subprocess.run(
                [
                    "ffmpeg", "-y", "-i", src,
                    "-ar", "16000", "-ac", "1",
                    "-c:a", "pcm_s16le", "-f", "wav", wav,
                ],
                capture_output=True,
                timeout=60,
            )
        except subprocess.TimeoutExpired:
            # Without this the timeout would surface as an unhandled 500.
            raise HTTPException(504, "audio decode timed out") from None
        if p.returncode != 0:
            # ffmpeg output is not guaranteed to be valid UTF-8; never let decoding raise.
            detail = p.stderr.decode("utf-8", errors="replace")[-200:]
            raise HTTPException(400, f"audio decode failed: {detail}")

        with wave.open(wav) as w:
            sr = w.getframerate()
            samples = np.frombuffer(w.readframes(w.getnframes()), dtype=np.int16).astype(np.float32) / 32768.0
        duration = round(len(samples) / sr, 2)
        if len(samples) < sr // 5:  # < 0.2 s is treated as an accidental tap
            return {"text": "", "durationSec": duration}

        s = recognizer.create_stream()
        s.accept_waveform(sr, samples)
        recognizer.decode_stream(s)
        return {"text": s.result.text, "durationSec": duration}
    finally:
        # Best-effort cleanup of both temp files; a file that is already gone is not an error.
        for fp in (src, wav):
            if fp:
                try:
                    os.unlink(fp)
                except OSError:
                    pass
import { Body, Controller, Post, Req, Res } from '@nestjs/common';
import { Body, Controller, Post, Req, Res, UploadedFile, UseInterceptors } from '@nestjs/common';
import { FileInterceptor } from '@nestjs/platform-express';
import type { Request, Response } from 'express';
import { ApiBearerAuth, ApiOperation, ApiTags } from '@nestjs/swagger';
import { ApiBearerAuth, ApiConsumes, ApiOperation, ApiTags } from '@nestjs/swagger';
import type { ModelMessage } from 'ai';
import { AssistantService } from './assistant.service';
import { TranscribeService } from './transcribe.service';
/**
 * Minimal shape of an uploaded file as produced by multer in memory mode
 * (declared locally so that @types/multer is not needed).
 */
interface UploadedAudio {
/** File content held in memory. */
buffer: Buffer;
/** File name as sent by the client. */
originalname: string;
/** MIME type as sent by the client. */
mimetype: string;
}
interface ChatBody {
messages: ModelMessage[];
......@@ -56,7 +65,24 @@ function extractHtmlField(jsonText: string): string | null {
@ApiBearerAuth('accessToken')
@Controller('assistant')
export class AssistantController {
constructor(private readonly assistant: AssistantService) {}
/** Injects AssistantService and TranscribeService (the latter backs the `transcribe` route). */
constructor(
private readonly assistant: AssistantService,
private readonly transcriber: TranscribeService,
) {}
/**
 * Speech dictation → text (self-hosted SenseVoice, so PII stays on the internal network).
 * Expects a multipart body whose file field is named `file`; a missing file is forwarded
 * as an empty buffer with the default name and MIME type.
 */
@Post('transcribe')
@ApiConsumes('multipart/form-data')
@ApiOperation({ summary: '语音转文字(听写)— 浏览器录音上传,返回 { text }' })
@UseInterceptors(FileInterceptor('file', { limits: { fileSize: 15 * 1024 * 1024 } }))
async transcribe(@UploadedFile() file: UploadedAudio | undefined): Promise<{ text: string }> {
  const audio = file?.buffer ?? Buffer.alloc(0);
  const filename = file?.originalname ?? 'audio.webm';
  const mimetype = file?.mimetype ?? 'audio/webm';
  const { text } = await this.transcriber.transcribe(audio, filename, mimetype);
  // Only the text is part of this endpoint's contract; other result fields are dropped.
  return { text };
}
@Post('chat')
@ApiOperation({ summary: '助手对话(SSE)— 模型自主调 PAC MCP 工具' })
......
......@@ -3,6 +3,7 @@ import { AiModule } from '../ai/ai.module';
import { AssistantController } from './assistant.controller';
import { AssistantService } from './assistant.service';
import { McpClientService } from './mcp-client.service';
import { TranscribeService } from './transcribe.service';
/**
* AssistantModule — "外部 agent" 模拟器(独立设计,不复用 AiCall 单发框架)。
......@@ -12,6 +13,6 @@ import { McpClientService } from './mcp-client.service';
@Module({
imports: [AiModule],
controllers: [AssistantController],
providers: [AssistantService, McpClientService],
// TranscribeService is registered as a provider so that AssistantController can inject it.
providers: [AssistantService, McpClientService, TranscribeService],
})
export class AssistantModule {}
import { BadRequestException, Injectable, Logger, ServiceUnavailableException } from '@nestjs/common';
/** Result of one transcription request, as returned by the ASR backend. */
export interface TranscribeResult {
/** Recognized text; may be empty. */
text: string;
/** Length of the decoded audio in seconds, when the backend reports it. */
durationSec?: number;
}
/**
 * TranscribeService — speech dictation to text (the provider is swappable; only `sensevoice` exists today).
 *
 * sensevoice = the self-hosted SenseVoice-small container (apps/asr-sensevoice, reached over internal HTTP).
 * Why this provider: patient speech may contain names / phone numbers (medical PII), so audio must stay
 * on the internal network; there is no per-use fee; a CPU is sufficient.
 * To move to a cloud ASR later, add a provider branch plus an environment switch here — the
 * frontend/backend contract does not change.
 */
@Injectable()
export class TranscribeService {
  private static readonly DEFAULT_URL = 'http://127.0.0.1:8008';

  private readonly logger = new Logger(TranscribeService.name);
  // Inside compose the service name pac-asr is used; when node runs directly on the host,
  // fall back to the port published on the host.
  private readonly url = TranscribeService.resolveBaseUrl(process.env.PAC_ASR_URL);

  /**
   * Forwards an audio clip to the ASR backend and returns its transcription.
   *
   * @param buf Raw audio bytes.
   * @param filename Upload file name; falls back to `audio.webm` when empty.
   * @param mimetype Upload MIME type; falls back to `audio/webm` when empty.
   * @returns The validated result reported by the ASR backend.
   * @throws BadRequestException when the audio is empty or the backend rejects it (HTTP 400 / 413).
   * @throws ServiceUnavailableException when the backend is unreachable, times out, answers with
   *   any other error status, or returns a body that is not a transcription result.
   */
  async transcribe(buf: Buffer, filename: string, mimetype: string): Promise<TranscribeResult> {
    if (!buf?.length) throw new BadRequestException('empty audio');

    const form = new FormData();
    form.append('file', new Blob([new Uint8Array(buf)], { type: mimetype || 'audio/webm' }), filename || 'audio.webm');

    let res: Response;
    try {
      res = await fetch(`${this.url}/transcribe`, {
        method: 'POST',
        body: form,
        signal: AbortSignal.timeout(60_000),
      });
    } catch (err) {
      this.logger.error(`ASR 服务不可达(${this.url}): ${TranscribeService.describe(err)}`);
      throw new ServiceUnavailableException('语音识别服务暂不可用');
    }

    if (!res.ok) {
      const detail = await this.readErrorBody(res);
      this.logger.warn(`ASR ${res.status}: ${detail.slice(0, 200)}`);
      if (res.status === 400 || res.status === 413) throw new BadRequestException('音频无法识别(格式/大小)');
      throw new ServiceUnavailableException('语音识别失败');
    }

    // The body is external input: a truncated or non-JSON response (or the timeout firing
    // while it is being read) must become a 503, not an unhandled exception.
    let payload: unknown;
    try {
      payload = await res.json();
    } catch (err) {
      this.logger.error(`ASR response is not valid JSON: ${TranscribeService.describe(err)}`);
      throw new ServiceUnavailableException('语音识别失败');
    }
    if (!TranscribeService.isTranscribeResult(payload)) {
      this.logger.error('ASR response does not match the expected { text, durationSec? } shape');
      throw new ServiceUnavailableException('语音识别失败');
    }
    return payload;
  }

  /** Picks the configured base URL (blank counts as unset) and strips trailing slashes. */
  private static resolveBaseUrl(configured: string | undefined): string {
    const trimmed = configured?.trim();
    const base = trimmed ? trimmed : TranscribeService.DEFAULT_URL;
    return base.replace(/\/+$/, '');
  }

  /** Runtime check that a parsed JSON value has the TranscribeResult shape. */
  private static isTranscribeResult(value: unknown): value is TranscribeResult {
    if (typeof value !== 'object' || value === null) return false;
    const { text, durationSec } = value as Record<string, unknown>;
    return typeof text === 'string' && (durationSec === undefined || typeof durationSec === 'number');
  }

  /** Renders a caught value for logging. */
  private static describe(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  /** Reads an error response body for logging; a body that cannot be read yields an empty string. */
  private async readErrorBody(res: Response): Promise<string> {
    try {
      return await res.text();
    } catch {
      return '';
    }
  }
}
......@@ -11,6 +11,7 @@ import {
LayoutTemplate,
Loader2,
Maximize2,
Mic,
Send,
Sparkles,
Square,
......@@ -18,6 +19,7 @@ import {
} from 'lucide-react';
import { cn } from '@/lib/utils';
import {
transcribeAudio,
useAssistantChat,
type Artifact,
type Block,
......@@ -449,6 +451,58 @@ export function AssistantChat() {
if (taRef.current) taRef.current.style.height = 'auto';
};
// ── Voice dictation: click to start/stop → upload for transcription → text lands in the input (editable before sending) ──
const [voice, setVoice] = useState<'idle' | 'recording' | 'busy'>('idle');
const recRef = useRef<MediaRecorder | null>(null);
const chunksRef = useRef<Blob[]>([]);
const [voiceErr, setVoiceErr] = useState<string | null>(null);

/**
 * Microphone button handler.
 * - While recording: stops the recorder; its `onstop` handler uploads the clip.
 * - While idle: asks for the microphone, then starts recording.
 * - While busy (permission pending or transcription in flight): ignored.
 */
const toggleVoice = async () => {
  setVoiceErr(null);
  if (voice === 'recording') {
    recRef.current?.stop();
    return;
  }
  if (voice !== 'idle') return;

  // Leave 'idle' before awaiting the permission prompt; otherwise a second click while the
  // prompt is open would start a second recorder and orphan the first one with the mic still on.
  setVoice('busy');
  let stream: MediaStream | null = null;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    const micStream = stream;
    const mime = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
      ? 'audio/webm;codecs=opus'
      : MediaRecorder.isTypeSupported('audio/mp4')
        ? 'audio/mp4'
        : '';
    const rec = new MediaRecorder(micStream, mime ? { mimeType: mime } : undefined);
    chunksRef.current = [];
    rec.ondataavailable = (e) => {
      if (e.data.size > 0) chunksRef.current.push(e.data);
    };
    rec.onstop = async () => {
      // Release the microphone as soon as recording ends.
      micStream.getTracks().forEach((t) => t.stop());
      recRef.current = null;
      setVoice('busy');
      try {
        const blob = new Blob(chunksRef.current, { type: rec.mimeType || 'audio/webm' });
        chunksRef.current = [];
        if (blob.size === 0) {
          // Nothing was captured — do not upload an empty body just to get an HTTP error back.
          setVoiceErr('没听清,请再说一次');
          return;
        }
        const text = await transcribeAudio(blob);
        if (text) {
          setInput((prev) => (prev.trim() ? `${prev.trimEnd()} ${text}` : text));
          taRef.current?.focus();
        } else {
          setVoiceErr('没听清,请再说一次');
        }
      } catch (err) {
        setVoiceErr(err instanceof Error ? err.message : '识别失败');
      } finally {
        setVoice('idle');
      }
    };
    recRef.current = rec;
    rec.start();
    setVoice('recording');
  } catch {
    // If the stream was already granted but the recorder could not be created or started,
    // the tracks must still be stopped or the microphone stays on.
    stream?.getTracks().forEach((t) => t.stop());
    recRef.current = null;
    setVoiceErr('无法访问麦克风(请检查浏览器授权)');
    setVoice('idle');
  }
};
return (
<div className="flex h-[100dvh] flex-col bg-slate-50">
{/* Header */}
......@@ -522,6 +576,22 @@ export function AssistantChat() {
placeholder="问关于患者画像 / 事实 / 召回计划的问题… (Enter 发送,Shift+Enter 换行)"
className="max-h-[140px] flex-1 resize-none bg-transparent py-1 text-[13.5px] leading-relaxed text-slate-800 placeholder:text-slate-400 focus:outline-none"
/>
<button
type="button"
onClick={toggleVoice}
disabled={voice === 'busy'}
title={voice === 'recording' ? '停止录音' : '语音输入(点击说话)'}
className={cn(
'mb-0.5 inline-flex h-8 w-8 flex-none items-center justify-center rounded-lg transition-colors',
voice === 'recording'
? 'animate-pulse bg-rose-100 text-rose-600 ring-1 ring-inset ring-rose-300'
: voice === 'busy'
? 'bg-slate-100 text-slate-400'
: 'bg-slate-100 text-slate-500 hover:bg-teal-50 hover:text-teal-700',
)}
>
{voice === 'busy' ? <Loader2 className="h-4 w-4 animate-spin" /> : <Mic className="h-4 w-4" />}
</button>
{status === 'streaming' ? (
<button
type="button"
......@@ -547,7 +617,13 @@ export function AssistantChat() {
)}
</div>
<p className="mt-1.5 text-center text-[10.5px] text-slate-400">
结果仅供参考 请核对后使用
{voiceErr ? (
<span className="text-rose-500">{voiceErr}</span>
) : voice === 'recording' ? (
<span className="text-rose-500">● 录音中… 再点一下麦克风结束</span>
) : (
'结果仅供参考 请核对后使用'
)}
</p>
</div>
</div>
......
......@@ -95,6 +95,23 @@ function toApiMessage(m: ChatMessage): { role: 'user' | 'assistant'; content: st
return { role: m.role, content: text };
}
/**
 * Uploads a dictation recording and returns the recognized text
 * (backend /assistant/transcribe → self-hosted SenseVoice, so PII stays on the internal network).
 *
 * @param blob Recorded audio; its MIME type decides the uploaded file extension (mp4 / ogg / webm).
 * @returns The recognized text, or an empty string when the response carries none.
 * @throws Error whose message is suitable for display: the backend's `msg` when the response
 *   carries one, otherwise `HTTP <status>` for a non-2xx response or a generic failure text.
 */
export async function transcribeAudio(blob: Blob): Promise<string> {
  const token = useAuthStore.getState().accessToken;
  const form = new FormData();
  const ext = blob.type.includes('mp4') ? 'mp4' : blob.type.includes('ogg') ? 'ogg' : 'webm';
  form.append('file', blob, `dictation.${ext}`);
  const res = await fetch(new URL('/pac/v1/assistant/transcribe', env.apiBaseUrl), {
    method: 'POST',
    headers: token ? { Authorization: `Bearer ${token}` } : undefined,
    body: form,
  });

  // The body should be the API envelope, but it may also be HTML or empty (for example when
  // an intermediary answers); a parse failure must not leak a raw SyntaxError message to the UI.
  let json: { code?: number; msg?: string; data?: { text?: string } } | null = null;
  try {
    const parsed: unknown = await res.json();
    if (typeof parsed === 'object' && parsed !== null) {
      json = parsed as { code?: number; msg?: string; data?: { text?: string } };
    }
  } catch {
    json = null;
  }
  const serverMsg = typeof json?.msg === 'string' && json.msg ? json.msg : null;

  // Prefer the backend's own explanation over a bare status code.
  if (!res.ok) throw new Error(serverMsg ?? `HTTP ${res.status}`);
  if (!json || json.code !== 0) throw new Error(serverMsg ?? '识别失败');
  return json.data?.text ?? '';
}
export function useAssistantChat() {
const [messages, setMessages] = useState<ChatMessage[]>([]);
const [status, setStatus] = useState<ChatStatus>('idle');
......
......@@ -24,7 +24,7 @@ main() {
local COMPOSE=(docker compose -f docker-compose.prod.yml
--env-file apps/pac-service/.env --env-file apps/pac-web/.env)
local SERVICES=(pac-migrate pac-service pac-web)
local SERVICES=(pac-migrate pac-service pac-web pac-asr)
if [[ "${1:-}" != "--no-pull" ]]; then
log "git pull --ff-only"
......
......@@ -79,9 +79,28 @@ services:
# 覆盖 .env 的 localhost URL → 走 docker 内部网络
DATABASE_URL: postgresql://${POSTGRES_USER:-pac}:${POSTGRES_PASSWORD:-pac}@postgres:5432/${POSTGRES_DB:-pac}?schema=public
REDIS_URL: redis://redis:6379
# 语音听写 ASR(自部署 SenseVoice,内网)
PAC_ASR_URL: http://pac-asr:8000
ports:
- "127.0.0.1:3101:3101"
# Voice dictation ASR — SenseVoice-small (CPU). The model is not baked into the image:
# prepare ${PAC_MODELS_DIR:-../pac-models}/sensevoice/{model.int8.onnx,tokens.txt} on the server.
pac-asr:
build:
context: .
dockerfile: apps/asr-sensevoice/Dockerfile
args:
# The server runs on Alibaba Cloud — use mainland-China mirrors to speed up the build
PIP_INDEX_URL: https://mirrors.aliyun.com/pypi/simple
DEBIAN_MIRROR: mirrors.aliyun.com
restart: always
volumes:
- ${PAC_MODELS_DIR:-../pac-models}/sensevoice:/models/sensevoice:ro
# No published ports — reachable only on the compose network, for pac-service (the ASR service has no authentication)
cpus: 2
mem_limit: 2g
pac-web:
build:
context: .
......
......@@ -92,6 +92,18 @@ services:
- /app/node_modules
- /app/apps/pac-service/node_modules
# Voice dictation ASR — SenseVoice-small (local dev: point PAC_MODELS_DIR at the model directory, e.g. /tmp/pac-models)
pac-asr:
build:
context: .
dockerfile: apps/asr-sensevoice/Dockerfile
container_name: pac-asr
restart: unless-stopped
volumes:
- ${PAC_MODELS_DIR:-../pac-models}/sensevoice:/models/sensevoice:ro
# Published on the host loopback only; port 8008 matches the default URL the backend uses when PAC_ASR_URL is unset
ports:
- "127.0.0.1:8008:8000"
pac-web:
build:
context: .
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment