Commit 91f79dc6 by luoqi

feat(assistant): 语音输入升级实时听写 — 点击开启,边说边出字,再点纯退出

交互(用户要求):点 🎤 开始 → 说话时文字实时滚动进输入框 → 再点 = 纯退出(已上屏文字保留)。

实现(复用 realtime-coach 成熟模式,ASR 容器零改动):
- 前端:PCM16 16k 采音 + RMS 静音门控(无声不发帧)→ socket.io 推帧;
  dictation:partial(当前句滚动覆盖)/ dictation:final(句定稿累加)→ setInput 实时渲染;
  base 保留输入框已有文字,听写追加其后。
- 后端 DictationGateway(socket.io,JWT 握手鉴权同 coach):按"帧到达间隙"断句 ——
  说话中每 700ms 把当前句 PCM 包 44 字节 WAV 头调 TranscribeService 出 partial;
  停顿 ≥800ms / 超 30s 整句 final 并清缓冲。inFlight 防解码重叠;先清缓冲再 final
  解码(下一句帧不混入)。SenseVoice 离线模型 RTF~0.1 → 句级重解码远快于实时。

实测(模拟浏览器推帧):开口 1.3s 首个 partial,~0.7s/次滚动更新,停顿 1.3s 出整句 final,
文本与一次性识别完全一致。前后端 tsc 均为 0 错误。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
parent 5644d1af
import { Module } from '@nestjs/common'; import { Module } from '@nestjs/common';
import { AiModule } from '../ai/ai.module'; import { AiModule } from '../ai/ai.module';
import { AuthModule } from '../auth/auth.module';
import { AssistantController } from './assistant.controller'; import { AssistantController } from './assistant.controller';
import { AssistantService } from './assistant.service'; import { AssistantService } from './assistant.service';
import { McpClientService } from './mcp-client.service'; import { McpClientService } from './mcp-client.service';
import { TranscribeService } from './transcribe.service'; import { TranscribeService } from './transcribe.service';
import { DictationGateway } from './dictation.gateway';
/** /**
* AssistantModule — "外部 agent" 模拟器(独立设计,不复用 AiCall 单发框架)。 * AssistantModule — "外部 agent" 模拟器(独立设计,不复用 AiCall 单发框架)。
...@@ -11,8 +13,8 @@ import { TranscribeService } from './transcribe.service'; ...@@ -11,8 +13,8 @@ import { TranscribeService } from './transcribe.service';
* 复用 AiModule 的 AiProviderService(provider 可切换);McpClientService 真连 PAC MCP 端点。 * 复用 AiModule 的 AiProviderService(provider 可切换);McpClientService 真连 PAC MCP 端点。
*/ */
@Module({ @Module({
imports: [AiModule], imports: [AiModule, AuthModule],
controllers: [AssistantController], controllers: [AssistantController],
providers: [AssistantService, McpClientService, TranscribeService], providers: [AssistantService, McpClientService, TranscribeService, DictationGateway],
}) })
export class AssistantModule {} export class AssistantModule {}
import { Logger } from '@nestjs/common';
import { JwtService } from '@nestjs/jwt';
import {
MessageBody,
ConnectedSocket,
OnGatewayConnection,
OnGatewayDisconnect,
SubscribeMessage,
WebSocketGateway,
} from '@nestjs/websockets';
import type { Socket } from 'socket.io';
import { TranscribeService } from './transcribe.service';
/**
* DictationGateway — real-time dictation (socket.io, same pattern as realtime-coach).
*
* The frontend continuously pushes PCM16 16 kHz frames (it applies its own RMS silence gate,
* so no frames arrive while the user is silent). This gateway splits utterances by the gap
* between frame arrivals:
* - While speaking: roughly every 700 ms the current utterance (accumulated PCM wrapped in a
*   44-byte WAV header) is sent to SenseVoice → emit dictation:partial
* - Pause ≥ 800 ms: the same utterance is decoded one last time → emit dictation:final,
*   the buffer is cleared, and the gateway waits for the next utterance
* SenseVoice is an offline model, but with RTF ~0.1 re-decoding a whole utterance (≤ 30 s) is
* much faster than real time, so it feels like text appears while speaking.
* Closing = the client simply disconnects (plain exit); text already rendered stays in the input box.
*/
// Per-connection dictation state, keyed by socket id in the gateway's session map.
interface DictSession {
// Decoded PCM frames buffered for the utterance currently being spoken.
chunks: Buffer[];
// Total byte length of `chunks`.
bytes: number;
// Date.now() of the most recently received frame; 0 until the first frame arrives.
lastFrameAt: number;
// Buffer size at the time of the last partial decode; a new partial is only
// attempted when `bytes` has grown past this value.
lastDecodedBytes: number;
// Date.now() when the last partial decode was started; 0 until the first partial.
lastPartialAt: number;
// True while a transcribe call for this session is pending, so decodes never overlap.
inFlight: boolean;
// Interval handle driving the periodic check; cleared on disconnect.
timer: NodeJS.Timeout;
}
// Minimum spacing between partial decodes of the utterance in progress.
const PARTIAL_EVERY_MS = 700;
// Gap since the last received frame after which the current utterance is finalized.
const FINALIZE_GAP_MS = 800;
// Buffered size at which an utterance is force-finalized even without a pause.
const MAX_UTTERANCE_BYTES = 30 * 16_000 * 2; // 30 s @ 16 kHz PCM16, guards against runaway growth
/**
 * Socket.io gateway for real-time dictation.
 *
 * Clients authenticate with a JWT passed as the `token` handshake query parameter and then
 * stream base64-encoded PCM frames via `audio:frame`. A per-connection timer periodically
 * decodes the buffered audio and emits:
 * - `dictation:partial` — rolling text for the utterance in progress
 * - `dictation:final`   — text for a finished utterance (possibly empty)
 * - `dictation:error`   — authentication or decode failure
 */
@WebSocketGateway({
  namespace: 'pac/v1/assistant/dictation',
  cors: { origin: true, credentials: false },
})
export class DictationGateway implements OnGatewayConnection, OnGatewayDisconnect {
  /** Interval of the per-connection check that decides between partial / final / nothing. */
  private static readonly TICK_MS = 250;

  /**
   * Hard ceiling on audio buffered for one connection. `tick` normally flushes at
   * MAX_UTTERANCE_BYTES, but it is skipped while a decode is in flight; without a ceiling
   * a client could keep growing the buffer for as long as that decode takes.
   */
  private static readonly MAX_BUFFERED_BYTES = MAX_UTTERANCE_BYTES * 2;

  private readonly logger = new Logger(DictationGateway.name);
  private readonly sessions = new Map<string, DictSession>();

  constructor(
    private readonly jwt: JwtService,
    private readonly transcriber: TranscribeService,
  ) {}

  /**
   * Verifies the handshake token and, on success, registers a session with its tick timer.
   * On failure the client receives `dictation:error` and is disconnected.
   */
  handleConnection(client: Socket): void {
    // A query value may be a string, an array of strings, or absent; only a plain string
    // can be a token, anything else is treated as an empty (invalid) token.
    const rawToken: unknown = client.handshake.query.token;
    const token = typeof rawToken === 'string' ? rawToken : '';
    try {
      this.jwt.verify(token);
    } catch {
      client.emit('dictation:error', { message: '鉴权失败' });
      client.disconnect();
      return;
    }
    const session: DictSession = {
      chunks: [],
      bytes: 0,
      lastFrameAt: 0,
      lastDecodedBytes: 0,
      lastPartialAt: 0,
      inFlight: false,
      timer: setInterval(() => void this.tick(client), DictationGateway.TICK_MS),
    };
    this.sessions.set(client.id, session);
  }

  /** Stops the tick timer and drops the session; buffered audio is discarded. */
  handleDisconnect(client: Socket): void {
    const s = this.sessions.get(client.id);
    if (s) clearInterval(s.timer);
    this.sessions.delete(client.id);
  }

  /**
   * Appends one audio frame to the current utterance.
   *
   * @param client socket the frame arrived on
   * @param msg    payload; `frame` is expected to be a base64 string of PCM bytes
   */
  @SubscribeMessage('audio:frame')
  onFrame(@ConnectedSocket() client: Socket, @MessageBody() msg: { frame?: string }): void {
    const s = this.sessions.get(client.id);
    if (!s) return;
    // The payload type is only a compile-time claim. A non-string must never reach
    // Buffer.from: an array-like object such as { length: N } would allocate N bytes,
    // and a number would throw.
    const frame: unknown = msg?.frame;
    if (typeof frame !== 'string' || frame.length === 0) return;
    const buf = Buffer.from(frame, 'base64');
    if (buf.length === 0) return;
    // Drop frames beyond the hard ceiling; lastFrameAt is left untouched so the resulting
    // gap lets the next tick finalize and clear the buffer.
    if (s.bytes + buf.length > DictationGateway.MAX_BUFFERED_BYTES) return;
    s.chunks.push(buf);
    s.bytes += buf.length;
    s.lastFrameAt = Date.now();
  }

  /** Periodic check: emit a partial while speaking; emit a final on pause or over-long audio. */
  private async tick(client: Socket): Promise<void> {
    const s = this.sessions.get(client.id);
    if (!s || s.inFlight || s.bytes === 0) return;

    const now = Date.now();
    const gap = now - s.lastFrameAt;
    const shouldFinal = gap >= FINALIZE_GAP_MS || s.bytes >= MAX_UTTERANCE_BYTES;
    const shouldPartial =
      !shouldFinal && now - s.lastPartialAt >= PARTIAL_EVERY_MS && s.bytes > s.lastDecodedBytes;
    if (!shouldFinal && !shouldPartial) return;

    const pcm = Buffer.concat(s.chunks, s.bytes);
    if (shouldFinal) {
      // Clear the buffer before decoding so frames of the next utterance that arrive
      // during the decode are not mixed into this one.
      s.chunks = [];
      s.bytes = 0;
      s.lastDecodedBytes = 0;
    } else {
      s.lastDecodedBytes = pcm.length;
      s.lastPartialAt = now;
    }

    s.inFlight = true;
    try {
      const { text } = await this.transcriber.transcribe(wrapWav(pcm), 'u.wav', 'audio/wav');
      if (shouldFinal) {
        // A final is always emitted, even when empty, so the client can drop its partial.
        client.emit('dictation:final', { text: text || '' });
      } else if (text) {
        client.emit('dictation:partial', { text });
      }
    } catch (e) {
      this.logger.warn(`dictation decode failed: ${e instanceof Error ? e.message : String(e)}`);
      client.emit('dictation:error', { message: '识别失败' });
    } finally {
      s.inFlight = false;
    }
  }
}
/**
 * Wraps raw PCM16 mono samples in a standard 44-byte RIFF/WAVE header.
 *
 * @param pcm        raw little-endian 16-bit mono sample bytes
 * @param sampleRate sample rate written into the header (defaults to 16 kHz)
 * @returns a new buffer holding the header followed by the unmodified PCM bytes
 */
function wrapWav(pcm: Buffer, sampleRate = 16_000): Buffer {
  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2; // 16-bit mono

  // Single allocation: header fields are written through a moving cursor, payload copied after.
  const wav = Buffer.alloc(HEADER_BYTES + pcm.length);
  let cursor = 0;
  const tag = (fourcc: string): void => {
    cursor += wav.write(fourcc, cursor);
  };
  const u32 = (value: number): void => {
    cursor = wav.writeUInt32LE(value, cursor);
  };
  const u16 = (value: number): void => {
    cursor = wav.writeUInt16LE(value, cursor);
  };

  tag('RIFF');
  u32(HEADER_BYTES - 8 + pcm.length); // RIFF chunk size = everything after this field
  tag('WAVE');
  tag('fmt ');
  u32(16); // size of the PCM fmt chunk
  u16(1); // audio format: PCM
  u16(1); // channels: mono
  u32(sampleRate);
  u32(sampleRate * BYTES_PER_SAMPLE); // byte rate
  u16(BYTES_PER_SAMPLE); // block align
  u16(16); // bits per sample
  tag('data');
  u32(pcm.length);

  wav.set(pcm, HEADER_BYTES);
  return wav;
}
...@@ -17,9 +17,16 @@ import { ...@@ -17,9 +17,16 @@ import {
Square, Square,
Wrench, Wrench,
} from 'lucide-react'; } from 'lucide-react';
import { io, type Socket } from 'socket.io-client';
import { cn } from '@/lib/utils'; import { cn } from '@/lib/utils';
import { env } from '@/lib/env';
import { useAuthStore } from '@/stores/auth-store';
import {
bytesToBase64,
downsampleTo16k,
floatToPcm16,
} from '@/components/realtime-coach/audio-utils';
import { import {
transcribeAudio,
useAssistantChat, useAssistantChat,
type Artifact, type Artifact,
type Block, type Block,
...@@ -451,55 +458,97 @@ export function AssistantChat() { ...@@ -451,55 +458,97 @@ export function AssistantChat() {
if (taRef.current) taRef.current.style.height = 'auto'; if (taRef.current) taRef.current.style.height = 'auto';
}; };
// ── 语音听写:点击开始/停止 → 上传转写 → 文字落输入框(可编辑再发)── // ── 实时听写:点击开启 → 边说边出字到输入框;再点 = 纯退出(已上屏文字保留)──
const [voice, setVoice] = useState<'idle' | 'recording' | 'busy'>('idle'); // 链路:麦克风 PCM16 16k(RMS 静音门控)→ socket.io → 网关按停顿断句调 SenseVoice
const recRef = useRef<MediaRecorder | null>(null); // dictation:partial = 当前句滚动识别(覆盖式);dictation:final = 本句定稿(累加)
const chunksRef = useRef<Blob[]>([]); const [voice, setVoice] = useState<'idle' | 'live'>('idle');
const [voiceErr, setVoiceErr] = useState<string | null>(null); const [voiceErr, setVoiceErr] = useState<string | null>(null);
const dictRef = useRef<{
socket: Socket;
stream: MediaStream;
ctx: AudioContext;
base: string;
finalized: string;
} | null>(null);
const stopVoice = () => {
const d = dictRef.current;
dictRef.current = null;
if (d) {
d.socket.disconnect();
d.ctx.close().catch(() => undefined);
d.stream.getTracks().forEach((t) => t.stop());
}
setVoice('idle');
};
const toggleVoice = async () => { const toggleVoice = async () => {
setVoiceErr(null); setVoiceErr(null);
if (voice === 'recording') { if (voice === 'live') {
recRef.current?.stop(); stopVoice();
return;
}
const token = useAuthStore.getState().accessToken;
if (!token) {
setVoiceErr('未鉴权,请重新登录');
return; return;
} }
if (voice !== 'idle') return;
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const mime = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
? 'audio/webm;codecs=opus'
: MediaRecorder.isTypeSupported('audio/mp4')
? 'audio/mp4'
: '';
const rec = new MediaRecorder(stream, mime ? { mimeType: mime } : undefined);
chunksRef.current = [];
rec.ondataavailable = (e) => {
if (e.data.size > 0) chunksRef.current.push(e.data);
};
rec.onstop = async () => {
stream.getTracks().forEach((t) => t.stop());
setVoice('busy');
try { try {
const blob = new Blob(chunksRef.current, { type: rec.mimeType || 'audio/webm' }); const stream = await navigator.mediaDevices.getUserMedia({
const text = await transcribeAudio(blob); audio: { echoCancellation: true, noiseSuppression: true, autoGainControl: true },
if (text) { });
setInput((prev) => (prev.trim() ? `${prev.trimEnd()} ${text}` : text)); const socket = io(`${env.apiBaseUrl}/pac/v1/assistant/dictation`, {
taRef.current?.focus(); query: { token },
transports: ['websocket'],
forceNew: true,
});
const ctx = new AudioContext();
const source = ctx.createMediaStreamSource(stream);
const proc = ctx.createScriptProcessor(4096, 1, 1);
// 静音门控:无声不发帧 → 帧间隙就是网关的断句信号(同 realtime-coach 参数)
const SILENCE_RMS = 0.03;
const ONSET_FRAMES = 2;
const HANGOVER_FRAMES = 6;
let hangover = 0;
let loudStreak = 0;
proc.onaudioprocess = (e) => {
if (socket.disconnected) return;
const f32 = e.inputBuffer.getChannelData(0);
let sum = 0;
for (let i = 0; i < f32.length; i++) sum += f32[i]! * f32[i]!;
const rms = Math.sqrt(sum / f32.length);
if (rms > SILENCE_RMS) {
loudStreak++;
if (loudStreak >= ONSET_FRAMES) hangover = HANGOVER_FRAMES;
} else { } else {
setVoiceErr('没听清,请再说一次'); loudStreak = 0;
}
} catch (err) {
setVoiceErr(err instanceof Error ? err.message : '识别失败');
} finally {
setVoice('idle');
} }
if (hangover <= 0) return;
hangover--;
const pcm = floatToPcm16(downsampleTo16k(f32, ctx.sampleRate));
socket.emit('audio:frame', { frame: bytesToBase64(pcm) });
}; };
recRef.current = rec; source.connect(proc);
rec.start(); proc.connect(ctx.destination);
setVoice('recording');
const d = { socket, stream, ctx, base: input.trim() ? `${input.trimEnd()} ` : '', finalized: '' };
dictRef.current = d;
const render = (partial: string) => setInput(`${d.base}${d.finalized}${partial}`);
socket.on('dictation:partial', ({ text }: { text: string }) => render(text));
socket.on('dictation:final', ({ text }: { text: string }) => {
if (text) d.finalized += text;
render('');
});
socket.on('dictation:error', ({ message }: { message: string }) => setVoiceErr(message));
socket.on('connect_error', () => {
setVoiceErr('听写连接失败');
stopVoice();
});
setVoice('live');
taRef.current?.focus();
} catch { } catch {
setVoiceErr('无法访问麦克风(请检查浏览器授权)'); setVoiceErr('无法访问麦克风(请检查浏览器授权)');
setVoice('idle'); stopVoice();
} }
}; };
...@@ -579,18 +628,15 @@ export function AssistantChat() { ...@@ -579,18 +628,15 @@ export function AssistantChat() {
<button <button
type="button" type="button"
onClick={toggleVoice} onClick={toggleVoice}
disabled={voice === 'busy'} title={voice === 'live' ? '结束听写' : '语音输入(边说边出字)'}
title={voice === 'recording' ? '停止录音' : '语音输入(点击说话)'}
className={cn( className={cn(
'mb-0.5 inline-flex h-8 w-8 flex-none items-center justify-center rounded-lg transition-colors', 'mb-0.5 inline-flex h-8 w-8 flex-none items-center justify-center rounded-lg transition-colors',
voice === 'recording' voice === 'live'
? 'animate-pulse bg-rose-100 text-rose-600 ring-1 ring-inset ring-rose-300' ? 'animate-pulse bg-rose-100 text-rose-600 ring-1 ring-inset ring-rose-300'
: voice === 'busy'
? 'bg-slate-100 text-slate-400'
: 'bg-slate-100 text-slate-500 hover:bg-teal-50 hover:text-teal-700', : 'bg-slate-100 text-slate-500 hover:bg-teal-50 hover:text-teal-700',
)} )}
> >
{voice === 'busy' ? <Loader2 className="h-4 w-4 animate-spin" /> : <Mic className="h-4 w-4" />} <Mic className="h-4 w-4" />
</button> </button>
{status === 'streaming' ? ( {status === 'streaming' ? (
<button <button
...@@ -619,8 +665,8 @@ export function AssistantChat() { ...@@ -619,8 +665,8 @@ export function AssistantChat() {
<p className="mt-1.5 text-center text-[10.5px] text-slate-400"> <p className="mt-1.5 text-center text-[10.5px] text-slate-400">
{voiceErr ? ( {voiceErr ? (
<span className="text-rose-500">{voiceErr}</span> <span className="text-rose-500">{voiceErr}</span>
) : voice === 'recording' ? ( ) : voice === 'live' ? (
<span className="text-rose-500">录音中… 再点一下麦克风结束</span> <span className="text-rose-500">听写中,边说边出字… 再点麦克风退出</span>
) : ( ) : (
'结果仅供参考 请核对后使用' '结果仅供参考 请核对后使用'
)} )}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment