feat(assistant): 语音输入升级实时听写 — 点击开启,边说边出字,再点纯退出

交互(用户要求):点 🎤 开始 → 说话时文字实时滚动进输入框 → 再点 = 纯退出(已上屏文字保留)。

实现(复用 realtime-coach 成熟模式,ASR 容器零改动):
- 前端:PCM16 16k 采音 + RMS 静音门控(无声不发帧)→ socket.io 推帧; dictation:partial(当前句滚动覆盖)/ dictation:final(句定稿累加)→ setInput 实时渲染; base 保留输入框已有文字,听写追加其后。
- 后端 DictationGateway(socket.io,JWT 握手鉴权同 coach):按"帧到达间隙"断句 —— 说话中每 700ms 把当前句 PCM 包 44 字节 WAV 头调 TranscribeService 出 partial; 停顿 ≥800ms / 超 30s 整句 final 并清缓冲。inFlight 防解码重叠;先清缓冲再 final 解码(下一句帧不混入)。SenseVoice 离线模型 RTF~0.1 → 句级重解码远快于实时。

实测(模拟浏览器推帧):开口 1.3s 首个 partial,~0.7s/次滚动更新,停顿 1.3s 出整句 final, 文本与一次性识别完全一致。两端 tsc 0 报错。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

feat(assistant): 语音输入升级实时听写 — 点击开启,边说边出字,再点纯退出
交互(用户要求):点 🎤 开始 → 说话时文字实时滚动进输入框 → 再点 = 纯退出(已上屏文字保留)。

实现(复用 realtime-coach 成熟模式,ASR 容器零改动):
- 前端:PCM16 16k 采音 + RMS 静音门控(无声不发帧)→ socket.io 推帧; dictation:partial(当前句滚动覆盖)/ dictation:final(句定稿累加)→ setInput 实时渲染; base 保留输入框已有文字,听写追加其后。
- 后端 DictationGateway(socket.io,JWT 握手鉴权同 coach):按"帧到达间隙"断句 —— 说话中每 700ms 把当前句 PCM 包 44 字节 WAV 头调 TranscribeService 出 partial; 停顿 ≥800ms / 超 30s 整句 final 并清缓冲。inFlight 防解码重叠;先清缓冲再 final 解码(下一句帧不混入)。SenseVoice 离线模型 RTF~0.1 → 句级重解码远快于实时。

实测(模拟浏览器推帧):开口 1.3s 首个 partial,~0.7s/次滚动更新,停顿 1.3s 出整句 final, 文本与一次性识别完全一致。两端 tsc 0 报错。

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
91f79dc6 · luoqi · 5644d1af · 91f79dc6 · 91f79dc6 · 91f79dc6
Commit 91f79dc6 authored Jun 10, 2026 by luoqi
Showing with 238 additions and 48 deletions

apps/pac-service/src/modules/assistant/assistant.module.ts
+4 -2

apps/pac-service/src/modules/assistant/dictation.gateway.ts
+142 -0

apps/pac-web/src/components/assistant/assistant-chat.tsx
+92 -46

No files found.
--- a/apps/pac-service/src/modules/assistant/assistant.module.ts
+++ b/apps/pac-service/src/modules/assistant/assistant.module.ts
 import { Module } from '@nestjs/common';
 import { AiModule } from '../ai/ai.module';
+import { AuthModule } from '../auth/auth.module';
 import { AssistantController } from './assistant.controller';
 import { AssistantService } from './assistant.service';
 import { McpClientService } from './mcp-client.service';
 import { TranscribeService } from './transcribe.service';
+import { DictationGateway } from './dictation.gateway';
 /**
 * AssistantModule — "外部 agent" 模拟器(独立设计,不复用 AiCall 单发框架)。
@@ -11,8 +13,8 @@ import { TranscribeService } from './transcribe.service';
 * 复用 AiModule 的 AiProviderService(provider 可切换);McpClientService 真连 PAC MCP 端点。
 */
 @Module({
-  imports: [AiModule],
+  imports: [AiModule, AuthModule],
  controllers: [AssistantController],
-  providers: [AssistantService, McpClientService, TranscribeService],
+  providers: [AssistantService, McpClientService, TranscribeService, DictationGateway],
 })
 export class AssistantModule {}
--- a/apps/pac-service/src/modules/assistant/dictation.gateway.ts
+++ b/apps/pac-service/src/modules/assistant/dictation.gateway.ts
+import { Logger } from '@nestjs/common';
+import { JwtService } from '@nestjs/jwt';
+import {
+  MessageBody,
+  ConnectedSocket,
+  OnGatewayConnection,
+  OnGatewayDisconnect,
+  SubscribeMessage,
+  WebSocketGateway,
+} from '@nestjs/websockets';
+import type { Socket } from 'socket.io';
+import { TranscribeService } from './transcribe.service';
+/**
+ * DictationGateway — 实时听写(socket.io,模式同 realtime-coach)。
+ *
+ * 前端持续推 PCM16 16k 帧(自带 RMS 静音门控 → 没说话就没有帧)。本网关按"帧到达的间隙"断句:
+ *   - 说话中:每 ~700ms 把当前句(累积 PCM 包 44 字节 WAV 头)发 SenseVoice → emit dictation:partial
+ *   - 停顿 ≥800ms:同句最终解码 → emit dictation:final,清缓冲,等下一句
+ * SenseVoice 是离线模型但 RTF~0.1,句级(≤30s)重解码远快于实时 → 体感"边说边出字"。
+ * 关闭 = 客户端直接 disconnect(纯退出),已上屏文字留在输入框。
+ */
+/** Per-connection dictation state; the gateway keeps one per socket id. */
+interface DictSession {
+  // ── lifecycle ──
+  /** Handle of the periodic tick interval; cleared when the client disconnects. */
+  timer: NodeJS.Timeout;
+  /** True while a transcribe call is being awaited; ticks are skipped meanwhile. */
+  inFlight: boolean;
+  // ── audio buffer of the utterance currently being spoken ──
+  /** PCM frames received since the last final decode, in arrival order. */
+  chunks: Buffer[];
+  /** Total byte length of `chunks`. */
+  bytes: number;
+  // ── timing / progress markers ──
+  /** `Date.now()` of the most recently accepted frame (0 before the first frame). */
+  lastFrameAt: number;
+  /** `Date.now()` at which the last partial decode was started (0 before the first one). */
+  lastPartialAt: number;
+  /** Buffer size at the last partial decode; reset to 0 when an utterance is finalized. */
+  lastDecodedBytes: number;
+}
+/** Minimum spacing between two partial (rolling) decodes of the current utterance. */
+const PARTIAL_EVERY_MS = 700;
+/** A gap of at least this long without incoming frames ends the utterance. */
+const FINALIZE_GAP_MS = 800;
+/** Runaway guard: 2 bytes/sample × 16 000 samples/s × 30 s of PCM16 forces a final decode. */
+const MAX_UTTERANCE_BYTES = 2 * 16_000 * 30;
+/**
+ * Real-time dictation gateway (socket.io).
+ *
+ * Clients push base64 PCM frames via `audio:frame`. A per-connection interval
+ * (`tick`) inspects the buffered audio and either
+ *   - emits `dictation:partial` with a rolling transcript of the current utterance, or
+ *   - emits `dictation:final` once frames stop arriving for `FINALIZE_GAP_MS`
+ *     (or the utterance reaches `MAX_UTTERANCE_BYTES`), then clears the buffer.
+ * Failures are reported to the client as `dictation:error`.
+ */
+@WebSocketGateway({
+  namespace: 'pac/v1/assistant/dictation',
+  cors: { origin: true, credentials: false },
+})
+export class DictationGateway implements OnGatewayConnection, OnGatewayDisconnect {
+  /** How often each session is checked for a pending partial/final decode. */
+  private static readonly TICK_MS = 250;
+  /**
+   * Hard cap on buffered PCM per session. `tick` already finalizes at
+   * `MAX_UTTERANCE_BYTES`, but it cannot run while a decode is in flight, so
+   * without this cap a slow or hung decode lets a client grow the buffer without bound.
+   */
+  private static readonly MAX_BUFFERED_BYTES = MAX_UTTERANCE_BYTES * 2;
+  private readonly logger = new Logger(DictationGateway.name);
+  private readonly sessions = new Map<string, DictSession>();
+  constructor(
+    private readonly jwt: JwtService,
+    private readonly transcriber: TranscribeService,
+  ) {}
+  /**
+   * Authenticates the handshake with the JWT passed as `?token=` and, on
+   * success, creates the session and starts its tick interval. On failure the
+   * client receives `dictation:error` and is disconnected.
+   */
+  handleConnection(client: Socket): void {
+    // A repeated query parameter arrives as string[]; use the first value instead of casting.
+    const rawToken = client.handshake.query.token;
+    const firstToken = Array.isArray(rawToken) ? rawToken[0] : rawToken;
+    const token = typeof firstToken === 'string' ? firstToken : '';
+    try {
+      this.jwt.verify(token);
+    } catch {
+      client.emit('dictation:error', { message: '鉴权失败' });
+      client.disconnect();
+      return;
+    }
+    const session: DictSession = {
+      chunks: [],
+      bytes: 0,
+      lastFrameAt: 0,
+      lastDecodedBytes: 0,
+      lastPartialAt: 0,
+      inFlight: false,
+      timer: setInterval(() => void this.tick(client), DictationGateway.TICK_MS),
+    };
+    this.sessions.set(client.id, session);
+  }
+  /** Stops the session's tick interval and drops its buffered audio. */
+  handleDisconnect(client: Socket): void {
+    const s = this.sessions.get(client.id);
+    if (s) clearInterval(s.timer);
+    this.sessions.delete(client.id);
+  }
+  /**
+   * Buffers one base64-encoded PCM frame for the sender's session.
+   * Frames are ignored when there is no session, the payload is not a
+   * non-empty string, it decodes to zero bytes, or the buffer cap is reached.
+   */
+  @SubscribeMessage('audio:frame')
+  onFrame(@ConnectedSocket() client: Socket, @MessageBody() msg: { frame?: string }): void {
+    const s = this.sessions.get(client.id);
+    // The payload is untrusted: Buffer.from() throws on non-string, non-buffer input.
+    if (!s || typeof msg?.frame !== 'string' || msg.frame.length === 0) return;
+    const buf = Buffer.from(msg.frame, 'base64');
+    if (buf.length === 0) return;
+    // Drop the frame rather than grow without bound while a decode is stuck.
+    if (s.bytes + buf.length > DictationGateway.MAX_BUFFERED_BYTES) return;
+    s.chunks.push(buf);
+    s.bytes += buf.length;
+    s.lastFrameAt = Date.now();
+  }
+  /**
+   * Periodic check: emits a partial while frames keep arriving, or a final
+   * after a pause / when the utterance is too long. At most one decode per
+   * session runs at a time (`inFlight`).
+   */
+  private async tick(client: Socket): Promise<void> {
+    const s = this.sessions.get(client.id);
+    if (!s || s.inFlight || s.bytes === 0) return;
+    const now = Date.now();
+    const gap = now - s.lastFrameAt;
+    const shouldFinal = gap >= FINALIZE_GAP_MS || s.bytes >= MAX_UTTERANCE_BYTES;
+    const shouldPartial =
+      !shouldFinal && now - s.lastPartialAt >= PARTIAL_EVERY_MS && s.bytes > s.lastDecodedBytes;
+    if (!shouldFinal && !shouldPartial) return;
+    const pcm = Buffer.concat(s.chunks, s.bytes);
+    if (shouldFinal) {
+      // Clear the buffer before decoding so frames of the next utterance do not mix into this one.
+      s.chunks = [];
+      s.bytes = 0;
+      s.lastDecodedBytes = 0;
+    } else {
+      // Keep the merged buffer so the next decode does not re-concatenate every small frame.
+      s.chunks = [pcm];
+      s.lastDecodedBytes = pcm.length;
+      s.lastPartialAt = now;
+    }
+    s.inFlight = true;
+    try {
+      const { text } = await this.transcriber.transcribe(wrapWav(pcm), 'u.wav', 'audio/wav');
+      // The client may have disconnected while the decode was running.
+      if (this.sessions.get(client.id) !== s) return;
+      // A final is always emitted (possibly empty) so the client can close the sentence;
+      // an empty partial carries no information and is skipped.
+      if (shouldFinal) client.emit('dictation:final', { text: text || '' });
+      else if (text) client.emit('dictation:partial', { text });
+    } catch (e) {
+      this.logger.warn(`dictation decode failed: ${e instanceof Error ? e.message : String(e)}`);
+      if (this.sessions.get(client.id) === s) {
+        client.emit('dictation:error', { message: '识别失败' });
+      }
+    } finally {
+      s.inFlight = false;
+    }
+  }
+}
+/**
+ * Wraps raw mono PCM16 samples in a canonical 44-byte RIFF/WAVE header.
+ *
+ * @param pcm        Raw little-endian 16-bit mono samples.
+ * @param sampleRate Sample rate written into the header (defaults to 16 kHz).
+ * @returns A new buffer: header followed by `pcm`.
+ */
+function wrapWav(pcm: Buffer, sampleRate = 16_000): Buffer {
+  const HEADER_BYTES = 44;
+  const BYTES_PER_FRAME = 2; // mono × 16-bit
+  const header = Buffer.alloc(HEADER_BYTES);
+  // Sequential writer: each helper advances the cursor past what it wrote.
+  let cursor = 0;
+  const tag = (fourCc: string): void => {
+    cursor += header.write(fourCc, cursor);
+  };
+  const u32 = (value: number): void => {
+    cursor = header.writeUInt32LE(value, cursor);
+  };
+  const u16 = (value: number): void => {
+    cursor = header.writeUInt16LE(value, cursor);
+  };
+  // RIFF container
+  tag('RIFF');
+  u32(HEADER_BYTES - 8 + pcm.length); // size of everything after this field
+  tag('WAVE');
+  // "fmt " sub-chunk
+  tag('fmt ');
+  u32(16); // sub-chunk size for PCM
+  u16(1); // audio format: PCM
+  u16(1); // channels: mono
+  u32(sampleRate);
+  u32(sampleRate * BYTES_PER_FRAME); // byte rate
+  u16(BYTES_PER_FRAME); // block align
+  u16(16); // bits per sample
+  // "data" sub-chunk
+  tag('data');
+  u32(pcm.length);
+  return Buffer.concat([header, pcm]);
+}
--- a/apps/pac-web/src/components/assistant/assistant-chat.tsx
+++ b/apps/pac-web/src/components/assistant/assistant-chat.tsx
@@ -17,9 +17,16 @@ import {
  Square,
  Wrench,
 } from 'lucide-react';
+import { io, type Socket } from 'socket.io-client';
 import { cn } from '@/lib/utils';
+import { env } from '@/lib/env';
+import { useAuthStore } from '@/stores/auth-store';
+import {
+  bytesToBase64,
+  downsampleTo16k,
+  floatToPcm16,
+} from '@/components/realtime-coach/audio-utils';
 import {
-  transcribeAudio,
  useAssistantChat,
  type Artifact,
  type Block,
@@ -451,55 +458,97 @@ export function AssistantChat() {
    if (taRef.current) taRef.current.style.height = 'auto';
  };
-  // ── 语音听写:点击开始/停止 → 上传转写 → 文字落输入框(可编辑再发)──
+  // ── 实时听写:点击开启 → 边说边出字到输入框;再点 = 纯退出(已上屏文字保留)──
-  const [voice, setVoice] = useState<'idle' | 'recording' | 'busy'>('idle');
+  // 链路:麦克风 PCM16 16k(RMS 静音门控)→ socket.io → 网关按停顿断句调 SenseVoice
-  const recRef = useRef<MediaRecorder | null>(null);
+  //   dictation:partial = 当前句滚动识别(覆盖式);dictation:final = 本句定稿(累加)
-  const chunksRef = useRef<Blob[]>([]);
+  const [voice, setVoice] = useState<'idle' | 'live'>('idle');
  const [voiceErr, setVoiceErr] = useState<string | null>(null);
+  const dictRef = useRef<{
+    socket: Socket;
+    stream: MediaStream;
+    ctx: AudioContext;
+    base: string;
+    finalized: string;
+  } | null>(null);
+  const stopVoice = () => {
+    const d = dictRef.current;
+    dictRef.current = null;
+    if (d) {
+      d.socket.disconnect();
+      d.ctx.close().catch(() => undefined);
+      d.stream.getTracks().forEach((t) => t.stop());
+    }
+    setVoice('idle');
+  };
  const toggleVoice = async () => {
    setVoiceErr(null);
-    if (voice === 'recording') {
+    if (voice === 'live') {
-      recRef.current?.stop();
+      stopVoice();
+      return;
+    }
+    const token = useAuthStore.getState().accessToken;
+    if (!token) {
+      setVoiceErr('未鉴权,请重新登录');
      return;
    }
-    if (voice !== 'idle') return;
-    try {
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-      const mime = MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
-        ? 'audio/webm;codecs=opus'
-        : MediaRecorder.isTypeSupported('audio/mp4')
-          ? 'audio/mp4'
-          : '';
-      const rec = new MediaRecorder(stream, mime ? { mimeType: mime } : undefined);
-      chunksRef.current = [];
-      rec.ondataavailable = (e) => {
-        if (e.data.size > 0) chunksRef.current.push(e.data);
-      };
-      rec.onstop = async () => {
-        stream.getTracks().forEach((t) => t.stop());
-        setVoice('busy');
    try {
-          const blob = new Blob(chunksRef.current, { type: rec.mimeType || 'audio/webm' });
+      const stream = await navigator.mediaDevices.getUserMedia({
-          const text = await transcribeAudio(blob);
+        audio: { echoCancellation: true, noiseSuppression: true, autoGainControl: true },
-          if (text) {
+      });
-            setInput((prev) => (prev.trim() ? `${prev.trimEnd()} ${text}` : text));
+      const socket = io(`${env.apiBaseUrl}/pac/v1/assistant/dictation`, {
-            taRef.current?.focus();
+        query: { token },
+        transports: ['websocket'],
+        forceNew: true,
+      });
+      const ctx = new AudioContext();
+      const source = ctx.createMediaStreamSource(stream);
+      const proc = ctx.createScriptProcessor(4096, 1, 1);
+      // 静音门控:无声不发帧 → 帧间隙就是网关的断句信号(同 realtime-coach 参数)
+      const SILENCE_RMS = 0.03;
+      const ONSET_FRAMES = 2;
+      const HANGOVER_FRAMES = 6;
+      let hangover = 0;
+      let loudStreak = 0;
+      proc.onaudioprocess = (e) => {
+        if (socket.disconnected) return;
+        const f32 = e.inputBuffer.getChannelData(0);
+        let sum = 0;
+        for (let i = 0; i < f32.length; i++) sum += f32[i]! * f32[i]!;
+        const rms = Math.sqrt(sum / f32.length);
+        if (rms > SILENCE_RMS) {
+          loudStreak++;
+          if (loudStreak >= ONSET_FRAMES) hangover = HANGOVER_FRAMES;
        } else {
-            setVoiceErr('没听清,请再说一次');
+          loudStreak = 0;
-          }
-        } catch (err) {
-          setVoiceErr(err instanceof Error ? err.message : '识别失败');
-        } finally {
-          setVoice('idle');
        }
+        if (hangover <= 0) return;
+        hangover--;
+        const pcm = floatToPcm16(downsampleTo16k(f32, ctx.sampleRate));
+        socket.emit('audio:frame', { frame: bytesToBase64(pcm) });
      };
-      recRef.current = rec;
+      source.connect(proc);
-      rec.start();
+      proc.connect(ctx.destination);
-      setVoice('recording');
+      const d = { socket, stream, ctx, base: input.trim() ? `${input.trimEnd()} ` : '', finalized: '' };
+      dictRef.current = d;
+      const render = (partial: string) => setInput(`${d.base}${d.finalized}${partial}`);
+      socket.on('dictation:partial', ({ text }: { text: string }) => render(text));
+      socket.on('dictation:final', ({ text }: { text: string }) => {
+        if (text) d.finalized += text;
+        render('');
+      });
+      socket.on('dictation:error', ({ message }: { message: string }) => setVoiceErr(message));
+      socket.on('connect_error', () => {
+        setVoiceErr('听写连接失败');
+        stopVoice();
+      });
+      setVoice('live');
+      taRef.current?.focus();
    } catch {
      setVoiceErr('无法访问麦克风(请检查浏览器授权)');
-      setVoice('idle');
+      stopVoice();
    }
  };
@@ -579,18 +628,15 @@ export function AssistantChat() {
            <button
              type="button"
              onClick={toggleVoice}
-              disabled={voice === 'busy'}
+              title={voice === 'live' ? '结束听写' : '语音输入(边说边出字)'}
-              title={voice === 'recording' ? '停止录音' : '语音输入(点击说话)'}
              className={cn(
                'mb-0.5 inline-flex h-8 w-8 flex-none items-center justify-center rounded-lg transition-colors',
-                voice === 'recording'
+                voice === 'live'
                  ? 'animate-pulse bg-rose-100 text-rose-600 ring-1 ring-inset ring-rose-300'
-                  : voice === 'busy'
-                    ? 'bg-slate-100 text-slate-400'
                  : 'bg-slate-100 text-slate-500 hover:bg-teal-50 hover:text-teal-700',
              )}
            >
-              {voice === 'busy' ? <Loader2 className="h-4 w-4 animate-spin" /> : <Mic className="h-4 w-4" />}
+              <Mic className="h-4 w-4" />
            </button>
            {status === 'streaming' ? (
              <button
@@ -619,8 +665,8 @@ export function AssistantChat() {
          <p className="mt-1.5 text-center text-[10.5px] text-slate-400">
            {voiceErr ? (
              <span className="text-rose-500">{voiceErr}</span>
-            ) : voice === 'recording' ? (
+            ) : voice === 'live' ? (
-              <span className="text-rose-500">● 录音中… 再点一下麦克风结束</span>
+              <span className="text-rose-500">● 听写中,边说边出字… 再点麦克风退出</span>
            ) : (
              '结果仅供参考 请核对后使用'
            )}