const { useEffect, useMemo, useRef, useState } = React; const STOP_CLOSE_TIMEOUT_MS = 150; const FILE_STOP_CLOSE_TIMEOUT_MS = 10000; const WS_KEEPALIVE_INTERVAL_MS = 15000; const WS_AUTO_RECONNECT_DELAY_MS = 250; const AUTO_SCROLL_THRESHOLD_PX = 28; const FILE_STREAM_CHUNK_SAMPLES = 512; const FILE_SAMPLE_RATE = 16000; const FILE_TAIL_PAD_FRAMES = 0; const FILE_TAIL_PAD_FRAME_MS = 10; const FILE_TAIL_PAD_SAMPLES = Math.round((FILE_SAMPLE_RATE * FILE_TAIL_PAD_FRAMES * FILE_TAIL_PAD_FRAME_MS) / 1000); const FILE_TAIL_PAD_SETTLE_MS = 0; const FILE_STREAM_SPEED_MULTIPLIER = 4; const WAVEFORM_POINT_COUNT = 128; const WAVEFORM_POINTS_PER_CHUNK = 10; const WAVEFORM_VISUAL_GAIN = 10; const LANGUAGE_LABELS = { zh: "Chinese", en: "English", ja: "Japanese", ko: "Korean", yue: "Cantonese", ar: "Arabic", de: "German", fr: "French", es: "Spanish", pt: "Portuguese", id: "Indonesian", it: "Italian", ru: "Russian", th: "Thai", vi: "Vietnamese", tr: "Turkish", hi: "Hindi", ms: "Malay", nl: "Dutch", sv: "Swedish", da: "Danish", fi: "Finnish", pl: "Polish", cs: "Czech", fil: "Filipino", fa: "Persian", el: "Greek", hu: "Hungarian", mk: "Macedonian", ro: "Romanian", }; const MODE_OPTIONS = [ { value: "demo", label: "Demo" }, { value: "default", label: "Default" }, { value: "vertical", label: "Vertical" }, { value: "grid", label: "Grid" }, ]; const DEMO_PROVIDER_NAME = "zipformer_new_016s_punctuation_beam"; const DEMO_PROVIDER_NAMES = [DEMO_PROVIDER_NAME]; const INPUT_MODE_OPTIONS = [ { value: "microphone", label: "Microphone" }, { value: "file", label: "Static Resource" }, ]; function makeSilentWaveform() { return Array.from({ length: WAVEFORM_POINT_COUNT }, () => 0); } function sampleWaveformPoints(samples) { if (!samples || !samples.length) return []; const segmentSize = Math.max(1, Math.floor(samples.length / WAVEFORM_POINTS_PER_CHUNK)); const points = []; for (let start = 0; start < samples.length; start += segmentSize) { let signedPeak = 0; for (let i = start; i < 
Math.min(samples.length, start + segmentSize); i += 1) { const value = samples[i]; if (Math.abs(value) > Math.abs(signedPeak)) { signedPeak = value; } } points.push(Math.max(-1, Math.min(1, signedPeak * WAVEFORM_VISUAL_GAIN))); if (points.length >= WAVEFORM_POINTS_PER_CHUNK) break; } return points; } function makeProviderMetaMap(providerCatalog) { return Object.fromEntries(providerCatalog.map((item) => [item.name, item])); } function makeProviderState(name, providerMetaMap) { const meta = providerMetaMap[name] || { label: name, detail: "", supported_languages: [] }; return { name, label: meta.label, detail: meta.detail, supportedLanguages: Array.isArray(meta.supported_languages) ? meta.supported_languages : [], partial: "...", committedText: "", }; } function getDisplayText(provider) { const stable = String(provider.committedText || "").trim(); const live = normalizeLiveText(provider.partial); return joinSessionText(stable, live); } function normalizeLiveText(text) { const value = String(text || "").trim(); if (!value || value === "...") return ""; return value; } function makeInitialProviders(providerCatalog) { const providerMetaMap = makeProviderMetaMap(providerCatalog); return Object.fromEntries(providerCatalog.map((item) => [item.name, makeProviderState(item.name, providerMetaMap)])); } function PlaybackIcon({ state }) { if (state === "playing") { return ( ); } return ( ); } function AudioWaveform({ samples }) { const canvasRef = useRef(null); useEffect(() => { const canvas = canvasRef.current; if (!canvas) return; const rect = canvas.getBoundingClientRect(); const cssWidth = Math.max(1, rect.width || canvas.clientWidth || 900); const cssHeight = Math.max(1, rect.height || canvas.clientHeight || 104); const dpr = window.devicePixelRatio || 1; const width = Math.floor(cssWidth * dpr); const height = Math.floor(cssHeight * dpr); if (canvas.width !== width || canvas.height !== height) { canvas.width = width; canvas.height = height; } const ctx = 
canvas.getContext("2d"); ctx.setTransform(dpr, 0, 0, dpr, 0, 0); ctx.clearRect(0, 0, cssWidth, cssHeight); const centerY = cssHeight / 2; const waveform = samples && samples.length ? samples : makeSilentWaveform(); const glow = ctx.createLinearGradient(0, 0, cssWidth, 0); glow.addColorStop(0, "rgba(47, 211, 255, 0.08)"); glow.addColorStop(0.5, "rgba(142, 247, 255, 0.24)"); glow.addColorStop(1, "rgba(47, 211, 255, 0.08)"); ctx.fillStyle = glow; ctx.fillRect(0, centerY - 1, cssWidth, 2); ctx.beginPath(); waveform.forEach((value, index) => { const x = waveform.length <= 1 ? 0 : (index / (waveform.length - 1)) * cssWidth; const y = centerY - Math.max(-1, Math.min(1, value)) * centerY * 0.78; if (index === 0) { ctx.moveTo(x, y); } else { ctx.lineTo(x, y); } }); ctx.lineWidth = 2.4; ctx.lineJoin = "round"; ctx.lineCap = "round"; ctx.shadowColor = "rgba(142, 247, 255, 0.7)"; ctx.shadowBlur = 14; ctx.strokeStyle = "rgba(142, 247, 255, 0.96)"; ctx.stroke(); ctx.shadowBlur = 0; ctx.beginPath(); waveform.forEach((value, index) => { const x = waveform.length <= 1 ? 0 : (index / (waveform.length - 1)) * cssWidth; const y = centerY - Math.max(-1, Math.min(1, value)) * centerY * 0.78; if (index === 0) { ctx.moveTo(x, y); } else { ctx.lineTo(x, y); } }); ctx.lineWidth = 1; ctx.strokeStyle = "rgba(230, 240, 255, 0.85)"; ctx.stroke(); }, [samples]); return (
); } function SpeakerIcon({ muted }) { return ( ); } function isSupportedAudioFile(file) { if (!file) return false; if (String(file.type || "").startsWith("audio/")) return true; return /\.(wav|mp3|m4a|aac|flac|ogg|webm)$/i.test(String(file.name || "")); } function TranscriptBox({ text, className = "" }) { const boxRef = useRef(null); const followTailRef = useRef(true); useEffect(() => { const element = boxRef.current; if (!element) return; if (followTailRef.current) { element.scrollTop = element.scrollHeight; } }, [text]); function handleScroll() { const element = boxRef.current; if (!element) return; const distanceFromBottom = element.scrollHeight - element.scrollTop - element.clientHeight; followTailRef.current = distanceFromBottom <= AUTO_SCROLL_THRESHOLD_PX; } const boxClassName = ["text-box", "transcript-box", className].filter(Boolean).join(" "); return (
{text || "..."}
); } function App() { const [status, setStatus] = useState("idle"); const [providerCatalog, setProviderCatalog] = useState([]); const [providers, setProviders] = useState(() => makeInitialProviders([])); const [selectedLanguage, setSelectedLanguage] = useState("zh"); const [selectedMode, setSelectedMode] = useState("default"); const [selectedInputMode, setSelectedInputMode] = useState("microphone"); const [selectedDefaultProviders, setSelectedDefaultProviders] = useState(["zipformer_new_016s_punctuation"]); const [draftDefaultProviders, setDraftDefaultProviders] = useState(["zipformer_new_016s_punctuation"]); const [selectedFile, setSelectedFile] = useState(null); const [isFileDragOver, setIsFileDragOver] = useState(false); const [selectedFilePlaybackMode, setSelectedFilePlaybackMode] = useState("muted"); const [filePlaybackState, setFilePlaybackState] = useState("stopped"); const [inputNotice, setInputNotice] = useState(""); const [startNeedsAttention, setStartNeedsAttention] = useState(false); const [catalogLoaded, setCatalogLoaded] = useState(false); const [sessionId, setSessionId] = useState(""); const [startedAt, setStartedAt] = useState(null); const [durationTick, setDurationTick] = useState(0); const [inputLevel, setInputLevel] = useState(0); const [waveformSamples, setWaveformSamples] = useState(() => makeSilentWaveform()); const wsRef = useRef(null); const mediaStreamRef = useRef(null); const audioContextRef = useRef(null); const processorRef = useRef(null); const microphoneSourceRef = useRef(null); const microphoneSinkGainRef = useRef(null); const microphoneSessionWsRef = useRef(null); const providerCatalogRef = useRef([]); const fileInputRef = useRef(null); const streamAbortRef = useRef({ aborted: false }); const filePlaybackBufferRef = useRef(null); const filePlaybackFileRef = useRef(null); const filePlaybackSourceRef = useRef(null); const filePlaybackGainRef = useRef(null); const filePlaybackProcessorRef = useRef(null); const filePlaybackSessionWsRef = 
useRef(null); const filePlaybackOffsetRef = useRef(0); const filePlaybackStartAtRef = useRef(0); const filePlaybackDurationRef = useRef(0); const filePlaybackStopReasonRef = useRef("idle"); const activeSessionProviderKeyRef = useRef(""); const latestStartSessionRef = useRef(() => {}); const transitionInProgressRef = useRef(false); const wsKeepaliveTimerRef = useRef(null); const autoReconnectTimerRef = useRef(null); const expectedWsCloseRef = useRef(false); const inputLevelFrameRef = useRef(null); const latestInputLevelRef = useRef(0); const waveformFrameRef = useRef(null); const latestWaveformSamplesRef = useRef(makeSilentWaveform()); useEffect(() => { function handleBeforeUnload() { stopSession({ immediateClose: true }); } window.addEventListener("beforeunload", handleBeforeUnload); return () => { window.removeEventListener("beforeunload", handleBeforeUnload); stopSession({ immediateClose: true }); }; }, []); useEffect(() => { providerCatalogRef.current = providerCatalog; }, [providerCatalog]); useEffect( () => () => { if (inputLevelFrameRef.current) { cancelAnimationFrame(inputLevelFrameRef.current); } if (waveformFrameRef.current) { cancelAnimationFrame(waveformFrameRef.current); } }, [] ); useEffect(() => { if (!startedAt) { setDurationTick(0); return; } const timer = setInterval(() => setDurationTick((value) => value + 1), 1000); return () => clearInterval(timer); }, [startedAt]); useEffect(() => { let cancelled = false; async function loadProviderCatalog() { try { const response = await fetch("/api/provider-catalog", { cache: "no-store" }); if (!response.ok) { throw new Error(`failed to load provider catalog: ${response.status}`); } const payload = await response.json(); const nextCatalog = Array.isArray(payload.providers) ? 
payload.providers : []; if (cancelled) return; setProviderCatalog(nextCatalog); setProviders((prev) => { const providerMetaMap = makeProviderMetaMap(nextCatalog); const nextEntries = nextCatalog.map((item) => { const current = prev[item.name]; if (!current) { return [item.name, makeProviderState(item.name, providerMetaMap)]; } return [ item.name, { ...current, label: item.label || item.name, detail: item.detail || "", supportedLanguages: Array.isArray(item.supported_languages) ? item.supported_languages : [], }, ]; }); return Object.fromEntries(nextEntries); }); } catch (error) { console.error(error); } finally { if (!cancelled) { setCatalogLoaded(true); } } } loadProviderCatalog(); return () => { cancelled = true; }; }, []); const durationText = useMemo(() => { if (!startedAt) return "--"; return `${Math.max(0, Math.round((Date.now() - startedAt) / 1000))}s`; }, [startedAt, durationTick]); const availableLanguages = useMemo(() => { const values = new Set(); providerCatalog.forEach((item) => { (item.supported_languages || []).forEach((language) => { if (String(language || "").trim()) { values.add(String(language).trim()); } }); }); const ordered = Array.from(values); if (ordered.includes("zh")) { return ["zh", ...ordered.filter((language) => language !== "zh")]; } return ordered; }, [providerCatalog]); useEffect(() => { if (!availableLanguages.includes(selectedLanguage)) { setSelectedLanguage(availableLanguages[0] || "zh"); } }, [availableLanguages, selectedLanguage]); const visibleProviderCatalog = useMemo(() => { return providerCatalog.filter((item) => (item.supported_languages || []).includes(selectedLanguage)); }, [providerCatalog, selectedLanguage]); const visibleProviderMap = useMemo( () => Object.fromEntries(visibleProviderCatalog.map((item) => [item.name, item])), [visibleProviderCatalog] ); const providerCatalogMap = useMemo( () => Object.fromEntries(providerCatalog.map((item) => [item.name, item])), [providerCatalog] ); const providerOrder = useMemo( () 
=> [ "paraformer_sherpa", "sensevoice_sherpa", "qwen3", "zipformer_new_016s_punctuation", "zipformer_new_0_5s_punctuation", "zipformer_new_1s_punctuation", "zipformer_new_2s_punctuation", "zipformer_viet_chunk16", "zipformer_new_016s_punctuation_beam", ], [] ); const visibleProviderNames = useMemo( () => providerOrder.filter((name) => Boolean(visibleProviderMap[name])), [providerOrder, visibleProviderMap] ); useEffect(() => { if (!catalogLoaded) return; const allowed = new Set(visibleProviderNames); const normalizeSelection = (selection) => selection.filter((name, index) => allowed.has(name) && selection.indexOf(name) === index); const nextSelected = normalizeSelection(selectedDefaultProviders); if (nextSelected.join("|") !== selectedDefaultProviders.join("|")) { setSelectedDefaultProviders(nextSelected); } const nextDraft = normalizeSelection(draftDefaultProviders); if (nextDraft.join("|") !== draftDefaultProviders.join("|")) { setDraftDefaultProviders(nextDraft); } }, [catalogLoaded, visibleProviderNames, selectedDefaultProviders, draftDefaultProviders]); const gridRows = useMemo( () => [ { className: "provider-grid provider-grid--three", names: ["paraformer_sherpa", "sensevoice_sherpa", "qwen3"], }, { className: "provider-grid provider-grid--four", names: [ "zipformer_new_016s_punctuation", "zipformer_new_0_5s_punctuation", "zipformer_new_1s_punctuation", "zipformer_new_2s_punctuation", ], }, { className: "provider-grid provider-grid--four", names: ["zipformer_viet_chunk16", "zipformer_new_016s_punctuation_beam"], }, ], [] ); const activeSessionProviderNames = useMemo(() => { if (selectedMode === "demo") { return providerCatalogMap[DEMO_PROVIDER_NAME] ? 
DEMO_PROVIDER_NAMES : []; } if (selectedMode === "default") { return selectedDefaultProviders.filter((name) => Boolean(visibleProviderMap[name])); } return visibleProviderNames; }, [selectedMode, providerCatalogMap, selectedDefaultProviders, visibleProviderMap, visibleProviderNames]); const activeSessionProviderKey = useMemo( () => activeSessionProviderNames.join("|"), [activeSessionProviderNames] ); function buildVisibleProvidersForLanguage(language) { const catalogForLanguage = providerCatalog.filter((item) => (item.supported_languages || []).includes(language)); const mapForLanguage = Object.fromEntries(catalogForLanguage.map((item) => [item.name, item])); const namesForLanguage = providerOrder.filter((name) => Boolean(mapForLanguage[name])); return { mapForLanguage, namesForLanguage }; } function computeTargetProviderNames(nextLanguage, nextMode, nextDefaultProviders = selectedDefaultProviders) { const { mapForLanguage, namesForLanguage } = buildVisibleProvidersForLanguage(nextLanguage); const effectiveDefaultProviders = (nextDefaultProviders || []).filter( (name, index, array) => Boolean(mapForLanguage[name]) && array.indexOf(name) === index ); const targetNames = nextMode === "demo" ? providerCatalogMap[DEMO_PROVIDER_NAME] ? DEMO_PROVIDER_NAMES : [] : nextMode === "default" ? 
effectiveDefaultProviders : namesForLanguage; return { targetNames, effectiveDefaultProviders }; } function handleProviderSelectionChange({ nextLanguage = selectedLanguage, nextMode = selectedMode, nextDefaultProviders = selectedDefaultProviders, }) { const { targetNames, effectiveDefaultProviders } = computeTargetProviderNames( nextLanguage, nextMode, nextDefaultProviders ); const targetKey = targetNames.join("|"); setSelectedLanguage(nextLanguage); setSelectedMode(nextMode); if (nextMode === "demo") { setSelectedInputMode("microphone"); } setSelectedDefaultProviders(effectiveDefaultProviders); setDraftDefaultProviders(effectiveDefaultProviders); if (!wsRef.current) return; if (status !== "connecting" && status !== "streaming") return; if (!activeSessionProviderKeyRef.current) return; if (transitionInProgressRef.current) return; if (activeSessionProviderKeyRef.current === targetKey) return; const currentNames = activeSessionProviderKeyRef.current.split("|").filter(Boolean); const hasReuse = currentNames.some((name) => targetNames.includes(name)); if (hasReuse) { try { wsRef.current.send(JSON.stringify({ event: "switch_providers", providers: targetNames })); activeSessionProviderKeyRef.current = targetKey; setInputNotice(targetNames.length ? "" : "No active models are available for the current view."); setStartNeedsAttention(false); } catch (error) { console.error(error); setInputNotice("Failed to switch models cleanly. Please press Start again."); setStartNeedsAttention(true); } return; } transitionInProgressRef.current = true; activeSessionProviderKeyRef.current = ""; setSessionId(""); if (targetNames.length <= 1) { setInputNotice( targetNames.length ? "Focus model changed. Press Start to run the new model." : "No active models are available for the current view." 
); setStartNeedsAttention(targetNames.length > 0); stopSession({ immediateClose: true, detachSocket: true }); transitionInProgressRef.current = false; return; } setInputNotice(""); setStartNeedsAttention(false); stopSession({ immediateClose: true, detachSocket: true }); transitionInProgressRef.current = false; window.setTimeout(() => { latestStartSessionRef.current(); }, 0); } const defaultSelectionDirty = useMemo( () => draftDefaultProviders.join("|") !== selectedDefaultProviders.join("|"), [draftDefaultProviders, selectedDefaultProviders] ); function toggleDraftDefaultProvider(name) { setDraftDefaultProviders((prev) => prev.includes(name) ? prev.filter((item) => item !== name) : [...prev, name] ); } function applyDefaultProviderSelection() { handleProviderSelectionChange({ nextDefaultProviders: draftDefaultProviders }); } function clearWsKeepalive() { if (wsKeepaliveTimerRef.current) { clearInterval(wsKeepaliveTimerRef.current); wsKeepaliveTimerRef.current = null; } } function clearAutoReconnect() { if (autoReconnectTimerRef.current) { clearTimeout(autoReconnectTimerRef.current); autoReconnectTimerRef.current = null; } } function startWsKeepalive(ws) { clearWsKeepalive(); wsKeepaliveTimerRef.current = setInterval(() => { if (wsRef.current !== ws || ws.readyState !== WebSocket.OPEN) return; try { ws.send(JSON.stringify({ event: "ping" })); } catch (error) { console.warn("failed to send ws keepalive", error); } }, WS_KEEPALIVE_INTERVAL_MS); } function getMicrophonePreflightError() { if (!window.isSecureContext) { return "Microphone mode requires HTTPS. 
Please use the secure domain or switch to Static Resource."; } if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) { return "This browser cannot access the microphone in the current context."; } return ""; } function formatInputStartError(error) { const name = String(error?.name || ""); if (name === "NotAllowedError" || name === "SecurityError") { return "Microphone permission was denied."; } if (name === "NotFoundError" || name === "DevicesNotFoundError") { return "No microphone device was found."; } if (name === "NotReadableError" || name === "TrackStartError" || name === "AbortError") { return "The microphone is busy or unavailable."; } return error?.message ? `Failed to start input stream: ${error.message}` : "Failed to start input stream."; } function updateInputLevel(nextLevel) { latestInputLevelRef.current = Math.max(0, Math.min(1, Number(nextLevel) || 0)); if (inputLevelFrameRef.current) return; inputLevelFrameRef.current = requestAnimationFrame(() => { inputLevelFrameRef.current = null; setInputLevel(latestInputLevelRef.current); }); } function updateWaveform(samples) { const points = sampleWaveformPoints(samples); if (!points.length) return; latestWaveformSamplesRef.current = latestWaveformSamplesRef.current .slice(points.length) .concat(points); if (waveformFrameRef.current) return; waveformFrameRef.current = requestAnimationFrame(() => { waveformFrameRef.current = null; setWaveformSamples(latestWaveformSamplesRef.current); }); } function resetInputLevel() { latestInputLevelRef.current = 0; if (inputLevelFrameRef.current) { cancelAnimationFrame(inputLevelFrameRef.current); inputLevelFrameRef.current = null; } if (waveformFrameRef.current) { cancelAnimationFrame(waveformFrameRef.current); waveformFrameRef.current = null; } setInputLevel(0); latestWaveformSamplesRef.current = makeSilentWaveform(); setWaveformSamples(latestWaveformSamplesRef.current); } function updateInputLevelFromSamples(samples) { if (!samples || !samples.length) { 
updateInputLevel(0); return { rms: 0, peak: 0 }; } let sumSq = 0; let peak = 0; for (let i = 0; i < samples.length; i += 1) { const value = samples[i]; sumSq += value * value; const abs = Math.abs(value); if (abs > peak) peak = abs; } const rms = Math.sqrt(sumSq / samples.length); updateInputLevel(Math.min(1, rms * 24 + peak * 0.35)); updateWaveform(samples); return { rms, peak }; } async function startSession(options = {}) { const { preserveTranscripts = false, preserveStartedAt = false, reconnecting = false } = options; if (transitionInProgressRef.current) return; if (wsRef.current) return; clearAutoReconnect(); expectedWsCloseRef.current = false; if (selectedInputMode === "file" && !selectedFile) { setInputNotice("Please choose an audio file first."); return; } if (!activeSessionProviderNames.length) { setInputNotice("No active models are available for the current view."); return; } const micPreflightError = selectedInputMode === "file" ? "" : getMicrophonePreflightError(); if (micPreflightError) { setInputNotice(micPreflightError); setStartNeedsAttention(true); setStatus("idle"); setStartedAt(null); return; } if (selectedInputMode !== "file") { try { await prepareMicrophone(); } catch (error) { console.error(error); stopMicrophoneOnly(); setInputNotice(formatInputStartError(error)); setStartNeedsAttention(true); setStatus("idle"); setStartedAt(null); return; } } activeSessionProviderKeyRef.current = activeSessionProviderKey; setInputNotice(reconnecting ? "Connection refreshed. Reconnecting..." 
: ""); setStartNeedsAttention(false); setStatus("connecting"); setSessionId(""); if (!preserveStartedAt) { setStartedAt(Date.now()); } if (!preserveTranscripts) { setProviders(makeInitialProviders(providerCatalog)); } streamAbortRef.current = { aborted: false }; if (selectedInputMode === "file") { try { await prepareFilePlaybackEngine(selectedFile); } catch (error) { console.warn("failed to prepare file playback", error); setInputNotice("Playback unavailable, continuing without local audio."); } } const protocol = location.protocol === "https:" ? "wss" : "ws"; const sessionQuery = new URLSearchParams(); sessionQuery.set("providers", activeSessionProviderNames.join(",")); const sessionUrl = `${protocol}://${location.host}/ws?${sessionQuery.toString()}`; const ws = new WebSocket(sessionUrl); ws.binaryType = "arraybuffer"; ws.onopen = async () => { startWsKeepalive(ws); try { if (selectedInputMode === "file") { setStatus("streaming"); await startFileStream(ws, selectedFile, streamAbortRef.current, selectedFilePlaybackMode); } else { startMicrophone(ws); setStatus("streaming"); } } catch (error) { console.error(error); destroyFilePlayback(); setInputNotice(formatInputStartError(error)); setStartNeedsAttention(true); if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) { ws.close(1011, "input-start-failed"); } } }; ws.onmessage = (event) => { if (wsRef.current !== ws) return; if (typeof event.data !== "string") return; const payload = JSON.parse(event.data); if (payload.action === "session_ready") { setSessionId(payload.data.session_id || ""); const activeProviders = Array.isArray(payload.data.providers) ? 
payload.data.providers : []; activeSessionProviderKeyRef.current = activeProviders.join("|"); if (activeProviders.length) { setProviders((prev) => { const providerMetaMap = makeProviderMetaMap(providerCatalogRef.current); const next = { ...prev }; activeProviders.forEach((name) => { if (!next[name]) { next[name] = makeProviderState(name, providerMetaMap); } }); return next; }); } } if (payload.action === "providers_updated") { const activeProviders = Array.isArray(payload.data.providers) ? payload.data.providers : []; activeSessionProviderKeyRef.current = activeProviders.join("|"); } if (payload.action === "asr_update") { const data = payload.data || {}; const provider = String(data.provider || ""); setProviders((prev) => { const providerMetaMap = makeProviderMetaMap(providerCatalogRef.current); const current = prev[provider] || makeProviderState(provider, providerMetaMap); const text = String(data.text || ""); const nextPartial = data.type === "partial" ? text.trim() ? text : current.partial : current.partial; const next = { ...current, partial: nextPartial, committedText: data.type === "final" || data.type === "stable" ? joinSessionText(current.committedText, text) : current.committedText, }; if (data.type === "final" || data.type === "stable") { next.partial = "..."; } if (data.type === "blank") { next.partial = text.trim() ? text : current.partial; } return { ...prev, [provider]: next }; }); } if (payload.action === "provider_error") { const data = payload.data || {}; const provider = String(data.provider || ""); setProviders((prev) => { const providerMetaMap = makeProviderMetaMap(providerCatalogRef.current); const current = prev[provider] || makeProviderState(provider, providerMetaMap); return { ...prev, [provider]: { ...current, partial: data.message ? 
`[error] ${String(data.message)}` : "[error]", }, }; }); } }; ws.onclose = () => { if (wsRef.current !== ws) return; const wasExpected = expectedWsCloseRef.current; clearWsKeepalive(); wsRef.current = null; activeSessionProviderKeyRef.current = ""; setSessionId(""); transitionInProgressRef.current = false; const shouldAutoReconnect = !wasExpected && selectedInputMode === "microphone" && activeSessionProviderNames.length > 0; if (shouldAutoReconnect) { setStatus("connecting"); clearAutoReconnect(); autoReconnectTimerRef.current = window.setTimeout(() => { latestStartSessionRef.current({ preserveTranscripts: true, preserveStartedAt: true, reconnecting: true, }); }, WS_AUTO_RECONNECT_DELAY_MS); return; } destroyFilePlayback(); setStatus("idle"); setStartedAt(null); }; wsRef.current = ws; } useEffect(() => { latestStartSessionRef.current = startSession; }); async function prepareMicrophone() { if (mediaStreamRef.current && audioContextRef.current && processorRef.current) { if (audioContextRef.current.state === "suspended") { await audioContextRef.current.resume(); } return; } const stream = await navigator.mediaDevices.getUserMedia({ audio: { channelCount: 1, echoCancellation: false, autoGainControl: false, noiseSuppression: false, }, }); mediaStreamRef.current = stream; const audioContext = new AudioContext({ sampleRate: 16000 }); audioContextRef.current = audioContext; if (audioContext.state === "suspended") { await audioContext.resume(); } const source = audioContext.createMediaStreamSource(stream); const processor = audioContext.createScriptProcessor(512, 1, 1); const sinkGain = audioContext.createGain(); sinkGain.gain.setValueAtTime(0, audioContext.currentTime); window.__codexMicDebug = window.__codexMicDebug || { chunkCount: 0, firstChunkAt: null }; processor.onaudioprocess = (event) => { const ws = microphoneSessionWsRef.current; if (!ws || ws.readyState !== WebSocket.OPEN) return; const channelCount = Math.max(1, event.inputBuffer.numberOfChannels || 1); const 
frameCount = event.inputBuffer.length || 0; const mixed = new Float32Array(frameCount); for (let channelIndex = 0; channelIndex < channelCount; channelIndex += 1) { const channel = event.inputBuffer.getChannelData(channelIndex); for (let i = 0; i < frameCount; i += 1) { mixed[i] += channel[i] / channelCount; } } const { rms, peak } = updateInputLevelFromSamples(mixed); const pcm = float32ToInt16(mixed); ws.send(pcm.buffer); const debug = window.__codexMicDebug || (window.__codexMicDebug = { chunkCount: 0, firstChunkAt: null, firstChunkRms: 0, firstChunkPeak: 0, channelCount: 0 }); debug.chunkCount += 1; if (!debug.firstChunkAt) { debug.firstChunkAt = Date.now(); debug.firstChunkRms = Number(rms.toFixed(6)); debug.firstChunkPeak = Number(peak.toFixed(6)); debug.channelCount = channelCount; console.info('microphone first chunk sent', pcm.length, 'channels', channelCount, 'rms', debug.firstChunkRms, 'peak', debug.firstChunkPeak); } }; source.connect(processor); processor.connect(sinkGain); sinkGain.connect(audioContext.destination); microphoneSourceRef.current = source; microphoneSinkGainRef.current = sinkGain; processorRef.current = processor; } function stopMicrophoneOnly() { microphoneSessionWsRef.current = null; resetInputLevel(); if (processorRef.current) { processorRef.current.disconnect(); processorRef.current.onaudioprocess = null; processorRef.current = null; } if (microphoneSinkGainRef.current) { try { microphoneSinkGainRef.current.disconnect(); } catch (error) { console.warn("failed to disconnect microphone sink", error); } microphoneSinkGainRef.current = null; } if (microphoneSourceRef.current) { try { microphoneSourceRef.current.disconnect(); } catch (error) { console.warn("failed to disconnect microphone source", error); } microphoneSourceRef.current = null; } if (mediaStreamRef.current) { mediaStreamRef.current.getTracks().forEach((track) => track.stop()); mediaStreamRef.current = null; } if (audioContextRef.current) { audioContextRef.current.close(); 
audioContextRef.current = null; } } function startMicrophone(ws) { const processor = processorRef.current; const audioContext = audioContextRef.current; if (!processor || !audioContext) { throw new Error("Microphone processor is not ready."); } microphoneSessionWsRef.current = ws; if (audioContext.state === "suspended") { audioContext.resume().catch((error) => { console.warn("failed to resume microphone audio context", error); }); } } async function startFileStream(ws, file, abortState, playbackMode) { await prepareFilePlaybackEngine(file); if (abortState.aborted || ws.readyState !== WebSocket.OPEN) return; filePlaybackSessionWsRef.current = ws; applyFilePlaybackMode(playbackMode); await startFilePlaybackSource({ restart: true }); } function applyFilePlaybackMode(mode) { const gain = filePlaybackGainRef.current; if (!gain) return; const muted = mode === "muted"; gain.gain.setValueAtTime(muted ? 0 : 1, gain.context.currentTime); } async function prepareFilePlaybackEngine(file) { if (!file) return; let audioContext = audioContextRef.current; if (!audioContext || audioContext.state === "closed") { audioContext = new AudioContext({ sampleRate: 16000 }); audioContextRef.current = audioContext; } if (audioContext.state === "suspended") { await audioContext.resume(); } const sameFile = filePlaybackFileRef.current === file && filePlaybackBufferRef.current && filePlaybackProcessorRef.current && filePlaybackGainRef.current; if (sameFile) { applyFilePlaybackMode(selectedFilePlaybackMode); return; } destroyFilePlayback({ closeContext: false, preserveFileSelection: true }); const mono16k = await decodeFileTo16kMono(file); filePlaybackBufferRef.current = mono16k; filePlaybackFileRef.current = file; filePlaybackDurationRef.current = mono16k.length / 16000; filePlaybackOffsetRef.current = 0; const gain = audioContext.createGain(); gain.connect(audioContext.destination); filePlaybackGainRef.current = gain; const processor = 
audioContext.createScriptProcessor(FILE_STREAM_CHUNK_SAMPLES, 1, 1); processor.onaudioprocess = (event) => { const input = event.inputBuffer.getChannelData(0); const output = event.outputBuffer.getChannelData(0); if (input && output) { output.set(input); } const ws = filePlaybackSessionWsRef.current; if (!ws || ws.readyState !== WebSocket.OPEN || !input || input.length === 0) return; updateInputLevelFromSamples(input); const pcm = float32ToInt16(input); ws.send(pcm.buffer); }; processor.connect(gain); filePlaybackProcessorRef.current = processor; applyFilePlaybackMode(selectedFilePlaybackMode); } function getCurrentFilePlaybackOffset() { const audioContext = audioContextRef.current; if (!audioContext || !filePlaybackSourceRef.current) { return filePlaybackOffsetRef.current; } const elapsed = Math.max(0, audioContext.currentTime - filePlaybackStartAtRef.current); return Math.min(filePlaybackDurationRef.current, filePlaybackOffsetRef.current + elapsed); } function stopCurrentFileSource(reason) { const source = filePlaybackSourceRef.current; if (!source) return; if (reason === "pause") { filePlaybackOffsetRef.current = getCurrentFilePlaybackOffset(); } else if (reason === "destroy") { filePlaybackOffsetRef.current = 0; } filePlaybackStopReasonRef.current = reason; filePlaybackSourceRef.current = null; try { source.stop(); } catch (error) { console.warn("failed to stop file source", error); } try { source.disconnect(); } catch (error) { console.warn("failed to disconnect file source", error); } } async function startFilePlaybackSource(options = {}) { const { restart = false } = options; const audioContext = audioContextRef.current; const samples = filePlaybackBufferRef.current; if (!audioContext || !samples) return; if (audioContext.state === "suspended") { await audioContext.resume(); } if (filePlaybackSourceRef.current) { stopCurrentFileSource("destroy"); } const audioBuffer = audioContext.createBuffer(1, samples.length, 16000); audioBuffer.copyToChannel(samples, 0); 
const source = audioContext.createBufferSource(); source.buffer = audioBuffer; source.connect(filePlaybackProcessorRef.current); const offset = restart ? 0 : filePlaybackOffsetRef.current; filePlaybackOffsetRef.current = offset; filePlaybackStartAtRef.current = audioContext.currentTime; filePlaybackStopReasonRef.current = "natural"; filePlaybackSourceRef.current = source; source.onended = () => { if (filePlaybackSourceRef.current === source) { filePlaybackSourceRef.current = null; } const reason = filePlaybackStopReasonRef.current; if (reason === "pause") { setFilePlaybackState("paused"); return; } if (reason === "destroy") { setFilePlaybackState("stopped"); return; } filePlaybackOffsetRef.current = filePlaybackDurationRef.current; setFilePlaybackState("stopped"); const ws = filePlaybackSessionWsRef.current; if (ws && ws.readyState === WebSocket.OPEN) { filePlaybackSessionWsRef.current = null; setStatus("stopping"); try { sendFileTailPadding(ws); } catch (error) { console.warn("failed to send tail padding after file playback", error); } window.setTimeout(() => { if (ws.readyState !== WebSocket.OPEN) return; try { ws.send(JSON.stringify({ event: "stop" })); } catch (error) { console.warn("failed to send stop event after file playback", error); } }, FILE_TAIL_PAD_SETTLE_MS); scheduleSocketClose(ws, FILE_STOP_CLOSE_TIMEOUT_MS); } }; source.start(0, offset); setFilePlaybackState("playing"); } function sendFileTailPadding(ws) { if (!ws || ws.readyState !== WebSocket.OPEN || FILE_TAIL_PAD_SAMPLES <= 0) return; let remainingSamples = FILE_TAIL_PAD_SAMPLES; while (remainingSamples > 0 && ws.readyState === WebSocket.OPEN) { const chunkSamples = Math.min(FILE_STREAM_CHUNK_SAMPLES, remainingSamples); ws.send(new Int16Array(chunkSamples).buffer); remainingSamples -= chunkSamples; } } function pauseFilePlayback() { if (!filePlaybackSourceRef.current) return; stopCurrentFileSource("pause"); } function destroyFilePlayback(options = {}) { const { closeContext = false, 
preserveFileSelection = false } = options; filePlaybackSessionWsRef.current = null; if (filePlaybackSourceRef.current) { stopCurrentFileSource("destroy"); } if (filePlaybackProcessorRef.current) { try { filePlaybackProcessorRef.current.disconnect(); } catch (error) { console.warn("failed to disconnect file processor", error); } filePlaybackProcessorRef.current.onaudioprocess = null; filePlaybackProcessorRef.current = null; } if (filePlaybackGainRef.current) { try { filePlaybackGainRef.current.disconnect(); } catch (error) { console.warn("failed to disconnect file gain", error); } filePlaybackGainRef.current = null; } filePlaybackBufferRef.current = null; filePlaybackFileRef.current = preserveFileSelection ? filePlaybackFileRef.current : null; filePlaybackDurationRef.current = 0; filePlaybackOffsetRef.current = 0; filePlaybackStartAtRef.current = 0; filePlaybackStopReasonRef.current = "idle"; setFilePlaybackState("stopped"); if (closeContext && audioContextRef.current) { audioContextRef.current.close(); audioContextRef.current = null; } } async function handlePlaybackToggle() { if (!selectedFile) { setInputNotice("Please choose an audio file first."); return; } try { await prepareFilePlaybackEngine(selectedFile); if (filePlaybackState === "playing") { pauseFilePlayback(); } else { await startFilePlaybackSource({ restart: filePlaybackState === "stopped" }); } setInputNotice(""); } catch (error) { console.warn("failed to toggle file playback", error); setInputNotice("Playback unavailable in this browser context."); } } function handlePlaybackModeToggle() { setSelectedFilePlaybackMode((current) => { const next = current === "playalong" ? "muted" : "playalong"; applyFilePlaybackMode(next); if (next === "muted") { setInputNotice(status === "streaming" ? "Speaker off. Recognition continues." : "Speaker off."); } else { setInputNotice(status === "streaming" ? "Speaker on." 
: "Speaker on."); } return next; }); } function stopSession(options = {}) { const { immediateClose = false, detachSocket = false } = options; expectedWsCloseRef.current = true; clearAutoReconnect(); if (!detachSocket) { transitionInProgressRef.current = false; } const ws = wsRef.current; streamAbortRef.current.aborted = true; if (processorRef.current) { processorRef.current.disconnect(); processorRef.current = null; } if (mediaStreamRef.current) { mediaStreamRef.current.getTracks().forEach((track) => track.stop()); mediaStreamRef.current = null; } if (audioContextRef.current) { audioContextRef.current.close(); audioContextRef.current = null; } destroyFilePlayback({ closeContext: true }); if (!ws) { clearWsKeepalive(); activeSessionProviderKeyRef.current = ""; setSessionId(""); setStatus("idle"); setStartedAt(null); return; } setStatus("stopping"); if (ws.readyState === WebSocket.OPEN) { try { ws.send(JSON.stringify({ event: "stop" })); } catch (error) { console.warn("failed to send stop event", error); } } if (immediateClose) { if (detachSocket) { clearWsKeepalive(); wsRef.current = null; filePlaybackSessionWsRef.current = null; ws.onopen = null; ws.onmessage = null; ws.onerror = null; ws.onclose = null; activeSessionProviderKeyRef.current = ""; setSessionId(""); setStatus("idle"); setStartedAt(null); } if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) { ws.close(1000, "client-stop"); } return; } scheduleSocketClose(ws, STOP_CLOSE_TIMEOUT_MS); } function handleSelectedFile(file, options = {}) { const { unsupportedMessage = "Please choose a supported local audio file." 
} = options; destroyFilePlayback({ closeContext: false }); setIsFileDragOver(false); if (!file) { setSelectedFile(null); setInputNotice(""); return; } if (!isSupportedAudioFile(file)) { setSelectedFile(null); setInputNotice(unsupportedMessage); return; } setSelectedFile(file); setInputNotice(`Ready: ${file.name}`); } function renderProviderCard(name, options = {}) { const { cardClassName = "", transcriptClassName = "" } = options; const provider = providers[name] || makeProviderState(name, makeProviderMetaMap(providerCatalog)); const articleClassName = ["provider-card", cardClassName].filter(Boolean).join(" "); return (

{provider.label}

{provider.detail}

Transcript

); }
/**
 * Renders the placeholder shown when there are no provider cards to display.
 * The message depends on `catalogLoaded`: a "no matching models" notice once
 * the catalog has loaded, a loading notice before that.
 */
function renderEmpty() { return (
{catalogLoaded ? "No ASR models currently match the selected language." : "Loading ASR models..."}
); }
/**
 * Renders the "default" layout: one focus-styled card per entry of
 * `selectedDefaultProviders` that is present in `visibleProviderMap`, keeping
 * the selection order. Falls back to `renderEmpty()` when none remain.
 */
function renderDefaultMode() { const orderedNames = selectedDefaultProviders.filter((name) => Boolean(visibleProviderMap[name])); if (!orderedNames.length) return renderEmpty(); return (
{orderedNames.map((name) => renderProviderCard(name, { cardClassName: "provider-card--focus", transcriptClassName: "text-box--focus", }) )}
); }
/**
 * Renders the "demo" layout: a single card for `DEMO_PROVIDER_NAME`, or
 * `renderEmpty()` when that provider is missing from `providerCatalogMap`.
 */
function renderDemoMode() { if (!providerCatalogMap[DEMO_PROVIDER_NAME]) return renderEmpty(); return (
{renderProviderCard(DEMO_PROVIDER_NAME, { cardClassName: "provider-card--focus provider-card--demo", transcriptClassName: "text-box--focus text-box--demo", })}
); }
/**
 * Renders the "vertical" layout: one card per entry of `visibleProviderNames`,
 * or `renderEmpty()` when that list is empty.
 */
function renderVerticalMode() { if (!visibleProviderNames.length) return renderEmpty(); return
{visibleProviderNames.map(renderProviderCard)}
; }
/**
 * Renders the "grid" layout from `gridRows`. Each row keeps only the names
 * present in `visibleProviderMap`; rows left without names are dropped. Falls
 * back to `renderEmpty()` when no row remains.
 */
function renderGridMode() { const rows = gridRows .map((row) => ({ ...row, names: row.names.filter((name) => Boolean(visibleProviderMap[name])), })) .filter((row) => row.names.length); if (!rows.length) return renderEmpty(); return rows.map((row, index) => (
{row.names.map(renderProviderCard)}
)); }
// Pick the layout renderer for the selected mode. Any mode other than
// "demo", "vertical" or "grid" falls through to the default layout.
let content = null; const isDemoMode = selectedMode === "demo"; if (isDemoMode) { content = renderDemoMode(); } else if (selectedMode === "vertical") { content = renderVerticalMode(); } else if (selectedMode === "grid") { content = renderGridMode(); } else { content = renderDefaultMode(); } return (

{isDemoMode ? "Focused Live Demo" : "Standalone Multi-ASR Comparison"}

{isDemoMode ? "ASR Live Demo" : "ASR Demo"}

{isDemoMode ? "X-ASR demo: A low latency streaming model" : "Browser microphone input or uploaded audio file, side-by-side comparison of multiple ASR pipelines, with language-based filtering so users can quickly focus on supported models."}

Status {status}
{!isDemoMode && (
Session {sessionId || "--"}
)}
Duration {durationText}
{!isDemoMode && ( )} {!isDemoMode && ( )} {!isDemoMode && selectedMode === "default" && (
Default Models
{visibleProviderNames.map((name) => { const selectedIndex = draftDefaultProviders.indexOf(name); return ( ); })}
)} {!isDemoMode && selectedInputMode === "file" && ( )}
{inputNotice ?
{inputNotice}
: null} {content}
); } ReactDOM.createRoot(document.getElementById("root")).render(); function joinSessionText(previous, next) { const left = String(previous || "").trim(); const right = String(next || "").trim(); if (!right) return left; if (!left) return right; if (left.endsWith(right)) return left; if (right.startsWith(left)) return right; const maxOverlap = Math.min(left.length, right.length); for (let size = maxOverlap; size > 0; size -= 1) { if (left.slice(-size) === right.slice(0, size)) { return `${left}${right.slice(size)}`.trim(); } } return `${left}${needsSpace(left, right) ? " " : ""}${right}`; } function needsSpace(left, right) { return /[A-Za-z0-9]$/.test(left) && /^[A-Za-z0-9]/.test(right); } function scheduleSocketClose(ws, timeoutMs) { window.setTimeout(() => { if (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING) { ws.close(1000, "client-stop"); } }, timeoutMs); } function sleep(ms) { return new Promise((resolve) => window.setTimeout(resolve, ms)); } function float32ToInt16(input) { const pcm = new Int16Array(input.length); for (let i = 0; i < input.length; i += 1) { const sample = Math.max(-1, Math.min(1, input[i])); pcm[i] = sample < 0 ? 
sample * 0x8000 : sample * 0x7fff; } return pcm; } async function decodeFileTo16kMono(file) { const AudioCtx = window.AudioContext || window.webkitAudioContext; const arrayBuffer = await file.arrayBuffer(); const decodeContext = new AudioCtx(); try { const decoded = await decodeContext.decodeAudioData(arrayBuffer.slice(0)); const monoData = mixToMono(decoded); const targetLength = Math.max(1, Math.ceil((monoData.length * 16000) / decoded.sampleRate)); const offline = new OfflineAudioContext(1, targetLength, 16000); const buffer = offline.createBuffer(1, monoData.length, decoded.sampleRate); buffer.copyToChannel(monoData, 0); const source = offline.createBufferSource(); source.buffer = buffer; source.connect(offline.destination); source.start(0); const rendered = await offline.startRendering(); return rendered.getChannelData(0).slice(); } finally { await decodeContext.close().catch(() => {}); } } function mixToMono(audioBuffer) { const { numberOfChannels, length } = audioBuffer; if (numberOfChannels === 1) { return audioBuffer.getChannelData(0); } const mono = new Float32Array(length); for (let channel = 0; channel < numberOfChannels; channel += 1) { const data = audioBuffer.getChannelData(channel); for (let i = 0; i < length; i += 1) { mono[i] += data[i] / numberOfChannels; } } return mono; }