# linnet/app/api/audio.py

# app/api/audio.py — WebSocket audio ingestion endpoint
#
# Receives raw PCM Int16 audio chunks from the browser's AudioWorkletProcessor.
# Each message is a binary frame: 16kHz mono Int16 PCM.
#
# When CF_VOICE_URL is set (cf-voice sidecar allocated by cf-orch), each chunk
# is base64-encoded and forwarded to cf-voice /classify. The resulting tone
# events are broadcast to SSE subscribers via the session store.
#
# When CF_VOICE_URL is unset (local dev / mock mode), chunks are acknowledged
# but not forwarded — the in-process ContextClassifier.stream() generates
# synthetic frames independently.
from __future__ import annotations

import base64
import logging
import time

from fastapi import APIRouter, WebSocket, WebSocketDisconnect

from app.services import session_store

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Every route registered on this router lives under the "/session" prefix.
router = APIRouter(prefix="/session", tags=["audio"])

# Maps session_id -> time.monotonic() reading taken when that session's audio
# WebSocket was accepted. audio_ws() writes the entry on connect and removes
# it again when the client disconnects.
_SESSION_START: dict[str, float] = {}


@router.websocket("/{session_id}/audio")
async def audio_ws(websocket: WebSocket, session_id: str) -> None:
    """
    WebSocket endpoint for binary PCM audio upload.

    Clients (browser AudioWorkletProcessor) send binary Int16 frames.
    Server acknowledges each frame with {"ok": true, "bytes": N}.

    Each chunk is base64-encoded and handed to
    session_store.forward_audio_chunk() together with its offset, in seconds,
    from the moment this connection was accepted. When CF_VOICE_URL is
    configured, the chunk is forwarded to the cf-voice sidecar and the
    resulting tone events are broadcast to SSE subscribers.

    Args:
        websocket: The WebSocket connection for this upload.
        session_id: Session identifier taken from the URL path.

    If no session exists for ``session_id`` the socket is closed with code
    4004 and nothing is received. The ``_SESSION_START`` entry for the session
    is removed whenever the receive loop ends, whether through a client
    disconnect or through any other exception (which still propagates).
    """
    session = session_store.get_session(session_id)
    if session is None:
        # NOTE(review): close() is issued before accept(). Per the ASGI
        # WebSocket spec the server then rejects the handshake with HTTP 403,
        # so the 4004 code/reason may never reach the client — confirm this
        # is the intended behaviour.
        await websocket.close(code=4004, reason=f"Session {session_id} not found")
        return

    await websocket.accept()
    # Hold the start time in a local so offsets stay relative to THIS
    # connection even if the shared registry entry is replaced or removed;
    # a registry lookup with a 0.0 fallback would silently turn offsets into
    # raw monotonic clock readings.
    started_at = time.monotonic()
    _SESSION_START[session_id] = started_at
    logger.info("Audio WS connected for session %s", session_id)

    try:
        while True:
            data = await websocket.receive_bytes()
            timestamp = time.monotonic() - started_at
            # Acknowledge first so the client is not kept waiting on the
            # (potentially slower) forwarding step below.
            await websocket.send_json({"ok": True, "bytes": len(data)})
            # Forward to cf-voice sidecar (no-op if CF_VOICE_URL is unset)
            audio_b64 = base64.b64encode(data).decode()
            await session_store.forward_audio_chunk(session, audio_b64, timestamp)
    except WebSocketDisconnect:
        logger.info("Audio WS disconnected for session %s", session_id)
    finally:
        # Runs on every exit path (disconnect, forwarding error, cancellation)
        # so the module-level registry cannot accumulate stale entries.
        _SESSION_START.pop(session_id, None)