linnet/app/api/audio.py
pyr0ball 7e14f9135e feat: Notation v0.1.x scaffold — full backend + frontend + tests
FastAPI backend (port 8522):
- Session lifecycle: POST /session/start, DELETE /session/{id}/end, GET /session/{id}
- SSE stream: GET /session/{id}/stream — per-subscriber asyncio.Queue fan-out, 15s heartbeat
- History: GET /session/{id}/history with min_confidence + limit filters
- Audio: WS /session/{id}/audio — binary PCM ingestion stub (real inference in v0.2.x)
- Export: GET /session/{id}/export — downloadable JSON session log
- ContextClassifier background task per session (CF_VOICE_MOCK=1 in dev)
- ToneEvent SSE wire format per cf-core#40 (locked field names)
- Tier gate: CFG-LNNT- prefix check, 402 for paid features
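The per-subscriber queue fan-out behind the SSE stream can be sketched in a few lines. This is an illustrative stand-alone version, not the actual Notation code: the class and method names (`Broadcaster`, `subscribe`, `publish`) are hypothetical.

```python
# Hypothetical sketch of per-subscriber asyncio.Queue fan-out for SSE.
# Each subscriber gets its own queue, so one slow client never blocks
# delivery to the others.
import asyncio


class Broadcaster:
    """Fan events out to any number of SSE subscribers."""

    def __init__(self) -> None:
        self._subscribers: set[asyncio.Queue] = set()

    def subscribe(self) -> asyncio.Queue:
        q: asyncio.Queue = asyncio.Queue()
        self._subscribers.add(q)
        return q

    def unsubscribe(self, q: asyncio.Queue) -> None:
        self._subscribers.discard(q)

    def publish(self, event: dict) -> None:
        # put_nowait: publishing never awaits on a subscriber.
        for q in self._subscribers:
            q.put_nowait(event)
```

The 15 s heartbeat fits naturally on the consuming side: wrap `q.get()` in `asyncio.wait_for(..., timeout=15)` and emit an SSE comment line on `TimeoutError` to keep the connection alive.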

Vue 3 frontend (port 8521, Vite + UnoCSS + Pinia):
- NowPanel: affect-aware border tint, subtext, prosody flags, shift indicator
- HistoryStrip: horizontal scroll, last 8 events with affect color
- ComposeBar: start/stop session, SSE connection lifecycle
- useToneStream: EventSource composable
- useAudioCapture: AudioWorklet → Int16 PCM → WebSocket (v0.1.x stub)
- audio-processor.js: 100ms chunk accumulator in AudioWorklet thread
- Respects prefers-reduced-motion globally
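The worklet's chunk math is worth making concrete: at 16 kHz mono, a 100 ms window is 1600 samples, or 3200 bytes of Int16. The sketch below is an illustrative Python equivalent of that conversion; the real implementation lives in audio-processor.js on the worklet thread, and the function name here is hypothetical.

```python
# Illustrative Python equivalent of the AudioWorklet chunking math
# (the shipped code is JavaScript in audio-processor.js).
import struct

SAMPLE_RATE = 16_000  # Hz, mono
CHUNK_MS = 100        # accumulator window
CHUNK_SAMPLES = SAMPLE_RATE * CHUNK_MS // 1000  # 1600 samples per chunk


def float32_to_int16_pcm(samples: list[float]) -> bytes:
    """Clamp Float32 samples to [-1, 1] and pack as little-endian Int16."""
    ints = [int(max(-1.0, min(1.0, s)) * 32767) for s in samples]
    return struct.pack(f"<{len(ints)}h", *ints)


# One 100 ms frame: 1600 samples * 2 bytes = 3200 bytes on the wire.
frame = float32_to_int16_pcm([0.0] * CHUNK_SAMPLES)
```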

26 tests passing, manage.sh, Dockerfile, compose.yml included.
2026-04-06 18:23:52 -07:00


# app/api/audio.py — WebSocket audio ingestion endpoint
#
# Receives raw PCM Int16 audio chunks from the browser's AudioWorkletProcessor.
# Each message is a binary frame: 16kHz mono Int16 PCM.
# The backend accumulates chunks until cf-voice processes them.
#
# Notation v0.1.x: audio is accepted and acknowledged but inference runs
# through the background ContextClassifier (started at session creation),
# not inline here. This endpoint is wired for the real audio path
# (Navigation v0.2.x) where chunks feed the STT + diarizer directly.
from __future__ import annotations

import logging

from fastapi import APIRouter, WebSocket, WebSocketDisconnect

from app.services import session_store

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/session", tags=["audio"])


@router.websocket("/{session_id}/audio")
async def audio_ws(websocket: WebSocket, session_id: str) -> None:
    """
    WebSocket endpoint for binary PCM audio upload.

    Clients (browser AudioWorkletProcessor) send binary frames.
    Server acknowledges each frame with {"ok": true}.

    In mock mode (CF_VOICE_MOCK=1) the session's ContextClassifier generates
    synthetic frames independently -- audio sent here is accepted but not
    processed. Real inference wiring happens in Navigation v0.2.x.
    """
    session = session_store.get_session(session_id)
    if session is None:
        # Accept before closing so the custom close code (4004) actually
        # reaches the client; closing an unaccepted WebSocket rejects the
        # handshake instead, and the code/reason are never delivered.
        await websocket.accept()
        await websocket.close(code=4004, reason=f"Session {session_id} not found")
        return

    await websocket.accept()
    logger.info("Audio WS connected for session %s", session_id)

    try:
        while True:
            data = await websocket.receive_bytes()
            # Notation v0.1.x: acknowledge receipt; real inference in v0.2.x
            await websocket.send_json({"ok": True, "bytes": len(data)})
    except WebSocketDisconnect:
        logger.info("Audio WS disconnected for session %s", session_id)