# cf-voice/scripts/test_classify_e2e.py

"""
End-to-end integration test for the cf-voice /classify endpoint.

Extracts a 2-second window from a local media file, base64-encodes the
raw PCM, and POSTs it to the running cf-voice service at localhost:8009.
Prints each returned AudioEvent for quick inspection.

Requires:
  - cf-voice running at localhost:8009 (CF_VOICE_DIARIZE=1 for speaker labels)
  - ffmpeg on PATH
  - A local audio/video file (edit MEDIA_FILE below)

Run:
  python scripts/test_classify_e2e.py
"""
from __future__ import annotations

import base64
import json
import subprocess
import urllib.request

import numpy as np

# Clip selection and service location; edit MEDIA_FILE to point at a local file.
MEDIA_FILE = "/Library/Series/Hogan's Heroes/Season 3/Hogan's Heroes - S03E19 - Hogan, Go Home.mkv"
START_S = 120  # window start, in seconds into MEDIA_FILE (ffmpeg -ss)
DURATION_S = 2  # window length in seconds (ffmpeg -t)
SAMPLE_RATE = 16_000  # sample rate requested from ffmpeg (-ar), in Hz
CF_VOICE_URL = "http://localhost:8009"


def extract_pcm(media_file: str, start_s: float, duration_s: float, sample_rate: int) -> bytes:
    """Decode a window of *media_file* to raw mono signed 16-bit LE PCM.

    Args:
        media_file: Path of the audio/video file handed to ffmpeg.
        start_s: Offset of the window start, in seconds.
        duration_s: Length of the window, in seconds.
        sample_rate: Output sample rate in Hz.

    Returns:
        The bytes ffmpeg wrote to stdout (headerless s16le samples).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    proc = subprocess.run(
        [
            "ffmpeg",
            # Keep ffmpeg from reading the terminal's stdin for interactive commands.
            "-nostdin",
            # Only errors go to stderr, which is left attached to the terminal so
            # a failure explains itself instead of being swallowed by a capture.
            "-loglevel", "error",
            "-i", media_file,
            "-ss", str(start_s),
            "-t", str(duration_s),
            "-ar", str(sample_rate),
            "-ac", "1",
            "-f", "s16le",
            "-",
        ],
        stdout=subprocess.PIPE,
        check=True,
    )
    return proc.stdout


def build_payload(pcm: bytes, timestamp: float, session_id: str = "test") -> bytes:
    """Return the UTF-8 JSON request body for /classify.

    The PCM bytes are base64-encoded into ``audio_chunk``; ``timestamp`` and
    ``session_id`` are passed through unchanged.
    """
    return json.dumps({
        "audio_chunk": base64.b64encode(pcm).decode(),
        "timestamp": timestamp,
        "session_id": session_id,
    }).encode()


def classify(payload: bytes, base_url: str, timeout: float = 30) -> dict:
    """POST *payload* to ``{base_url}/classify`` and return the decoded JSON reply."""
    req = urllib.request.Request(
        f"{base_url}/classify",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


def format_event(ev: dict) -> str:
    """Render one returned event as a fixed-width line for printing.

    ``event_type`` is required (KeyError if absent). ``speaker_id`` falls back
    to "N/A" both when the key is missing and when its value is None.
    """
    speaker = ev.get("speaker_id")
    if speaker is None:
        # A null speaker_id would otherwise hit the ":14" format spec and raise
        # TypeError, since None does not accept a format spec.
        # NOTE(review): presumably null when diarization is off - confirm.
        speaker = "N/A"
    return (
        f"  {ev['event_type']:10}"
        f" speaker_id={str(speaker):14}"
        f" label={ev.get('label', '')}"
    )


def main() -> None:
    """Extract the configured clip, send it to cf-voice, and print the events."""
    pcm = extract_pcm(MEDIA_FILE, START_S, DURATION_S, SAMPLE_RATE)
    if not pcm:
        # ffmpeg exits 0 with no output when the seek lands past the end of the
        # file; posting an empty chunk would only obscure the real problem.
        raise SystemExit(
            f"ffmpeg produced no audio from {MEDIA_FILE!r} at {START_S}s; "
            "check START_S/DURATION_S against the file's length"
        )

    audio = np.frombuffer(pcm, dtype=np.int16)
    print(f"audio samples: {len(audio)}, duration: {len(audio) / SAMPLE_RATE:.2f}s")

    result = classify(build_payload(pcm, float(START_S)), CF_VOICE_URL)
    for ev in result["events"]:
        print(format_event(ev))


# Guarded so that importing this module (e.g. pytest collecting a test_*.py
# file) does not launch ffmpeg or contact the service.
if __name__ == "__main__":
    main()