cf-voice/cf_voice/privacy.py
pyr0ball 24f04b67db feat: full voice pipeline — AST acoustic, accent, privacy, prosody, dimensional, trajectory, telephony, FastAPI app
New modules shipped (from Linnet integration):
- acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub;
  527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP
  includes hold_music, ringback, DTMF, background_shift, AMD signal chain
- accent.py: facebook/mms-lid-126 language ID → regional accent labels
  (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT
- privacy.py: compound privacy risk scorer — public_env, background_voices,
  nature scene, accent signals; returns 0–3 score without storing any audio
- prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score,
  speech_rate, pitch_range); mock mode returns neutral values
- dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim
  valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL
- trajectory.py: rolling buffer for arousal/valence deltas, trend detection
  (escalating/suppressed/stable), coherence scoring, suppression/reframe flags
- telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend
  + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory
- app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM
  chunks, returns full AudioEventOut including dimensional/prosody/accent fields
- prefs.py: voice preference helpers (elcor_mode, confidence_threshold,
  whisper_model, elcor_prior_frames); cf-core and env-var fallback

Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field
added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN,
make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing.

Closes #2, #3.
2026-04-18 22:36:58 -07:00

115 lines
4.6 KiB
Python

# cf_voice/privacy.py — local acoustic privacy risk scoring
#
# MIT licensed. Never transmitted to cloud. Never logged server-side.
#
# Derives a privacy_risk level (low / moderate / high) from the combined
# acoustic fingerprint: scene + environ labels + speaker type + accent.
#
# Design rationale (#20):
# - "outdoor_urban" + "crowd_chatter" + "traffic" → low: clearly public
# - "indoor_quiet" + "background_voices" → moderate: conversation overheard
# - "outdoor_nature" + "birdsong" + regional accent → moderate-high: location-identifying compound
# - "indoor_quiet" + no background voices → low
#
# Risk gates (Linnet):
# high: warn before sending audio chunk to cloud STT; offer local-only fallback
# moderate: attach privacy_flags to session state, no blocking action
# low: proceed normally
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Literal
PrivacyLevel = Literal["low", "moderate", "high"]
@dataclass
class PrivacyRisk:
    """Privacy risk computed locally for a single audio window.

    Attributes:
        level: aggregate risk level ("low" / "moderate" / "high")
        flags: contributing signal descriptions, in detection order
    """

    # Aggregate level mapped from the scorer's internal accumulator.
    level: PrivacyLevel
    # Human-readable reasons; each detected signal appends one entry.
    flags: list[str] = field(default_factory=list)
# ── Signal sets ───────────────────────────────────────────────────────────────
# Scene labels indicating a clearly public setting; treated as risk-reducing
# by score_privacy_risk (score -= 1).
_PUBLIC_SCENES = {"outdoor_urban", "public_transit"}
# Natural outdoor scenes; compound with _LOCATION_ENVIRON sounds for extra risk.
_NATURE_SCENES = {"outdoor_nature"}
# Controlled quiet-indoor scenes; neutral alone, risk-raising with background voices.
_QUIET_SCENES = {"indoor_quiet"}
# Environ labels whose presence hints at an identifiable physical location.
_LOCATION_ENVIRON = {"birdsong", "wind", "rain", "water"}
# Environ labels typical of anonymous urban background noise (risk-reducing).
_URBAN_ENVIRON = {"traffic", "crowd_chatter", "street_signal", "construction"}
def score_privacy_risk(
    scene: str | None,
    environ_labels: list[str],
    speaker: str | None,
    accent: str | None,
) -> PrivacyRisk:
    """Derive a PrivacyRisk from the current acoustic fingerprint.

    All inputs are nullable — partial signals are handled gracefully. Called
    per audio window; results are never persisted or transmitted.

    Args:
        scene: SCENE_LABEL string or None
        environ_labels: list of ENVIRON_LABEL strings active in this window
        speaker: SPEAKER_LABEL string or None
        accent: ACCENT_LABEL string or None (None when CF_VOICE_ACCENT disabled)

    Returns:
        PrivacyRisk with an aggregate level and the ordered contributing flags.
    """
    contributions: list[str] = []
    risk_points = 0  # internal accumulator; mapped to a level at the end
    active_environ = set(environ_labels)

    quiet_indoor = scene in _QUIET_SCENES
    location_sounds = active_environ & _LOCATION_ENVIRON

    # Clearly public settings reduce identifiability.
    if scene in _PUBLIC_SCENES or active_environ & _URBAN_ENVIRON:
        contributions.append("public_environment")
        risk_points -= 1

    # Other voices near the mic: a conversation may be overheard.
    if speaker == "background_voices":
        contributions.append("background_voices_detected")
        risk_points += 2

    # Quiet indoor with no other voices: flagged but score-neutral.
    if quiet_indoor and speaker not in ("background_voices", "human_multi"):
        contributions.append("controlled_environment")

    if location_sounds:
        # Nature sounds alone suggest a quiet, potentially identifiable location.
        contributions.append(f"location_signal: {', '.join(sorted(location_sounds))}")
        risk_points += 1

        # Nature scene on top of nature sounds compounds the location signal.
        if scene in _NATURE_SCENES:
            contributions.append("compound_location_signal")
            risk_points += 1

        # A regional accent narrows the location to region + environment.
        if accent and accent not in ("en_us", "other"):
            contributions.append(f"accent_plus_location: {accent}")
            risk_points += 1

    # Quiet indoor + background voices: overheard conversation.
    # NOTE(review): combined with background_voices_detected (+2) this totals 3
    # ("high"), while the header rationale lists this combo as "moderate" —
    # confirm the escalation is intended.
    if quiet_indoor and speaker == "background_voices":
        contributions.append("overheard_conversation")
        risk_points += 1

    # Map accumulated points to the public three-tier level.
    if risk_points <= 0:
        tier: PrivacyLevel = "low"
    elif risk_points <= 2:
        tier = "moderate"
    else:
        tier = "high"
    return PrivacyRisk(level=tier, flags=contributions)