New modules shipped (from Linnet integration): - acoustic.py: AST (MIT/ast-finetuned-audioset-10-10-0.4593) replaces YAMNet stub; 527 AudioSet classes mapped to queue/speaker/environ/scene labels; _LABEL_MAP includes hold_music, ringback, DTMF, background_shift, AMD signal chain - accent.py: facebook/mms-lid-126 language ID → regional accent labels (en_gb, en_us, en_au, fr, es, de, zh, …); lazy-loaded, gated by CF_VOICE_ACCENT - privacy.py: compound privacy risk scorer — public_env, background_voices, nature scene, accent signals; returns 0–3 score without storing any audio - prosody.py: openSMILE-backed prosody extractor (sarcasm_risk, flat_f0_score, speech_rate, pitch_range); mock mode returns neutral values - dimensional.py: audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim valence/arousal/dominance scorer; gated by CF_VOICE_DIMENSIONAL - trajectory.py: rolling buffer for arousal/valence deltas, trend detection (escalating/suppressed/stable), coherence scoring, suppression/reframe flags - telephony.py: TelephonyBackend Protocol + MockTelephonyBackend + SignalWireBackend + FreeSWITCHBackend; CallSession dataclass; make_telephony() factory - app.py: FastAPI service (port 8007) — /health + /classify; accepts base64 PCM chunks, returns full AudioEventOut including dimensional/prosody/accent fields - prefs.py: voice preference helpers (elcor_mode, confidence_threshold, whisper_model, elcor_prior_frames); cf-core and env-var fallback Tests: fix stale tests (YAMNetAcousticBackend → ASTAcousticBackend, scene field added to AcousticResult, speaker_at gap now resolves dominant speaker not UNKNOWN, make_io real path returns MicVoiceIO when sounddevice installed). 78 tests passing. Closes #2, #3.
115 lines · 4.6 KiB · Python
# cf_voice/privacy.py — local acoustic privacy risk scoring
#
# MIT licensed. Never transmitted to cloud. Never logged server-side.
#
# Derives a privacy_risk level (low / moderate / high) from the combined
# acoustic fingerprint: scene + environ labels + speaker type + accent.
#
# Design rationale (#20):
#   - "outdoor_urban" + "crowd_chatter" + "traffic" → low: clearly public
#   - "indoor_quiet" + "background_voices" → moderate: conversation overheard
#   - "outdoor_nature" + "birdsong" + regional accent → moderate-high:
#     location-identifying compound signal
#   - "indoor_quiet" + no background voices → low
#
# Risk gates (Linnet):
#   high:     warn before sending audio chunk to cloud STT; offer local-only fallback
#   moderate: attach privacy_flags to session state, no blocking action
#   low:      proceed normally
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Literal

# Aggregate privacy risk level for one audio window.
# Ordering (by increasing risk): "low" < "moderate" < "high".
PrivacyLevel = Literal["low", "moderate", "high"]
@dataclass
class PrivacyRisk:
    """
    Locally-computed privacy risk for a single audio window.

    level: aggregate risk level ("low" / "moderate" / "high")
    flags: ordered list of contributing signal descriptions; order follows
        the evaluation order in score_privacy_risk, so it is deterministic
        for a given input.
    """

    level: PrivacyLevel
    flags: list[str] = field(default_factory=list)


# ── Signal sets ───────────────────────────────────────────────────────────────

# Scenes where being overheard is expected; treated as risk-reducing below.
_PUBLIC_SCENES = {"outdoor_urban", "public_transit"}
# Scenes whose ambience can narrow down a physical location.
_NATURE_SCENES = {"outdoor_nature"}
# Controlled indoor scenes with little identifying background signal.
_QUIET_SCENES = {"indoor_quiet"}

# Environ labels that hint at a specific (typically outdoor) location.
_LOCATION_ENVIRON = {"birdsong", "wind", "rain", "water"}
# Environ labels typical of anonymous urban/public settings.
_URBAN_ENVIRON = {"traffic", "crowd_chatter", "street_signal", "construction"}


def score_privacy_risk(
    scene: str | None,
    environ_labels: list[str],
    speaker: str | None,
    accent: str | None,
) -> PrivacyRisk:
    """
    Derive a PrivacyRisk from the current acoustic fingerprint.

    All inputs are nullable — this function handles partial signals gracefully
    (an empty environ_labels list and all-None signals score as "low").
    Called per audio window; results are never persisted or transmitted.

    Scoring: an integer accumulator starts at 0; public environments subtract,
    identifying/overheard signals add; the total maps to a level at the end
    (<= 0 → low, 1–2 → moderate, >= 3 → high).

    Args:
        scene: SCENE_LABEL string or None
        environ_labels: list of ENVIRON_LABEL strings active in this window
        speaker: SPEAKER_LABEL string or None
        accent: ACCENT_LABEL string or None (None when CF_VOICE_ACCENT disabled)

    Returns:
        PrivacyRisk with the aggregate level and the ordered contributing flags.
    """
    flags: list[str] = []
    score = 0  # internal accumulator; maps to level at the end

    environ_set = set(environ_labels)

    # ── Clearly public environments → reduce risk ─────────────────────────────
    # Ambient crowd/traffic noise masks speech and the setting is anonymous.
    if scene in _PUBLIC_SCENES or environ_set & _URBAN_ENVIRON:
        flags.append("public_environment")
        score -= 1

    # ── Background voices: conversation may be overheard ─────────────────────
    if speaker == "background_voices":
        flags.append("background_voices_detected")
        score += 2

    # ── Quiet indoor: no background noise reduces identifiability ────────────
    if scene in _QUIET_SCENES and speaker not in ("background_voices", "human_multi"):
        flags.append("controlled_environment")
        # No score change — neutral; flag is informational only.

    # ── Nature sounds: alone they suggest a quiet, potentially identifiable location
    nature_match = environ_set & _LOCATION_ENVIRON
    if nature_match:
        # sorted() keeps the flag string deterministic regardless of input order.
        flags.append(f"location_signal: {', '.join(sorted(nature_match))}")
        score += 1

    # ── Nature scene + nature sounds: compound location-identifying signal ────
    if scene in _NATURE_SCENES and nature_match:
        flags.append("compound_location_signal")
        score += 1

    # ── Regional accent + nature: narrows location to region + environment ────
    # "en_us" and "other" are treated as non-identifying accent labels.
    if accent and accent not in ("en_us", "other") and nature_match:
        flags.append(f"accent_plus_location: {accent}")
        score += 1

    # ── Quiet indoor + background voices: overheard conversation ─────────────
    # Stacks with background_voices_detected above (2 + 1 = 3 → "high").
    if scene in _QUIET_SCENES and speaker == "background_voices":
        flags.append("overheard_conversation")
        score += 1

    # ── Map score to level ────────────────────────────────────────────────────
    if score <= 0:
        level: PrivacyLevel = "low"
    elif score <= 2:
        level = "moderate"
    else:
        level = "high"

    return PrivacyRisk(level=level, flags=flags)