# turnstone/app/tasks/anomaly_scorer.py

"""Background anomaly scoring task.

Runs score_unscored() after each glean cycle (triggered by glean_scheduler)
or on its own interval when TURNSTONE_ANOMALY_INTERVAL is set.

Set TURNSTONE_ANOMALY_MODEL to a HuggingFace model ID to activate.
When the env var is empty (default) the scorer is a no-op.
"""
from __future__ import annotations

import asyncio
import logging
import os
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from pathlib import Path

from app.services.anomaly import ScoringResult, score_unscored

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Interval read from TURNSTONE_ANOMALY_INTERVAL once, at import time; falls
# back to 0 when the variable is unset. int() raises ValueError here if the
# variable is set to a non-integer string (including the empty string).
# NOTE(review): not referenced in the visible part of this file — presumably
# consumed by whatever starts scorer_loop(); confirm the unit is seconds.
_DEFAULT_INTERVAL = int(os.environ.get("TURNSTONE_ANOMALY_INTERVAL", "0"))

# Held by run_once() for the whole scoring pass; run_once() checks it first
# so an overlapping call is skipped instead of waiting for the lock.
_lock = asyncio.Lock()


@dataclass
class ScorerState:
    """Mutable record of the scorer's latest run and its running totals.

    A single module-level instance is updated in place by run_once() and
    scorer_loop() and handed out by get_state().
    """

    # ISO-8601 UTC timestamp of when the most recent run started; None until
    # a run has finished.
    last_run_at: str | None = None
    # Duration of the most recent run in seconds, rounded to two decimals.
    last_duration_s: float | None = None
    # `scored` count taken from the ScoringResult of a run.
    last_scored: int = 0
    # `detections` count taken from the ScoringResult of a run.
    last_detections: int = 0
    # Error reported by the most recent run (the result's `error`, or the
    # string form of a raised exception).
    last_error: str | None = None
    # Number of runs that acquired the lock and finished, whether or not they
    # succeeded; calls skipped because a run was already active are not counted.
    run_count: int = 0
    # ISO-8601 UTC timestamp of the next pass planned by scorer_loop(); set to
    # None when the loop is cancelled during its sleep.
    next_run_at: str | None = None
    # True while run_once() holds the lock and is scoring.
    running: bool = False
    # Sum of `scored` over all runs that returned a result.
    total_scored: int = 0
    # Sum of `detections` over all runs that returned a result.
    total_detections: int = 0


# Single shared state object, mutated in place by run_once() and scorer_loop().
_state = ScorerState()


def get_state() -> ScorerState:
    """Return the module-level ScorerState instance.

    The live object is returned, not a copy, so updates made by later
    scorer runs are visible through the same reference.
    """
    return _state


def _record_run(
    started: datetime,
    duration: float,
    *,
    scored: int,
    detections: int,
    error: str | None,
) -> None:
    """Fold the outcome of one finished run into the module-level state."""
    _state.last_run_at = started.isoformat()
    _state.last_duration_s = round(duration, 2)
    _state.last_scored = scored
    _state.last_detections = detections
    _state.last_error = error
    _state.run_count += 1
    _state.total_scored += scored
    _state.total_detections += detections


async def run_once(
    db_path: Path,
    model_id: str = "",
    device: str = "cpu",
    batch_size: int = 256,
    threshold: float = 0.75,
) -> ScoringResult:
    """Run score_unscored() once in the default executor and record the outcome.

    Args:
        db_path: Forwarded to score_unscored().
        model_id: Forwarded to score_unscored().
        device: Forwarded to score_unscored().
        batch_size: Forwarded to score_unscored().
        threshold: Forwarded to score_unscored().

    Returns:
        The ScoringResult produced by score_unscored(). If another run holds
        the lock, a result with ``skipped=True`` is returned immediately and
        no state is touched. If scoring raises, the exception is logged with
        its traceback and a result carrying only ``error`` is returned.

    Exceptions derived from Exception never propagate to the caller; task
    cancellation does, after the ``running`` flag has been cleared.
    """
    # No await separates this check from acquiring the lock below, so two
    # coroutines on the same loop cannot both pass it.
    if _lock.locked():
        return ScoringResult(skipped=True, error="scorer already running")

    async with _lock:
        _state.running = True
        loop = asyncio.get_running_loop()
        started = datetime.now(tz=timezone.utc)
        # Durations come from the loop's monotonic clock so a wall-clock
        # adjustment mid-run cannot produce a skewed or negative value.
        started_tick = loop.time()
        try:
            result: ScoringResult = await loop.run_in_executor(
                None,
                score_unscored,
                db_path,
                model_id,
                device,
                batch_size,
                threshold,
            )
            duration = loop.time() - started_tick
            _record_run(
                started,
                duration,
                scored=result.scored,
                detections=result.detections,
                error=result.error,
            )
            if not result.skipped:
                logger.info(
                    "Anomaly scorer: %d scored, %d detections in %.1fs",
                    result.scored, result.detections, duration,
                )
            return result
        except Exception as exc:
            # Zero the per-run counters so the state never pairs this run's
            # timestamp and error with the previous run's counts.
            _record_run(
                started,
                loop.time() - started_tick,
                scored=0,
                detections=0,
                error=str(exc),
            )
            # logger.exception keeps the traceback, which a bare message loses.
            logger.exception("Anomaly scorer failed: %s", exc)
            return ScoringResult(error=str(exc))
        finally:
            _state.running = False


async def scorer_loop(
    db_path: Path,
    model_id: str,
    device: str,
    interval_s: int,
    batch_size: int = 256,
    threshold: float = 0.75,
) -> None:
    """Call run_once() repeatedly, sleeping interval_s seconds between passes.

    Args:
        db_path: Forwarded to run_once().
        model_id: Forwarded to run_once(); also included in the start-up log.
        device: Forwarded to run_once().
        interval_s: Seconds to sleep after each pass. The sleep starts when a
            pass finishes, so the period is pass duration plus interval_s.
        batch_size: Forwarded to run_once().
        threshold: Forwarded to run_once().

    Raises:
        asyncio.CancelledError: Re-raised when the task is cancelled, whether
            the cancellation lands during a scoring pass or during the sleep.

    The loop never returns normally. The state's ``next_run_at`` is cleared
    whenever the loop exits.
    """
    logger.info("Anomaly scorer loop started — interval %ds, model: %s", interval_s, model_id)
    try:
        while True:
            await run_once(db_path, model_id, device, batch_size, threshold)
            next_run = datetime.now(tz=timezone.utc) + timedelta(seconds=interval_s)
            _state.next_run_at = next_run.isoformat()
            await asyncio.sleep(interval_s)
    except asyncio.CancelledError:
        # The handler wraps the whole loop so a cancel that arrives while
        # run_once() is awaited is logged just like one during the sleep.
        logger.info("Anomaly scorer loop cancelled")
        raise
    finally:
        # Once the loop is gone no further pass is scheduled; do not leave a
        # stale timestamp behind.
        _state.next_run_at = None