"""Tests for app/data/log_corpus.py — corpus receiver and labeling endpoints."""
from __future__ import annotations

import json
import uuid
from pathlib import Path

import pytest
from fastapi.testclient import TestClient

from app.data import log_corpus as lc


# Bearer token for the single test source; a fresh random UUID is generated
# each time this module is imported.
VALID_TOKEN = str(uuid.uuid4())
# Host name written next to VALID_TOKEN in the temporary label_tool.yaml.
VALID_HOST = "testnode.local"


@pytest.fixture(autouse=True)
def isolated_db(tmp_path, monkeypatch):
    """Point the corpus module at a per-test data dir, DB file and config dir.

    The config dir receives a ``label_tool.yaml`` that registers exactly one
    source (``VALID_TOKEN`` / ``VALID_HOST``). The DB is initialised last, once
    all module paths have been redirected.
    """
    db_file = tmp_path / "corpus.db"
    monkeypatch.setattr(lc, "_DATA_DIR", tmp_path)
    monkeypatch.setattr(lc, "_DB_PATH", db_file)

    # Temporary config dir holding a yaml file with one test source.
    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir()
    yaml_text = (
        "corpus:\n  sources:\n"
        f"    - token: \"{VALID_TOKEN}\"\n"
        f"      source_host: \"{VALID_HOST}\"\n"
        "      owner: TestOwner\n"
        "      consent_date: \"2026-05-11\"\n"
        "      consent_method: signal_chat\n"
    )
    (cfg_dir / "label_tool.yaml").write_text(yaml_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    lc._init_db()


@pytest.fixture()
def client():
    """Return a TestClient for a bare FastAPI app mounting the corpus router.

    The router is mounted under the ``/api/corpus`` prefix.
    """
    from fastapi import FastAPI

    corpus_app = FastAPI()
    corpus_app.include_router(lc.router, prefix="/api/corpus")
    return TestClient(corpus_app)


def _batch(batch_type="raw_entries", entries=None, source_host=VALID_HOST):
    """Build a log-batch request payload with a fresh random ``batch_id``.

    Args:
        batch_type: Value for the payload's ``batch_type`` field.
        entries: Entries to send. ``None`` (the default) substitutes a single
            ERROR entry from source ``sonarr``; an explicitly empty list is
            sent as-is so empty batches can be expressed.
        source_host: Value for the payload's ``source_host`` field.

    Returns:
        A dict ready to be passed as the JSON body of a log-batch request.
    """
    # Compare against None rather than relying on truthiness: ``entries or ...``
    # would silently replace an intentionally empty list with the default entry.
    if entries is None:
        entries = [
            {
                "entry_id": str(uuid.uuid4()),
                "source_id": "sonarr",
                "timestamp_iso": "2026-05-11T09:58:00Z",
                "severity": "ERROR",
                "text": "Connection refused to indexer",
                "matched_patterns": [],
            }
        ]
    return {
        "batch_version": 1,
        "batch_id": str(uuid.uuid4()),
        "pushed_at": "2026-05-11T10:00:00Z",
        "source_host": source_host,
        "batch_type": batch_type,
        "watermark_from": 0,
        "watermark_to": 5,
        "entries": entries,
    }


# ── Receive endpoint ───────────────────────────────────────────────────────────

def test_receive_missing_auth(client):
    """Posting a batch without an Authorization header yields 401."""
    response = client.post("/api/corpus/log-batch", json=_batch())
    status = response.status_code
    assert status == 401


def test_receive_invalid_token(client):
    """Posting a batch with an unrecognised bearer token yields 403."""
    bad_auth = {"Authorization": "Bearer bad-token"}
    response = client.post("/api/corpus/log-batch", json=_batch(), headers=bad_auth)
    assert response.status_code == 403


def test_receive_valid_batch(client):
    """A batch sent with the valid token is accepted and its one entry stored."""
    auth = {"Authorization": f"Bearer {VALID_TOKEN}"}
    response = client.post("/api/corpus/log-batch", json=_batch(), headers=auth)
    assert response.status_code == 200

    body = response.json()
    assert body["received"] is True
    assert body["entries_stored"] == 1


def test_receive_stores_source_host_from_token_not_payload(client):
    """The stored source_host is the one tied to the token, not the payload's."""
    auth = {"Authorization": f"Bearer {VALID_TOKEN}"}
    spoofed = _batch(source_host="attacker-injected-host")

    receive = client.post("/api/corpus/log-batch", json=spoofed, headers=auth)
    assert receive.status_code == 200

    listing = client.get("/api/corpus/entries").json()
    stored = listing["entries"][0]
    assert stored["source_host"] == VALID_HOST


def test_receive_skips_empty_text_entries(client):
    """Entries with empty or whitespace-only text are not stored.

    Three entries are sent; only the one with real text should be counted in
    the response's ``entries_stored``.
    """
    payload = _batch(entries=[
        {"entry_id": "e1", "source_id": "svc", "severity": "ERROR", "text": ""},
        {"entry_id": "e2", "source_id": "svc", "severity": "ERROR", "text": "   "},
        {"entry_id": "e3", "source_id": "svc", "severity": "ERROR", "text": "real error"},
    ])
    resp = client.post(
        "/api/corpus/log-batch",
        json=payload,
        headers={"Authorization": f"Bearer {VALID_TOKEN}"},
    )
    # Check the status first so a rejected batch fails with a clear status
    # mismatch instead of a KeyError on the response body.
    assert resp.status_code == 200
    assert resp.json()["entries_stored"] == 1


def test_receive_incident_bundle(client):
    """A batch of type 'incident_bundles' with one incident is accepted and stored."""
    incident = {
        "id": "inc-1", "label": "plex crash", "issue_type": "plex",
        "started_at": "2026-05-11T09:00:00", "ended_at": "2026-05-11T09:30:00",
        "notes": "audio dropped", "created_at": "2026-05-11T09:35:00",
        "severity": "high", "text": "plex crash",
    }
    bundle = _batch(batch_type="incident_bundles", entries=[incident])
    auth = {"Authorization": f"Bearer {VALID_TOKEN}"}

    response = client.post("/api/corpus/log-batch", json=bundle, headers=auth)
    assert response.status_code == 200
    assert response.json()["entries_stored"] == 1


# ── Labeling endpoints ─────────────────────────────────────────────────────────

def test_label_entry(client):
    """Labeling a received entry returns 200 and it then lists under state 'labeled'."""
    auth = {"Authorization": f"Bearer {VALID_TOKEN}"}
    client.post("/api/corpus/log-batch", json=_batch(), headers=auth)
    listing = client.get("/api/corpus/entries").json()
    entry_id = listing["entries"][0]["id"]

    label_body = {
        "failure_type": "software",
        "plain_explanation": "Sonarr lost connection to its indexer — restart the service.",
        "known_pattern": "y",
    }
    label_resp = client.post(f"/api/corpus/entries/{entry_id}/label", json=label_body)
    assert label_resp.status_code == 200
    assert label_resp.json()["labeled"] is True

    labeled = client.get("/api/corpus/entries", params={"state": "labeled"}).json()["entries"]
    assert len(labeled) == 1
    assert labeled[0]["failure_type"] == "software"


def test_label_entry_invalid_failure_type(client):
    """Labeling with failure_type 'aliens' is rejected with 422."""
    auth = {"Authorization": f"Bearer {VALID_TOKEN}"}
    client.post("/api/corpus/log-batch", json=_batch(), headers=auth)
    listing = client.get("/api/corpus/entries").json()
    entry_id = listing["entries"][0]["id"]

    bad_label = {"failure_type": "aliens"}
    response = client.post(f"/api/corpus/entries/{entry_id}/label", json=bad_label)
    assert response.status_code == 422


def test_label_entry_missing_failure_type(client):
    """Labeling with an empty JSON body (no failure_type) is rejected with 422."""
    auth = {"Authorization": f"Bearer {VALID_TOKEN}"}
    client.post("/api/corpus/log-batch", json=_batch(), headers=auth)
    listing = client.get("/api/corpus/entries").json()
    entry_id = listing["entries"][0]["id"]

    response = client.post(f"/api/corpus/entries/{entry_id}/label", json={})
    assert response.status_code == 422


def test_label_entry_not_found(client):
    """Labeling the id 'nonexistent' on an empty corpus returns 404."""
    label_body = {"failure_type": "software"}
    response = client.post("/api/corpus/entries/nonexistent/label", json=label_body)
    assert response.status_code == 404


def test_skip_entry(client):
    """Skipping an entry returns 200 and removes it from the default entries listing."""
    auth = {"Authorization": f"Bearer {VALID_TOKEN}"}
    client.post("/api/corpus/log-batch", json=_batch(), headers=auth)
    listing = client.get("/api/corpus/entries").json()
    entry_id = listing["entries"][0]["id"]

    skip_resp = client.post(f"/api/corpus/entries/{entry_id}/skip")
    assert skip_resp.status_code == 200

    remaining = client.get("/api/corpus/entries").json()["entries"]
    assert len(remaining) == 0


# ── Stats ──────────────────────────────────────────────────────────────────────

def test_stats_empty(client):
    """With nothing received, stats report zero entries and zero batches."""
    response = client.get("/api/corpus/stats")
    summary = response.json()
    assert summary["total_entries"] == 0
    assert summary["batch_count"] == 0


def test_stats_after_receive(client):
    """After one single-entry batch, stats show one entry, one batch, one unlabeled."""
    auth = {"Authorization": f"Bearer {VALID_TOKEN}"}
    client.post("/api/corpus/log-batch", json=_batch(), headers=auth)

    summary = client.get("/api/corpus/stats").json()
    assert summary["total_entries"] == 1
    assert summary["batch_count"] == 1
    assert summary["by_label_state"].get("unlabeled", 0) == 1


# ── Export ─────────────────────────────────────────────────────────────────────

def test_export_excludes_unlabeled(client):
    """An entry that was received but never labeled does not appear in the export."""
    receive = client.post(
        "/api/corpus/log-batch",
        json=_batch(),
        headers={"Authorization": f"Bearer {VALID_TOKEN}"},
    )
    # Guard against a vacuous pass: the export would also be empty if the
    # batch had been rejected and nothing was stored at all.
    assert receive.status_code == 200
    assert receive.json()["entries_stored"] == 1

    resp = client.get("/api/corpus/export")
    assert resp.status_code == 200
    assert resp.text.strip() == ""


def test_export_includes_labeled(client):
    """A labeled entry is exported as one JSON line with its explanation and failure type."""
    auth = {"Authorization": f"Bearer {VALID_TOKEN}"}
    client.post("/api/corpus/log-batch", json=_batch(), headers=auth)
    listing = client.get("/api/corpus/entries").json()
    entry_id = listing["entries"][0]["id"]

    label_body = {
        "failure_type": "software",
        "plain_explanation": "Sonarr lost connection to indexer.",
    }
    client.post(f"/api/corpus/entries/{entry_id}/label", json=label_body)

    export = client.get("/api/corpus/export")
    assert export.status_code == 200

    # Keep only non-empty lines of the export body.
    rows = [row for row in export.text.strip().splitlines() if row]
    assert len(rows) == 1

    record = json.loads(rows[0])
    assert record["output"] == "Sonarr lost connection to indexer."
    assert record["metadata"]["failure_type"] == "software"


def test_export_excludes_pii_flagged(client):
    """An entry labeled with ``pii_flagged`` set is left out of the export."""
    client.post(
        "/api/corpus/log-batch",
        json=_batch(),
        headers={"Authorization": f"Bearer {VALID_TOKEN}"},
    )
    entry_id = client.get("/api/corpus/entries").json()["entries"][0]["id"]
    label_resp = client.post(f"/api/corpus/entries/{entry_id}/label", json={
        "failure_type": "software",
        "plain_explanation": "Contains username — should not export.",
        "pii_flagged": True,
    })
    # Guard against a vacuous pass: if the label request were rejected the
    # entry would stay unlabeled and be missing from the export for that
    # reason, not because of the PII flag.
    assert label_resp.status_code == 200

    resp = client.get("/api/corpus/export")
    assert resp.status_code == 200
    assert resp.text.strip() == ""


# ── Pipeline ingest endpoint ───────────────────────────────────────────────────

def _make_pipeline_file(directory: Path, name: str, lines: list[dict]) -> Path:
    """Serialise *lines* as JSONL into ``directory / name`` and return that path.

    Records are separated by a single newline; no trailing newline is written.
    """
    target = directory / name
    serialised = [json.dumps(record) for record in lines]
    target.write_text("\n".join(serialised), encoding="utf-8")
    return target


# Template pipeline log record shared by the pipeline-ingest tests. Tests
# derive variants with ``{**_PIPELINE_LINE, ...}`` rather than mutating it.
_PIPELINE_LINE = {
    "ts": "2026-05-17T10:00:00Z",
    "level": "INFO",
    "logger": "scripts.pipeline.purple_carrot_scraper",
    "msg": "Fetched recipe page",
    "extra": {"url": "https://example.com/recipe/1", "status": 200},
}


def test_pipeline_ingest_returns_404_when_dir_not_configured(client, tmp_path):
    """With no pipeline_ingest_dir in the config, the endpoint answers 404."""
    response = client.post("/api/corpus/pipeline-ingest")
    status = response.status_code
    assert status == 404


def test_pipeline_ingest_empty_dir(client, tmp_path, monkeypatch):
    """An existing but empty ingest dir gives a 200 with all counters at zero."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()

    # Rewrite the config so it names the ingest dir.
    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir(exist_ok=True)
    yaml_path = cfg_dir / "label_tool.yaml"
    yaml_path.write_text(
        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200

    counters = response.json()
    assert counters["ingested_files"] == 0
    assert counters["skipped_files"] == 0
    assert counters["entries_stored"] == 0


def test_pipeline_ingest_ingests_valid_file(client, tmp_path, monkeypatch):
    """A valid two-line JSONL file is ingested and both entries show up in the corpus."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    records = [
        _PIPELINE_LINE,
        {**_PIPELINE_LINE, "msg": "Saved 3 recipes", "level": "INFO"},
    ]
    _make_pipeline_file(ingest_dir, "scraper_20260517.jsonl", records)

    # Rewrite the config so it names the ingest dir.
    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir(exist_ok=True)
    yaml_path = cfg_dir / "label_tool.yaml"
    yaml_path.write_text(
        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200
    counters = response.json()
    assert counters["ingested_files"] == 1
    assert counters["entries_stored"] == 2

    listing = client.get("/api/corpus/entries", params={"limit": 10}).json()
    stored = listing["entries"]
    assert len(stored) == 2
    assert all(item["source_host"] == "pipeline_scrape" for item in stored)


def test_pipeline_ingest_source_id_from_logger(client, tmp_path, monkeypatch):
    """source_id of an ingested entry equals the 'logger' field of its log line."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    _make_pipeline_file(ingest_dir, "run_20260517.jsonl", [_PIPELINE_LINE])

    config_dir = tmp_path / "config"
    config_dir.mkdir(exist_ok=True)
    (config_dir / "label_tool.yaml").write_text(
        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)

    resp = client.post("/api/corpus/pipeline-ingest")
    # Fail with a readable assertion if ingest itself broke, instead of an
    # IndexError on the empty entries list below.
    assert resp.status_code == 200

    entries = client.get("/api/corpus/entries", params={"limit": 10}).json()["entries"]
    assert len(entries) == 1
    assert entries[0]["source_id"] == "scripts.pipeline.purple_carrot_scraper"


def test_pipeline_ingest_idempotent(client, tmp_path, monkeypatch):
    """A second ingest call skips the already-processed file and stores nothing new."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    _make_pipeline_file(ingest_dir, "scraper_20260517.jsonl", [_PIPELINE_LINE])

    # Rewrite the config so it names the ingest dir.
    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir(exist_ok=True)
    yaml_path = cfg_dir / "label_tool.yaml"
    yaml_path.write_text(
        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    client.post("/api/corpus/pipeline-ingest")
    second = client.post("/api/corpus/pipeline-ingest")

    counters = second.json()
    assert counters["ingested_files"] == 0
    assert counters["skipped_files"] == 1
    assert counters["entries_stored"] == 0

    # The corpus still holds only the entry from the first ingest.
    listing = client.get("/api/corpus/entries", params={"limit": 10}).json()
    stored = listing["entries"]
    assert len(stored) == 1


def test_pipeline_ingest_skips_non_jsonl(client, tmp_path, monkeypatch):
    """Only the .jsonl file in the dir is ingested; .txt and .csv files are not counted."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    (ingest_dir / "notes.txt").write_text("this is not a log file")
    (ingest_dir / "run.csv").write_text("a,b,c\n1,2,3")
    _make_pipeline_file(ingest_dir, "valid_20260517.jsonl", [_PIPELINE_LINE])

    config_dir = tmp_path / "config"
    config_dir.mkdir(exist_ok=True)
    (config_dir / "label_tool.yaml").write_text(
        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)

    resp = client.post("/api/corpus/pipeline-ingest")
    # Check the status first so an error response fails with a clear status
    # mismatch instead of a KeyError on the response body.
    assert resp.status_code == 200
    assert resp.json()["ingested_files"] == 1


def test_pipeline_ingest_skips_malformed_lines(client, tmp_path, monkeypatch):
    """A non-JSON line is skipped while the valid lines around it are still stored."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()

    # Two valid JSON records with one garbage line between them.
    first_valid = json.dumps(_PIPELINE_LINE)
    last_valid = json.dumps({**_PIPELINE_LINE, "msg": "another valid line"})
    mixed_file = ingest_dir / "mixed_20260517.jsonl"
    mixed_file.write_text(
        first_valid + "\n" "this is not json\n" + last_valid,
        encoding="utf-8",
    )

    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir(exist_ok=True)
    yaml_path = cfg_dir / "label_tool.yaml"
    yaml_path.write_text(
        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200
    # 2 valid lines stored, 1 malformed line skipped.
    assert response.json()["entries_stored"] == 2


def test_pipeline_ingest_new_file_after_first_run(client, tmp_path, monkeypatch):
    """A file added after the first ingest is ingested next time; the old one is skipped."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    _make_pipeline_file(ingest_dir, "run_a.jsonl", [_PIPELINE_LINE])

    # Rewrite the config so it names the ingest dir.
    cfg_dir = tmp_path / "config"
    cfg_dir.mkdir(exist_ok=True)
    yaml_path = cfg_dir / "label_tool.yaml"
    yaml_path.write_text(
        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_dir)

    # First call ingests run_a.jsonl.
    client.post("/api/corpus/pipeline-ingest")

    later_records = [{**_PIPELINE_LINE, "msg": "Second run line"}]
    _make_pipeline_file(ingest_dir, "run_b.jsonl", later_records)

    second = client.post("/api/corpus/pipeline-ingest")
    counters = second.json()
    assert counters["ingested_files"] == 1
    assert counters["skipped_files"] == 1
    assert counters["entries_stored"] == 1