feat(corpus): pipeline log ingest from shared dir (closes #67)

Pull-side companion to kiwi#141. Ingests structured JSONL pipeline logs from /Library/Assets/logs/pipeline/ into the log corpus for Turnstone logreading model training. - app/data/log_corpus.py: add ingested_pipeline_files tracking table, _pipeline_ingest_dir() config helper, _ingest_one_file() parser, and POST /api/corpus/pipeline-ingest endpoint - source_host = "pipeline_scrape"; source_id from logger field; extra dict stored as matched_patterns; batch_type = "pipeline_log" - Idempotent by filename: skips files already in ingested_pipeline_files - config/label_tool.yaml.example: add corpus section with pipeline_ingest_dir and push sources comment block - tests/test_log_corpus.py: 8 new tests covering ingest, idempotency, non-JSONL filtering, malformed line resilience, incremental runs
2026-05-17 11:28:33 -07:00 · 2026-05-17 11:28:33 -07:00 · 9bb88b168f
commit 9bb88b168f
parent 13ca082a43
3 changed files with 308 additions and 0 deletions
--- a/app/data/log_corpus.py
+++ b/app/data/log_corpus.py
@ -34,6 +34,8 @@ router = APIRouter()
 _DB_PATH: Path = _ROOT / "data" / "corpus.db"
 _PIPELINE_SOURCE_HOST = "pipeline_scrape"
 _SCHEMA = """
 CREATE TABLE IF NOT EXISTS corpus_sources (
    token           TEXT PRIMARY KEY,
@ -77,6 +79,12 @@ CREATE TABLE IF NOT EXISTS corpus_entries (
 CREATE INDEX IF NOT EXISTS idx_ce_label_state ON corpus_entries(label_state);
 CREATE INDEX IF NOT EXISTS idx_ce_source      ON corpus_entries(source_host);
 CREATE INDEX IF NOT EXISTS idx_ce_severity    ON corpus_entries(severity);
 CREATE TABLE IF NOT EXISTS ingested_pipeline_files (
    filename    TEXT PRIMARY KEY,
    ingested_at TEXT NOT NULL,
    entry_count INTEGER NOT NULL
 );
 """
@ -122,6 +130,19 @@ def _init_db() -> None:
        _seed_sources(conn)
 def _pipeline_ingest_dir() -> Path | None:
    """Return the configured pipeline log ingest directory, or None if unset.

    Reads ``corpus.pipeline_ingest_dir`` from the file returned by
    ``_config_file()``.

    Returns:
        A ``Path`` built from the configured value, or ``None`` when the
        config file does not exist, is not valid YAML, does not hold a
        mapping at the top level or under ``corpus``, or the setting is
        missing, empty, or not a string.
    """
    f = _config_file()
    if not f.exists():
        return None
    try:
        raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
    except yaml.YAMLError:
        return None
    if not isinstance(raw, dict):
        return None
    # A bare ``corpus:`` key (for example with every child commented out)
    # loads as None, so a ``.get("corpus", {})`` default is not enough to
    # guarantee a mapping here.
    corpus = raw.get("corpus")
    if not isinstance(corpus, dict):
        return None
    val = corpus.get("pipeline_ingest_dir")
    if not isinstance(val, str) or not val:
        return None
    return Path(val)
 def _load_corpus_config() -> list[dict]:
    f = _config_file()
    if not f.exists():
@ -350,3 +371,92 @@ def export_labeled() -> StreamingResponse:
        media_type="application/x-ndjson",
        headers={"Content-Disposition": "attachment; filename=log_corpus_labeled.jsonl"},
    )
 # ── POST /api/corpus/pipeline-ingest ─────────────────────────────────────────
 def _ingest_one_file(conn: sqlite3.Connection, path: Path) -> int:
    """Parse a pipeline JSONL file and insert entries. Returns count stored.

    The file is read as UTF-8, one JSON document per line. Blank lines,
    lines that are not valid JSON, and lines whose JSON value is not an
    object are skipped (logged at debug level).

    Writes performed on ``conn`` (no commit is issued here):
      * one ``corpus_batches`` row with batch_type ``"pipeline_log"`` whose
        ``entry_count`` is the number of object records parsed;
      * one ``corpus_entries`` row per record with a non-empty string
        ``msg``; ``ts``, ``level`` and ``logger`` map to ``timestamp_iso``,
        ``severity`` and ``source_id``, and a truthy ``extra`` is stored as
        a one-element JSON list in ``matched_patterns``;
      * one ``ingested_pipeline_files`` row keyed by the file name.

    Args:
        conn: Open SQLite connection to the corpus database.
        path: JSONL file to ingest.

    Returns:
        Number of ``corpus_entries`` inserts issued for this file.
    """
    batch_id = str(uuid.uuid4())
    records: list[dict] = []
    # Split on "\n" only: str.splitlines() also breaks on U+2028/U+2029,
    # which may appear unescaped inside JSON strings and would cut a record
    # in half. strip() below removes any stray "\r".
    for raw_line in path.read_text(encoding="utf-8").split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            logger.debug("Skipping malformed line in %s", path.name)
            continue
        # Valid JSON that is not an object (bare number, string, list, ...)
        # has no fields to read; treat it like a malformed line.
        if not isinstance(record, dict):
            logger.debug("Skipping non-object line in %s", path.name)
            continue
        records.append(record)
    conn.execute(
        "INSERT INTO corpus_batches (id, source_host, batch_type, received_at, entry_count, raw_json) "
        "VALUES (?, ?, ?, ?, ?, ?)",
        (batch_id, _PIPELINE_SOURCE_HOST, "pipeline_log", _now_iso(),
         len(records), json.dumps({"file": path.name})),
    )
    stored = 0
    for entry in records:
        msg = entry.get("msg")
        # Only string messages are usable as entry text; anything else
        # (missing, null, number, nested value) is skipped.
        text = msg.strip() if isinstance(msg, str) else ""
        if not text:
            continue
        conn.execute(
            "INSERT OR IGNORE INTO corpus_entries "
            "(id, batch_id, source_host, timestamp_iso, severity, source_id, text, matched_patterns) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (str(uuid.uuid4()), batch_id, _PIPELINE_SOURCE_HOST,
             entry.get("ts"),
             entry.get("level"),
             entry.get("logger"),
             text,
             json.dumps([entry["extra"]] if entry.get("extra") else [])),
        )
        stored += 1
    conn.execute(
        "INSERT INTO ingested_pipeline_files (filename, ingested_at, entry_count) VALUES (?, ?, ?)",
        (path.name, _now_iso(), stored),
    )
    return stored
@router.post("/pipeline-ingest")
def pipeline_ingest() -> dict:
    """Walk the configured pipeline log directory and ingest new JSONL files.

    Skips files already recorded in ingested_pipeline_files. Safe to call
    repeatedly — idempotent by filename.

    Returns:
        A dict with ``ingested_files`` and ``skipped_files`` counts,
        ``entries_stored`` (total across newly ingested files) and
        ``files``, a list of ``{"file", "entries_stored"}`` dicts, one per
        newly ingested file.

    Raises:
        HTTPException: 404 when no ingest directory is configured.
    """
    ingest_dir = _pipeline_ingest_dir()
    if ingest_dir is None:
        raise HTTPException(404, "pipeline_ingest_dir not configured in label_tool.yaml")
    ingested = 0
    skipped = 0
    total_stored = 0
    files_detail: list[dict] = []
    with _db() as conn:
        already_done: set[str] = {
            row[0]
            for row in conn.execute("SELECT filename FROM ingested_pipeline_files").fetchall()
        }
        # sorted() keeps the processing order deterministic across calls.
        for path in sorted(ingest_dir.glob("*.jsonl")):
            # The glob also matches directories and dangling symlinks named
            # *.jsonl; reading those would abort the whole run, so ignore them.
            if not path.is_file():
                continue
            if path.name in already_done:
                skipped += 1
                continue
            stored = _ingest_one_file(conn, path)
            ingested += 1
            total_stored += stored
            files_detail.append({"file": path.name, "entries_stored": stored})
    logger.info("Pipeline ingest: %d files ingested, %d skipped, %d entries stored",
                ingested, skipped, total_stored)
    return {
        "ingested_files": ingested,
        "skipped_files": skipped,
        "entries_stored": total_stored,
        "files": files_detail,
    }
--- a/config/label_tool.yaml.example
+++ b/config/label_tool.yaml.example
@ -122,6 +122,22 @@ imitate:
      text_fields: [title]
      prompt_template: "Summarize the key rules described in this passage:\n\n{text}"
 # ── Log corpus (Turnstone training data) ──────────────────────────────────────
 corpus:
  # Directory containing pipeline JSONL log files to ingest (pull-side).
  # Files named <script>_<ts>.jsonl; one structured record per line.
  # POST /api/corpus/pipeline-ingest walks this dir and imports new files.
  # NFS-mounted on both Heimdall and Sif at /Library/Assets/
  pipeline_ingest_dir: /Library/Assets/logs/pipeline/
  # Turnstone push sources (consent-gated, token-authenticated).
  # sources:
  #   - token: "your-bearer-token"
  #     source_host: "node.local"
  #     owner: YourName
  #     consent_date: "2026-05-17"
  #     consent_method: signal_chat
 # ── Embedding model comparison harness ────────────────────────────────────────
 embed_bench:
  # ollama_url: http://localhost:11434   # optional; falls back to cforch.ollama_url
--- a/tests/test_log_corpus.py
+++ b/tests/test_log_corpus.py
@ -270,3 +270,185 @@ def test_export_excludes_pii_flagged(client):
    resp = client.get("/api/corpus/export")
    assert resp.text.strip() == ""
 # ── Pipeline ingest endpoint ───────────────────────────────────────────────────
 def _make_pipeline_file(directory: Path, name: str, lines: list[dict]) -> Path:
    """Create ``directory / name`` holding one JSON document per record.

    Records are serialized with ``json.dumps`` and joined by newlines (no
    trailing newline); the file is written as UTF-8. Returns the path that
    was written.
    """
    target = directory / name
    serialized = [json.dumps(record) for record in lines]
    target.write_text("\n".join(serialized), encoding="utf-8")
    return target
 # Sample structured pipeline log record used as the base line by the
 # pipeline-ingest tests; individual tests override "msg" via {**_PIPELINE_LINE, ...}.
 _PIPELINE_LINE = {
    "ts": "2026-05-17T10:00:00Z",
    "level": "INFO",
    "logger": "scripts.pipeline.purple_carrot_scraper",
    "msg": "Fetched recipe page",
    "extra": {"url": "https://example.com/recipe/1", "status": 200},
 }
 def test_pipeline_ingest_returns_404_when_dir_not_configured(client, tmp_path, monkeypatch):
    """No pipeline_ingest_dir in config — endpoint returns 404."""
    # Point the module at an empty config directory so the outcome does not
    # depend on whatever label_tool.yaml the surrounding environment provides.
    config_dir = tmp_path / "config_without_pipeline_dir"
    config_dir.mkdir()
    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)
    resp = client.post("/api/corpus/pipeline-ingest")
    assert resp.status_code == 404
 def test_pipeline_ingest_empty_dir(client, tmp_path, monkeypatch):
    """An existing but empty ingest directory yields an all-zero summary."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()

    # Config that points the endpoint at the empty directory.
    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_text = f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200
    summary = response.json()
    for counter in ("ingested_files", "skipped_files", "entries_stored"):
        assert summary[counter] == 0
 def test_pipeline_ingest_ingests_valid_file(client, tmp_path, monkeypatch):
    """Both records of a well-formed JSONL file end up in the corpus."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    second_record = {**_PIPELINE_LINE, "msg": "Saved 3 recipes", "level": "INFO"}
    _make_pipeline_file(
        ingest_dir, "scraper_20260517.jsonl", [_PIPELINE_LINE, second_record]
    )

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_text = f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200
    summary = response.json()
    assert summary["ingested_files"] == 1
    assert summary["entries_stored"] == 2

    # The stored rows must be visible through the listing endpoint.
    listing = client.get("/api/corpus/entries", params={"limit": 10}).json()
    entries = listing["entries"]
    assert len(entries) == 2
    hosts = [e["source_host"] for e in entries]
    assert hosts == ["pipeline_scrape"] * len(entries)
 def test_pipeline_ingest_source_id_from_logger(client, tmp_path, monkeypatch):
    """The 'logger' field of a log line becomes the entry's source_id."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    _make_pipeline_file(ingest_dir, "run_20260517.jsonl", [_PIPELINE_LINE])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_text = f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    client.post("/api/corpus/pipeline-ingest")

    listing = client.get("/api/corpus/entries", params={"limit": 10}).json()
    first_entry = listing["entries"][0]
    assert first_entry["source_id"] == "scripts.pipeline.purple_carrot_scraper"
 def test_pipeline_ingest_idempotent(client, tmp_path, monkeypatch):
    """A second call skips the file handled by the first and stores nothing new."""
    endpoint = "/api/corpus/pipeline-ingest"
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    _make_pipeline_file(ingest_dir, "scraper_20260517.jsonl", [_PIPELINE_LINE])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_text = f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    client.post(endpoint)  # first run ingests the file
    second_summary = client.post(endpoint).json()
    assert second_summary["ingested_files"] == 0
    assert second_summary["skipped_files"] == 1
    assert second_summary["entries_stored"] == 0

    # Only the entry from the first run may exist.
    listing = client.get("/api/corpus/entries", params={"limit": 10}).json()
    assert len(listing["entries"]) == 1
 def test_pipeline_ingest_skips_non_jsonl(client, tmp_path, monkeypatch):
    """Only the .jsonl file is ingested; other files in the directory are ignored."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    stray_files = (
        ("notes.txt", "this is not a log file"),
        ("run.csv", "a,b,c\n1,2,3"),
    )
    for stray_name, stray_body in stray_files:
        (ingest_dir / stray_name).write_text(stray_body)
    _make_pipeline_file(ingest_dir, "valid_20260517.jsonl", [_PIPELINE_LINE])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_text = f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    summary = client.post("/api/corpus/pipeline-ingest").json()
    assert summary["ingested_files"] == 1
 def test_pipeline_ingest_skips_malformed_lines(client, tmp_path, monkeypatch):
    """A non-JSON line is dropped while the valid lines around it are stored."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    good_first = json.dumps(_PIPELINE_LINE)
    good_second = json.dumps({**_PIPELINE_LINE, "msg": "another valid line"})
    mixed_file = ingest_dir / "mixed_20260517.jsonl"
    mixed_file.write_text(
        good_first + "\n" + "this is not json\n" + good_second,
        encoding="utf-8",
    )

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_text = f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200
    # Two valid lines stored, the malformed one skipped.
    assert response.json()["entries_stored"] == 2
 def test_pipeline_ingest_new_file_after_first_run(client, tmp_path, monkeypatch):
    """A file that appears after the first run is ingested by the next call."""
    endpoint = "/api/corpus/pipeline-ingest"
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    _make_pipeline_file(ingest_dir, "run_a.jsonl", [_PIPELINE_LINE])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_text = f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(cfg_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    client.post(endpoint)  # first run picks up run_a.jsonl

    later_record = {**_PIPELINE_LINE, "msg": "Second run line"}
    _make_pipeline_file(ingest_dir, "run_b.jsonl", [later_record])

    second_summary = client.post(endpoint).json()
    assert second_summary["ingested_files"] == 1
    assert second_summary["skipped_files"] == 1
    assert second_summary["entries_stored"] == 1