feat(corpus): pipeline log ingest from shared dir (closes #67)

Pull-side companion to kiwi#141. Ingests structured JSONL pipeline logs from /Library/Assets/logs/pipeline/ into the log corpus for Turnstone log-reading model training.

- app/data/log_corpus.py: add ingested_pipeline_files tracking table, _pipeline_ingest_dir() config helper, _ingest_one_file() parser, and POST /api/corpus/pipeline-ingest endpoint
- source_host = "pipeline_scrape"; source_id is taken from the logger field; the extra dict is stored in matched_patterns (wrapped in a one-element JSON list); batch_type = "pipeline_log"
- Idempotent by filename: skips files already recorded in ingested_pipeline_files
- config/label_tool.yaml.example: add corpus section with pipeline_ingest_dir and a commented-out push sources block
- tests/test_log_corpus.py: 8 new tests covering ingest, idempotency, non-JSONL filtering, malformed-line resilience, and incremental runs
2026-05-17 11:28:33 -07:00 · 2026-05-17 11:28:33 -07:00 · 9bb88b168f
commit 9bb88b168f
parent 13ca082a43
3 changed files with 308 additions and 0 deletions
--- a/app/data/log_corpus.py
+++ b/app/data/log_corpus.py
@@ -34,6 +34,8 @@ router = APIRouter()

 _DB_PATH: Path = _ROOT / "data" / "corpus.db"

+_PIPELINE_SOURCE_HOST = "pipeline_scrape"
+
 _SCHEMA = """
 CREATE TABLE IF NOT EXISTS corpus_sources (
    token           TEXT PRIMARY KEY,
@@ -77,6 +79,12 @@ CREATE TABLE IF NOT EXISTS corpus_entries (
 CREATE INDEX IF NOT EXISTS idx_ce_label_state ON corpus_entries(label_state);
 CREATE INDEX IF NOT EXISTS idx_ce_source      ON corpus_entries(source_host);
 CREATE INDEX IF NOT EXISTS idx_ce_severity    ON corpus_entries(severity);
+
+CREATE TABLE IF NOT EXISTS ingested_pipeline_files (
+    filename    TEXT PRIMARY KEY,
+    ingested_at TEXT NOT NULL,
+    entry_count INTEGER NOT NULL
+);
 """


@@ -122,6 +130,19 @@ def _init_db() -> None:
        _seed_sources(conn)


+def _pipeline_ingest_dir() -> Path | None:
+    """Return the configured pipeline log ingest directory, or None if unset."""
+    f = _config_file()
+    if not f.exists():
+        return None
+    try:
+        raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
+    except yaml.YAMLError:
+        return None
+    val = raw.get("corpus", {}).get("pipeline_ingest_dir", "") or ""
+    return Path(val) if val else None
+
+
 def _load_corpus_config() -> list[dict]:
    f = _config_file()
    if not f.exists():
@@ -350,3 +371,92 @@ def export_labeled() -> StreamingResponse:
        media_type="application/x-ndjson",
        headers={"Content-Disposition": "attachment; filename=log_corpus_labeled.jsonl"},
    )
+
+
+# ── POST /api/corpus/pipeline-ingest ─────────────────────────────────────────
+
+def _ingest_one_file(conn: sqlite3.Connection, path: Path) -> int:
+    """Parse a pipeline JSONL file and insert entries. Returns count stored."""
+    batch_id = str(uuid.uuid4())
+    lines = path.read_text(encoding="utf-8").splitlines()
+    entries_raw: list[dict] = []
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            entries_raw.append(json.loads(line))
+        except json.JSONDecodeError:
+            logger.debug("Skipping malformed line in %s", path.name)
+
+    conn.execute(
+        "INSERT INTO corpus_batches (id, source_host, batch_type, received_at, entry_count, raw_json) "
+        "VALUES (?, ?, ?, ?, ?, ?)",
+        (batch_id, _PIPELINE_SOURCE_HOST, "pipeline_log", _now_iso(),
+         len(entries_raw), json.dumps({"file": path.name})),
+    )
+
+    stored = 0
+    for entry in entries_raw:
+        text = (entry.get("msg") or "").strip()
+        if not text:
+            continue
+        conn.execute(
+            "INSERT OR IGNORE INTO corpus_entries "
+            "(id, batch_id, source_host, timestamp_iso, severity, source_id, text, matched_patterns) "
+            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
+            (str(uuid.uuid4()), batch_id, _PIPELINE_SOURCE_HOST,
+             entry.get("ts"),
+             entry.get("level"),
+             entry.get("logger"),
+             text,
+             json.dumps([entry["extra"]] if entry.get("extra") else [])),
+        )
+        stored += 1
+
+    conn.execute(
+        "INSERT INTO ingested_pipeline_files (filename, ingested_at, entry_count) VALUES (?, ?, ?)",
+        (path.name, _now_iso(), stored),
+    )
+    return stored
+
+
+@router.post("/pipeline-ingest")
+def pipeline_ingest() -> dict:
+    """Walk the configured pipeline log directory and ingest new JSONL files.
+
+    Skips files already recorded in ingested_pipeline_files. Safe to call
+    repeatedly — idempotent by filename.
+    """
+    ingest_dir = _pipeline_ingest_dir()
+    if ingest_dir is None:
+        raise HTTPException(404, "pipeline_ingest_dir not configured in label_tool.yaml")
+
+    ingested = 0
+    skipped = 0
+    total_stored = 0
+    files_detail: list[dict] = []
+
+    with _db() as conn:
+        already_done: set[str] = {
+            row[0]
+            for row in conn.execute("SELECT filename FROM ingested_pipeline_files").fetchall()
+        }
+
+        for path in sorted(ingest_dir.glob("*.jsonl")):
+            if path.name in already_done:
+                skipped += 1
+                continue
+            stored = _ingest_one_file(conn, path)
+            ingested += 1
+            total_stored += stored
+            files_detail.append({"file": path.name, "entries_stored": stored})
+
+    logger.info("Pipeline ingest: %d files ingested, %d skipped, %d entries stored",
+                ingested, skipped, total_stored)
+    return {
+        "ingested_files": ingested,
+        "skipped_files": skipped,
+        "entries_stored": total_stored,
+        "files": files_detail,
+    }
--- a/config/label_tool.yaml.example
+++ b/config/label_tool.yaml.example
@@ -122,6 +122,22 @@ imitate:
      text_fields: [title]
      prompt_template: "Summarize the key rules described in this passage:\n\n{text}"

+# ── Log corpus (Turnstone training data) ──────────────────────────────────────
+corpus:
+  # Directory containing pipeline JSONL log files to ingest (pull-side).
+  # Files named <script>_<ts>.jsonl; one structured record per line.
+  # POST /api/corpus/pipeline-ingest walks this dir and imports new files.
+  # NFS-mounted on both Heimdall and Sif at /Library/Assets/
+  pipeline_ingest_dir: /Library/Assets/logs/pipeline/
+
+  # Turnstone push sources (consent-gated, token-authenticated).
+  # sources:
+  #   - token: "your-bearer-token"
+  #     source_host: "node.local"
+  #     owner: YourName
+  #     consent_date: "2026-05-17"
+  #     consent_method: signal_chat
+
 # ── Embedding model comparison harness ────────────────────────────────────────
 embed_bench:
  # ollama_url: http://localhost:11434   # optional; falls back to cforch.ollama_url
--- a/tests/test_log_corpus.py
+++ b/tests/test_log_corpus.py
@@ -270,3 +270,185 @@ def test_export_excludes_pii_flagged(client):

    resp = client.get("/api/corpus/export")
    assert resp.text.strip() == ""
+
+
+# ── Pipeline ingest endpoint ───────────────────────────────────────────────────
+
+def _make_pipeline_file(directory: Path, name: str, lines: list[dict]) -> Path:
+    """Write a JSONL pipeline log file to directory."""
+    p = directory / name
+    p.write_text("\n".join(json.dumps(l) for l in lines), encoding="utf-8")
+    return p
+
+
+_PIPELINE_LINE = {
+    "ts": "2026-05-17T10:00:00Z",
+    "level": "INFO",
+    "logger": "scripts.pipeline.purple_carrot_scraper",
+    "msg": "Fetched recipe page",
+    "extra": {"url": "https://example.com/recipe/1", "status": 200},
+}
+
+
+def test_pipeline_ingest_returns_404_when_dir_not_configured(client, tmp_path):
+    """No pipeline_ingest_dir in config — endpoint returns 404."""
+    resp = client.post("/api/corpus/pipeline-ingest")
+    assert resp.status_code == 404
+
+
+def test_pipeline_ingest_empty_dir(client, tmp_path, monkeypatch):
+    """Configured dir exists but is empty — returns zeros, no error."""
+    ingest_dir = tmp_path / "pipeline_logs"
+    ingest_dir.mkdir()
+    config_dir = tmp_path / "config"
+    config_dir.mkdir(exist_ok=True)
+    (config_dir / "label_tool.yaml").write_text(
+        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
+    )
+    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)
+
+    resp = client.post("/api/corpus/pipeline-ingest")
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["ingested_files"] == 0
+    assert data["skipped_files"] == 0
+    assert data["entries_stored"] == 0
+
+
+def test_pipeline_ingest_ingests_valid_file(client, tmp_path, monkeypatch):
+    """Valid JSONL file is ingested; entries appear in corpus."""
+    ingest_dir = tmp_path / "pipeline_logs"
+    ingest_dir.mkdir()
+    _make_pipeline_file(ingest_dir, "scraper_20260517.jsonl", [
+        _PIPELINE_LINE,
+        {**_PIPELINE_LINE, "msg": "Saved 3 recipes", "level": "INFO"},
+    ])
+
+    config_dir = tmp_path / "config"
+    config_dir.mkdir(exist_ok=True)
+    (config_dir / "label_tool.yaml").write_text(
+        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
+    )
+    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)
+
+    resp = client.post("/api/corpus/pipeline-ingest")
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["ingested_files"] == 1
+    assert data["entries_stored"] == 2
+
+    entries = client.get("/api/corpus/entries", params={"limit": 10}).json()["entries"]
+    assert len(entries) == 2
+    assert all(e["source_host"] == "pipeline_scrape" for e in entries)
+
+
+def test_pipeline_ingest_source_id_from_logger(client, tmp_path, monkeypatch):
+    """source_id is populated from the 'logger' field of each log line."""
+    ingest_dir = tmp_path / "pipeline_logs"
+    ingest_dir.mkdir()
+    _make_pipeline_file(ingest_dir, "run_20260517.jsonl", [_PIPELINE_LINE])
+
+    config_dir = tmp_path / "config"
+    config_dir.mkdir(exist_ok=True)
+    (config_dir / "label_tool.yaml").write_text(
+        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
+    )
+    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)
+
+    client.post("/api/corpus/pipeline-ingest")
+    entries = client.get("/api/corpus/entries", params={"limit": 10}).json()["entries"]
+    assert entries[0]["source_id"] == "scripts.pipeline.purple_carrot_scraper"
+
+
+def test_pipeline_ingest_idempotent(client, tmp_path, monkeypatch):
+    """Calling the endpoint twice does not re-ingest already-processed files."""
+    ingest_dir = tmp_path / "pipeline_logs"
+    ingest_dir.mkdir()
+    _make_pipeline_file(ingest_dir, "scraper_20260517.jsonl", [_PIPELINE_LINE])
+
+    config_dir = tmp_path / "config"
+    config_dir.mkdir(exist_ok=True)
+    (config_dir / "label_tool.yaml").write_text(
+        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
+    )
+    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)
+
+    client.post("/api/corpus/pipeline-ingest")
+    resp2 = client.post("/api/corpus/pipeline-ingest")
+
+    data = resp2.json()
+    assert data["ingested_files"] == 0
+    assert data["skipped_files"] == 1
+    assert data["entries_stored"] == 0
+
+    entries = client.get("/api/corpus/entries", params={"limit": 10}).json()["entries"]
+    assert len(entries) == 1  # still just the one from the first ingest
+
+
+def test_pipeline_ingest_skips_non_jsonl(client, tmp_path, monkeypatch):
+    """Non-.jsonl files in the dir are silently ignored."""
+    ingest_dir = tmp_path / "pipeline_logs"
+    ingest_dir.mkdir()
+    (ingest_dir / "notes.txt").write_text("this is not a log file")
+    (ingest_dir / "run.csv").write_text("a,b,c\n1,2,3")
+    _make_pipeline_file(ingest_dir, "valid_20260517.jsonl", [_PIPELINE_LINE])
+
+    config_dir = tmp_path / "config"
+    config_dir.mkdir(exist_ok=True)
+    (config_dir / "label_tool.yaml").write_text(
+        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
+    )
+    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)
+
+    resp = client.post("/api/corpus/pipeline-ingest")
+    assert resp.json()["ingested_files"] == 1
+
+
+def test_pipeline_ingest_skips_malformed_lines(client, tmp_path, monkeypatch):
+    """Lines that are not valid JSON are skipped; valid lines in the same file still land."""
+    ingest_dir = tmp_path / "pipeline_logs"
+    ingest_dir.mkdir()
+    p = ingest_dir / "mixed_20260517.jsonl"
+    p.write_text(
+        json.dumps(_PIPELINE_LINE) + "\n"
+        "this is not json\n"
+        + json.dumps({**_PIPELINE_LINE, "msg": "another valid line"}),
+        encoding="utf-8",
+    )
+
+    config_dir = tmp_path / "config"
+    config_dir.mkdir(exist_ok=True)
+    (config_dir / "label_tool.yaml").write_text(
+        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
+    )
+    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)
+
+    resp = client.post("/api/corpus/pipeline-ingest")
+    assert resp.status_code == 200
+    assert resp.json()["entries_stored"] == 2  # 2 valid lines, 1 skipped
+
+
+def test_pipeline_ingest_new_file_after_first_run(client, tmp_path, monkeypatch):
+    """A new file added after the first ingest is picked up on the next call."""
+    ingest_dir = tmp_path / "pipeline_logs"
+    ingest_dir.mkdir()
+    _make_pipeline_file(ingest_dir, "run_a.jsonl", [_PIPELINE_LINE])
+
+    config_dir = tmp_path / "config"
+    config_dir.mkdir(exist_ok=True)
+    (config_dir / "label_tool.yaml").write_text(
+        f"corpus:\n  pipeline_ingest_dir: \"{ingest_dir}\"\n  sources: []\n"
+    )
+    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)
+
+    client.post("/api/corpus/pipeline-ingest")  # ingest run_a.jsonl
+
+    _make_pipeline_file(ingest_dir, "run_b.jsonl", [
+        {**_PIPELINE_LINE, "msg": "Second run line"},
+    ])
+
+    resp2 = client.post("/api/corpus/pipeline-ingest")
+    data = resp2.json()
+    assert data["ingested_files"] == 1
+    assert data["skipped_files"] == 1
+    assert data["entries_stored"] == 1