feat(corpus): pipeline log ingest from shared dir (closes #67)

Pull-side companion to kiwi#141. Ingests structured JSONL pipeline logs
from /Library/Assets/logs/pipeline/ into the log corpus for Turnstone
logreading model training.

- app/data/log_corpus.py: add ingested_pipeline_files tracking table,
  _pipeline_ingest_dir() config helper, _ingest_one_file() parser, and
  POST /api/corpus/pipeline-ingest endpoint
- source_host = "pipeline_scrape"; source_id from logger field; extra
  dict stored as matched_patterns; batch_type = "pipeline_log"
- Idempotent by filename: skips files already in ingested_pipeline_files
- config/label_tool.yaml.example: add corpus section with pipeline_ingest_dir
  and push sources comment block
- tests/test_log_corpus.py: 8 new tests covering ingest, idempotency,
  non-JSONL filtering, malformed line resilience, incremental runs
This commit is contained in:
pyr0ball 2026-05-17 11:28:33 -07:00
parent 13ca082a43
commit 9bb88b168f
3 changed files with 308 additions and 0 deletions

View file

@ -34,6 +34,8 @@ router = APIRouter()
_DB_PATH: Path = _ROOT / "data" / "corpus.db" _DB_PATH: Path = _ROOT / "data" / "corpus.db"
_PIPELINE_SOURCE_HOST = "pipeline_scrape"
_SCHEMA = """ _SCHEMA = """
CREATE TABLE IF NOT EXISTS corpus_sources ( CREATE TABLE IF NOT EXISTS corpus_sources (
token TEXT PRIMARY KEY, token TEXT PRIMARY KEY,
@ -77,6 +79,12 @@ CREATE TABLE IF NOT EXISTS corpus_entries (
CREATE INDEX IF NOT EXISTS idx_ce_label_state ON corpus_entries(label_state); CREATE INDEX IF NOT EXISTS idx_ce_label_state ON corpus_entries(label_state);
CREATE INDEX IF NOT EXISTS idx_ce_source ON corpus_entries(source_host); CREATE INDEX IF NOT EXISTS idx_ce_source ON corpus_entries(source_host);
CREATE INDEX IF NOT EXISTS idx_ce_severity ON corpus_entries(severity); CREATE INDEX IF NOT EXISTS idx_ce_severity ON corpus_entries(severity);
CREATE TABLE IF NOT EXISTS ingested_pipeline_files (
filename TEXT PRIMARY KEY,
ingested_at TEXT NOT NULL,
entry_count INTEGER NOT NULL
);
""" """
@ -122,6 +130,19 @@ def _init_db() -> None:
_seed_sources(conn) _seed_sources(conn)
def _pipeline_ingest_dir() -> Path | None:
    """Return the configured pipeline log ingest directory, or None if unset.

    Reads ``corpus.pipeline_ingest_dir`` from the file returned by
    ``_config_file()``.

    Returns:
        The configured directory as a ``Path``, or ``None`` when the config
        file is missing, is not parseable YAML, does not contain a ``corpus``
        mapping, or has an empty/absent ``pipeline_ingest_dir`` value.
    """
    f = _config_file()
    if not f.exists():
        return None
    try:
        raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
    except yaml.YAMLError:
        return None
    # The document root may be a list or scalar, and a bare ``corpus:`` key
    # parses to None; neither supports .get(), so treat both as "unset".
    if not isinstance(raw, dict):
        return None
    corpus = raw.get("corpus")
    if not isinstance(corpus, dict):
        return None
    val = corpus.get("pipeline_ingest_dir") or ""
    # str() keeps Path() from raising on a non-string scalar in the config.
    return Path(str(val)) if val else None
def _load_corpus_config() -> list[dict]: def _load_corpus_config() -> list[dict]:
f = _config_file() f = _config_file()
if not f.exists(): if not f.exists():
@ -350,3 +371,92 @@ def export_labeled() -> StreamingResponse:
media_type="application/x-ndjson", media_type="application/x-ndjson",
headers={"Content-Disposition": "attachment; filename=log_corpus_labeled.jsonl"}, headers={"Content-Disposition": "attachment; filename=log_corpus_labeled.jsonl"},
) )
# ── POST /api/corpus/pipeline-ingest ─────────────────────────────────────────
def _ingest_one_file(conn: sqlite3.Connection, path: Path) -> int:
    """Parse a pipeline JSONL file and insert its entries. Returns count stored.

    One ``corpus_batches`` row is written for the file, then one
    ``corpus_entries`` row per usable record, and finally the file is
    recorded in ``ingested_pipeline_files`` so later runs skip it.

    A line is skipped (never fatal) when it is blank, is not valid JSON,
    is valid JSON but not an object, or has a missing, empty or non-string
    ``msg``.

    Args:
        conn: Open SQLite connection; committing is left to the caller.
        path: The ``.jsonl`` file to ingest.

    Returns:
        Number of records for which an entry insert was issued.
    """
    batch_id = str(uuid.uuid4())
    entries_raw: list[dict] = []
    for line in path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            logger.debug("Skipping malformed line in %s", path.name)
            continue
        # json.loads accepts any JSON value; only objects carry the fields
        # read below, so scalars and arrays are dropped here.
        if not isinstance(record, dict):
            logger.debug("Skipping non-object line in %s", path.name)
            continue
        entries_raw.append(record)

    conn.execute(
        "INSERT INTO corpus_batches (id, source_host, batch_type, received_at, entry_count, raw_json) "
        "VALUES (?, ?, ?, ?, ?, ?)",
        (batch_id, _PIPELINE_SOURCE_HOST, "pipeline_log", _now_iso(),
         len(entries_raw), json.dumps({"file": path.name})),
    )

    stored = 0
    for entry in entries_raw:
        msg = entry.get("msg")
        # A non-string msg is treated like a missing one.
        text = msg.strip() if isinstance(msg, str) else ""
        if not text:
            continue
        extra = entry.get("extra")
        conn.execute(
            "INSERT OR IGNORE INTO corpus_entries "
            "(id, batch_id, source_host, timestamp_iso, severity, source_id, text, matched_patterns) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (str(uuid.uuid4()), batch_id, _PIPELINE_SOURCE_HOST,
             entry.get("ts"),
             entry.get("level"),
             entry.get("logger"),
             text,
             # The extra dict is stored as a single-element matched_patterns list.
             json.dumps([extra] if extra else [])),
        )
        stored += 1

    conn.execute(
        "INSERT INTO ingested_pipeline_files (filename, ingested_at, entry_count) VALUES (?, ?, ?)",
        (path.name, _now_iso(), stored),
    )
    return stored
@router.post("/pipeline-ingest")
def pipeline_ingest() -> dict:
    """Walk the configured pipeline log directory and ingest new JSONL files.

    Skips files already recorded in ingested_pipeline_files. Safe to call
    repeatedly; idempotent by filename.

    Returns:
        A summary dict with ``ingested_files``, ``skipped_files``,
        ``entries_stored`` and a per-file ``files`` list.

    Raises:
        HTTPException: 404 when pipeline_ingest_dir is not configured.
    """
    ingest_dir = _pipeline_ingest_dir()
    if ingest_dir is None:
        raise HTTPException(404, "pipeline_ingest_dir not configured in label_tool.yaml")

    ingested = 0
    skipped = 0
    total_stored = 0
    files_detail: list[dict] = []

    with _db() as conn:
        already_done: set[str] = {
            row[0]
            for row in conn.execute("SELECT filename FROM ingested_pipeline_files").fetchall()
        }
        # Sorted so files are processed in a stable, name-based order.
        for path in sorted(ingest_dir.glob("*.jsonl")):
            # The glob matches on name only; a directory called "x.jsonl"
            # would make read_text() raise and fail the whole request.
            if not path.is_file():
                continue
            if path.name in already_done:
                skipped += 1
                continue
            stored = _ingest_one_file(conn, path)
            ingested += 1
            total_stored += stored
            files_detail.append({"file": path.name, "entries_stored": stored})

    logger.info("Pipeline ingest: %d files ingested, %d skipped, %d entries stored",
                ingested, skipped, total_stored)
    return {
        "ingested_files": ingested,
        "skipped_files": skipped,
        "entries_stored": total_stored,
        "files": files_detail,
    }

View file

@ -122,6 +122,22 @@ imitate:
text_fields: [title] text_fields: [title]
prompt_template: "Summarize the key rules described in this passage:\n\n{text}" prompt_template: "Summarize the key rules described in this passage:\n\n{text}"
# ── Log corpus (Turnstone training data) ──────────────────────────────────────
corpus:
# Directory containing pipeline JSONL log files to ingest (pull-side).
# Files named <script>_<ts>.jsonl; one structured record per line.
# POST /api/corpus/pipeline-ingest walks this dir and imports new files.
# NFS-mounted on both Heimdall and Sif at /Library/Assets/
pipeline_ingest_dir: /Library/Assets/logs/pipeline/
# Turnstone push sources (consent-gated, token-authenticated).
# sources:
# - token: "your-bearer-token"
# source_host: "node.local"
# owner: YourName
# consent_date: "2026-05-17"
# consent_method: signal_chat
# ── Embedding model comparison harness ──────────────────────────────────────── # ── Embedding model comparison harness ────────────────────────────────────────
embed_bench: embed_bench:
# ollama_url: http://localhost:11434 # optional; falls back to cforch.ollama_url # ollama_url: http://localhost:11434 # optional; falls back to cforch.ollama_url

View file

@ -270,3 +270,185 @@ def test_export_excludes_pii_flagged(client):
resp = client.get("/api/corpus/export") resp = client.get("/api/corpus/export")
assert resp.text.strip() == "" assert resp.text.strip() == ""
# ── Pipeline ingest endpoint ───────────────────────────────────────────────────
def _make_pipeline_file(directory: Path, name: str, lines: list[dict]) -> Path:
"""Write a JSONL pipeline log file to directory."""
p = directory / name
p.write_text("\n".join(json.dumps(l) for l in lines), encoding="utf-8")
return p
# Baseline structured pipeline log record used by the ingest tests below.
# Tests that need a variant override individual keys, e.g.
# {**_PIPELINE_LINE, "msg": "..."}.
_PIPELINE_LINE = {
"ts": "2026-05-17T10:00:00Z",
"level": "INFO",
"logger": "scripts.pipeline.purple_carrot_scraper",
"msg": "Fetched recipe page",
"extra": {"url": "https://example.com/recipe/1", "status": 200},
}
def test_pipeline_ingest_returns_404_when_dir_not_configured(client, tmp_path):
    """Without a configured pipeline_ingest_dir the endpoint answers 404."""
    status = client.post("/api/corpus/pipeline-ingest").status_code
    assert status == 404
def test_pipeline_ingest_empty_dir(client, tmp_path, monkeypatch):
    """An existing but empty ingest dir yields HTTP 200 and all-zero counters."""
    logs_root = tmp_path / "pipeline_logs"
    logs_root.mkdir()

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_file = cfg_root / "label_tool.yaml"
    cfg_file.write_text(
        f"corpus:\n pipeline_ingest_dir: \"{logs_root}\"\n sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    response = client.post("/api/corpus/pipeline-ingest")

    assert response.status_code == 200
    summary = response.json()
    for counter in ("ingested_files", "skipped_files", "entries_stored"):
        assert summary[counter] == 0
def test_pipeline_ingest_ingests_valid_file(client, tmp_path, monkeypatch):
    """A well-formed JSONL file is imported and its entries become queryable."""
    logs_root = tmp_path / "pipeline_logs"
    logs_root.mkdir()
    second_record = {**_PIPELINE_LINE, "msg": "Saved 3 recipes", "level": "INFO"}
    _make_pipeline_file(
        logs_root, "scraper_20260517.jsonl", [_PIPELINE_LINE, second_record]
    )

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_file = cfg_root / "label_tool.yaml"
    cfg_file.write_text(
        f"corpus:\n pipeline_ingest_dir: \"{logs_root}\"\n sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200
    summary = response.json()
    assert summary["ingested_files"] == 1
    assert summary["entries_stored"] == 2

    listing = client.get("/api/corpus/entries", params={"limit": 10})
    entries = listing.json()["entries"]
    assert len(entries) == 2
    for item in entries:
        assert item["source_host"] == "pipeline_scrape"
def test_pipeline_ingest_source_id_from_logger(client, tmp_path, monkeypatch):
    """Each stored entry takes its source_id from the record's 'logger' field."""
    logs_root = tmp_path / "pipeline_logs"
    logs_root.mkdir()
    _make_pipeline_file(logs_root, "run_20260517.jsonl", [_PIPELINE_LINE])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_file = cfg_root / "label_tool.yaml"
    cfg_file.write_text(
        f"corpus:\n pipeline_ingest_dir: \"{logs_root}\"\n sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    client.post("/api/corpus/pipeline-ingest")

    listing = client.get("/api/corpus/entries", params={"limit": 10})
    first_entry = listing.json()["entries"][0]
    assert first_entry["source_id"] == "scripts.pipeline.purple_carrot_scraper"
def test_pipeline_ingest_idempotent(client, tmp_path, monkeypatch):
    """A second call reports the file as skipped and stores nothing new."""
    logs_root = tmp_path / "pipeline_logs"
    logs_root.mkdir()
    _make_pipeline_file(logs_root, "scraper_20260517.jsonl", [_PIPELINE_LINE])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_file = cfg_root / "label_tool.yaml"
    cfg_file.write_text(
        f"corpus:\n pipeline_ingest_dir: \"{logs_root}\"\n sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    endpoint = "/api/corpus/pipeline-ingest"
    client.post(endpoint)
    second_summary = client.post(endpoint).json()

    assert second_summary["ingested_files"] == 0
    assert second_summary["skipped_files"] == 1
    assert second_summary["entries_stored"] == 0

    listing = client.get("/api/corpus/entries", params={"limit": 10})
    entries = listing.json()["entries"]
    # Only the entry from the first ingest should exist.
    assert len(entries) == 1
def test_pipeline_ingest_skips_non_jsonl(client, tmp_path, monkeypatch):
    """Files without a .jsonl suffix are ignored; only the JSONL file counts."""
    logs_root = tmp_path / "pipeline_logs"
    logs_root.mkdir()
    txt_file = logs_root / "notes.txt"
    txt_file.write_text("this is not a log file")
    csv_file = logs_root / "run.csv"
    csv_file.write_text("a,b,c\n1,2,3")
    _make_pipeline_file(logs_root, "valid_20260517.jsonl", [_PIPELINE_LINE])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_file = cfg_root / "label_tool.yaml"
    cfg_file.write_text(
        f"corpus:\n pipeline_ingest_dir: \"{logs_root}\"\n sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    summary = client.post("/api/corpus/pipeline-ingest").json()
    assert summary["ingested_files"] == 1
def test_pipeline_ingest_skips_malformed_lines(client, tmp_path, monkeypatch):
    """A non-JSON line is dropped while valid lines in the same file are stored."""
    logs_root = tmp_path / "pipeline_logs"
    logs_root.mkdir()

    good_first = json.dumps(_PIPELINE_LINE)
    good_last = json.dumps({**_PIPELINE_LINE, "msg": "another valid line"})
    pieces = [good_first, "\n", "this is not json\n", good_last]
    mixed_file = logs_root / "mixed_20260517.jsonl"
    mixed_file.write_text("".join(pieces), encoding="utf-8")

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_file = cfg_root / "label_tool.yaml"
    cfg_file.write_text(
        f"corpus:\n pipeline_ingest_dir: \"{logs_root}\"\n sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200
    # Two valid lines land; the malformed middle line is skipped.
    stored_count = response.json()["entries_stored"]
    assert stored_count == 2
def test_pipeline_ingest_new_file_after_first_run(client, tmp_path, monkeypatch):
    """A file added between calls is ingested while the earlier one is skipped."""
    logs_root = tmp_path / "pipeline_logs"
    logs_root.mkdir()
    _make_pipeline_file(logs_root, "run_a.jsonl", [_PIPELINE_LINE])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    cfg_file = cfg_root / "label_tool.yaml"
    cfg_file.write_text(
        f"corpus:\n pipeline_ingest_dir: \"{logs_root}\"\n sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    endpoint = "/api/corpus/pipeline-ingest"
    # First call picks up run_a.jsonl.
    client.post(endpoint)

    later_record = {**_PIPELINE_LINE, "msg": "Second run line"}
    _make_pipeline_file(logs_root, "run_b.jsonl", [later_record])

    second_summary = client.post(endpoint).json()
    assert second_summary["ingested_files"] == 1
    assert second_summary["skipped_files"] == 1
    assert second_summary["entries_stored"] == 1