feat(corpus): pipeline log ingest from shared dir (closes #67)
Pull-side companion to kiwi#141. Ingests structured JSONL pipeline logs from /Library/Assets/logs/pipeline/ into the log corpus for Turnstone log-reading model training.
- app/data/log_corpus.py: add the ingested_pipeline_files tracking table, the _pipeline_ingest_dir() config helper, the _ingest_one_file() parser, and the POST /api/corpus/pipeline-ingest endpoint
- source_host = "pipeline_scrape"; source_id is taken from the logger field; the extra dict is stored as matched_patterns; batch_type = "pipeline_log"
- Idempotent by filename: skips files already recorded in ingested_pipeline_files
- config/label_tool.yaml.example: add a corpus section with pipeline_ingest_dir and a commented-out push-sources block
- tests/test_log_corpus.py: 8 new tests covering ingest, idempotency, non-JSONL filtering, malformed-line resilience, and incremental runs
This commit is contained in:
parent
13ca082a43
commit
9bb88b168f
3 changed files with 308 additions and 0 deletions
|
|
@ -34,6 +34,8 @@ router = APIRouter()
|
||||||
|
|
||||||
_DB_PATH: Path = _ROOT / "data" / "corpus.db"
|
_DB_PATH: Path = _ROOT / "data" / "corpus.db"
|
||||||
|
|
||||||
|
# source_host value written on every batch and entry created by the pipeline-log ingest.
_PIPELINE_SOURCE_HOST = "pipeline_scrape"
|
||||||
|
|
||||||
_SCHEMA = """
|
_SCHEMA = """
|
||||||
CREATE TABLE IF NOT EXISTS corpus_sources (
|
CREATE TABLE IF NOT EXISTS corpus_sources (
|
||||||
token TEXT PRIMARY KEY,
|
token TEXT PRIMARY KEY,
|
||||||
|
|
@ -77,6 +79,12 @@ CREATE TABLE IF NOT EXISTS corpus_entries (
|
||||||
CREATE INDEX IF NOT EXISTS idx_ce_label_state ON corpus_entries(label_state);
|
CREATE INDEX IF NOT EXISTS idx_ce_label_state ON corpus_entries(label_state);
|
||||||
CREATE INDEX IF NOT EXISTS idx_ce_source ON corpus_entries(source_host);
|
CREATE INDEX IF NOT EXISTS idx_ce_source ON corpus_entries(source_host);
|
||||||
CREATE INDEX IF NOT EXISTS idx_ce_severity ON corpus_entries(severity);
|
CREATE INDEX IF NOT EXISTS idx_ce_severity ON corpus_entries(severity);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS ingested_pipeline_files (
|
||||||
|
filename TEXT PRIMARY KEY,
|
||||||
|
ingested_at TEXT NOT NULL,
|
||||||
|
entry_count INTEGER NOT NULL
|
||||||
|
);
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -122,6 +130,19 @@ def _init_db() -> None:
|
||||||
_seed_sources(conn)
|
_seed_sources(conn)
|
||||||
|
|
||||||
|
|
||||||
|
def _pipeline_ingest_dir() -> Path | None:
    """Return the configured pipeline log ingest directory, or None if unset.

    Reads ``corpus.pipeline_ingest_dir`` from the file returned by
    ``_config_file()``.

    Returns:
        The configured directory as a ``Path``, or ``None`` when the config
        file does not exist, is not valid YAML, has no mapping-shaped
        ``corpus`` section, or leaves the setting empty / non-string.
    """
    f = _config_file()
    if not f.exists():
        return None
    try:
        raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
    except yaml.YAMLError:
        return None

    # A bare ``corpus:`` key (e.g. every child commented out) loads as None,
    # and a malformed file may hold a list or scalar at either level. Treat
    # all of these as "not configured" instead of raising AttributeError.
    if not isinstance(raw, dict):
        return None
    corpus = raw.get("corpus")
    if not isinstance(corpus, dict):
        return None

    val = corpus.get("pipeline_ingest_dir")
    if not isinstance(val, str) or not val.strip():
        return None
    return Path(val)
|
||||||
|
|
||||||
|
|
||||||
def _load_corpus_config() -> list[dict]:
|
def _load_corpus_config() -> list[dict]:
|
||||||
f = _config_file()
|
f = _config_file()
|
||||||
if not f.exists():
|
if not f.exists():
|
||||||
|
|
@ -350,3 +371,92 @@ def export_labeled() -> StreamingResponse:
|
||||||
media_type="application/x-ndjson",
|
media_type="application/x-ndjson",
|
||||||
headers={"Content-Disposition": "attachment; filename=log_corpus_labeled.jsonl"},
|
headers={"Content-Disposition": "attachment; filename=log_corpus_labeled.jsonl"},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── POST /api/corpus/pipeline-ingest ─────────────────────────────────────────
|
||||||
|
|
||||||
|
def _bindable(value: object) -> object:
    """Return *value* unchanged unless it is a dict/list, which sqlite3 cannot bind; those become JSON text."""
    if isinstance(value, (dict, list)):
        return json.dumps(value)
    return value


def _ingest_one_file(conn: sqlite3.Connection, path: Path) -> int:
    """Parse a pipeline JSONL file and insert entries. Returns count stored.

    One ``corpus_batches`` row is written for the file, then one
    ``corpus_entries`` row per usable record, and finally the file is recorded
    in ``ingested_pipeline_files``. Record fields map to columns as:
    ``ts`` -> timestamp_iso, ``level`` -> severity, ``logger`` -> source_id,
    ``msg`` -> text, ``extra`` -> matched_patterns (wrapped in a one-element
    JSON list, or ``[]`` when absent/empty).

    Lines that are blank, not valid JSON, or not a JSON object are skipped, as
    are records whose ``msg`` is missing, non-string, or whitespace-only.

    Args:
        conn: Open SQLite connection. Nothing is committed here; transaction
            handling is left to the caller.
        path: The ``.jsonl`` file to read.

    Returns:
        Number of rows inserted into ``corpus_entries`` for this file.
    """
    batch_id = str(uuid.uuid4())
    # errors="replace": a stray non-UTF-8 byte should cost one character,
    # not abort the ingest of the whole file.
    lines = path.read_text(encoding="utf-8", errors="replace").splitlines()

    entries_raw: list[dict] = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            logger.debug("Skipping malformed line in %s", path.name)
            continue
        # Valid JSON is not necessarily an object: a bare list, number or
        # string would crash on .get() below, so drop it here.
        if not isinstance(record, dict):
            logger.debug("Skipping non-object line in %s", path.name)
            continue
        entries_raw.append(record)

    conn.execute(
        "INSERT INTO corpus_batches (id, source_host, batch_type, received_at, entry_count, raw_json) "
        "VALUES (?, ?, ?, ?, ?, ?)",
        (batch_id, _PIPELINE_SOURCE_HOST, "pipeline_log", _now_iso(),
         len(entries_raw), json.dumps({"file": path.name})),
    )

    stored = 0
    for entry in entries_raw:
        msg = entry.get("msg")
        # A non-string msg (number, object, null) carries no usable log text.
        text = msg.strip() if isinstance(msg, str) else ""
        if not text:
            continue
        extra = entry.get("extra")
        conn.execute(
            "INSERT OR IGNORE INTO corpus_entries "
            "(id, batch_id, source_host, timestamp_iso, severity, source_id, text, matched_patterns) "
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (str(uuid.uuid4()), batch_id, _PIPELINE_SOURCE_HOST,
             _bindable(entry.get("ts")),
             _bindable(entry.get("level")),
             _bindable(entry.get("logger")),
             text,
             json.dumps([extra] if extra else [])),
        )
        stored += 1

    conn.execute(
        "INSERT INTO ingested_pipeline_files (filename, ingested_at, entry_count) VALUES (?, ?, ?)",
        (path.name, _now_iso(), stored),
    )
    return stored
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/pipeline-ingest")
def pipeline_ingest() -> dict:
    """Walk the configured pipeline log directory and ingest new JSONL files.

    Skips files already recorded in ingested_pipeline_files. Safe to call
    repeatedly — idempotent by filename.

    Returns:
        A dict with ``ingested_files`` and ``skipped_files`` (counts),
        ``entries_stored`` (total rows stored by this call) and ``files``
        (one ``{"file", "entries_stored"}`` dict per newly ingested file).

    Raises:
        HTTPException: 404 when no pipeline ingest directory is configured.
    """
    ingest_dir = _pipeline_ingest_dir()
    if ingest_dir is None:
        raise HTTPException(404, "pipeline_ingest_dir not configured in label_tool.yaml")

    # A configured-but-absent directory (e.g. the share is not mounted) still
    # returns an all-zero result as before, but now leaves a trace in the log
    # instead of looking like a quiet run with nothing new.
    if ingest_dir.is_dir():
        candidates = sorted(ingest_dir.glob("*.jsonl"))
    else:
        logger.warning("Pipeline ingest dir %s does not exist or is not a directory", ingest_dir)
        candidates = []

    ingested = 0
    skipped = 0
    total_stored = 0
    files_detail: list[dict] = []

    with _db() as conn:
        already_done: set[str] = {
            row[0]
            for row in conn.execute("SELECT filename FROM ingested_pipeline_files").fetchall()
        }

        for path in candidates:
            # The glob also matches directories named "*.jsonl"; reading one
            # as text would raise and abort the whole run.
            if not path.is_file():
                continue
            if path.name in already_done:
                skipped += 1
                continue
            stored = _ingest_one_file(conn, path)
            ingested += 1
            total_stored += stored
            files_detail.append({"file": path.name, "entries_stored": stored})

    logger.info("Pipeline ingest: %d files ingested, %d skipped, %d entries stored",
                ingested, skipped, total_stored)
    return {
        "ingested_files": ingested,
        "skipped_files": skipped,
        "entries_stored": total_stored,
        "files": files_detail,
    }
|
||||||
|
|
|
||||||
|
|
@ -122,6 +122,22 @@ imitate:
|
||||||
text_fields: [title]
|
text_fields: [title]
|
||||||
prompt_template: "Summarize the key rules described in this passage:\n\n{text}"
|
prompt_template: "Summarize the key rules described in this passage:\n\n{text}"
|
||||||
|
|
||||||
|
# ── Log corpus (Turnstone training data) ──────────────────────────────────────
|
||||||
|
corpus:
|
||||||
|
# Directory containing pipeline JSONL log files to ingest (pull-side).
|
||||||
|
# Files named <script>_<ts>.jsonl; one structured record per line.
|
||||||
|
# POST /api/corpus/pipeline-ingest walks this dir and imports new files.
|
||||||
|
# NFS-mounted on both Heimdall and Sif at /Library/Assets/
|
||||||
|
pipeline_ingest_dir: /Library/Assets/logs/pipeline/
|
||||||
|
|
||||||
|
# Turnstone push sources (consent-gated, token-authenticated).
|
||||||
|
# sources:
|
||||||
|
# - token: "your-bearer-token"
|
||||||
|
# source_host: "node.local"
|
||||||
|
# owner: YourName
|
||||||
|
# consent_date: "2026-05-17"
|
||||||
|
# consent_method: signal_chat
|
||||||
|
|
||||||
# ── Embedding model comparison harness ────────────────────────────────────────
|
# ── Embedding model comparison harness ────────────────────────────────────────
|
||||||
embed_bench:
|
embed_bench:
|
||||||
# ollama_url: http://localhost:11434 # optional; falls back to cforch.ollama_url
|
# ollama_url: http://localhost:11434 # optional; falls back to cforch.ollama_url
|
||||||
|
|
|
||||||
|
|
@ -270,3 +270,185 @@ def test_export_excludes_pii_flagged(client):
|
||||||
|
|
||||||
resp = client.get("/api/corpus/export")
|
resp = client.get("/api/corpus/export")
|
||||||
assert resp.text.strip() == ""
|
assert resp.text.strip() == ""
|
||||||
|
|
||||||
|
|
||||||
|
# ── Pipeline ingest endpoint ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _make_pipeline_file(directory: Path, name: str, lines: list[dict]) -> Path:
    """Write a JSONL pipeline log file to directory.

    Each dict in *lines* is serialized with ``json.dumps`` onto its own line;
    lines are joined with ``\\n`` and no trailing newline is written.

    Returns:
        The path of the file that was written (``directory / name``).
    """
    target = directory / name
    target.write_text("\n".join(json.dumps(record) for record in lines), encoding="utf-8")
    return target
|
||||||
|
|
||||||
|
|
||||||
|
# Representative structured pipeline log record shared by the ingest tests.
# Tests derive variants from it with ``{**_PIPELINE_LINE, ...}`` overrides.
_PIPELINE_LINE = {
    "ts": "2026-05-17T10:00:00Z",
    "level": "INFO",
    "logger": "scripts.pipeline.purple_carrot_scraper",
    "msg": "Fetched recipe page",
    "extra": {"url": "https://example.com/recipe/1", "status": 200},
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_ingest_returns_404_when_dir_not_configured(client, tmp_path, monkeypatch):
    """No pipeline_ingest_dir in config — endpoint returns 404."""
    # Point the module at an empty config directory so the outcome cannot
    # depend on whatever label_tool.yaml exists in the ambient config location.
    config_dir = tmp_path / "no_config"
    config_dir.mkdir()
    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)

    resp = client.post("/api/corpus/pipeline-ingest")
    assert resp.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_ingest_empty_dir(client, tmp_path, monkeypatch):
    """An existing but empty ingest directory yields 200 with all counters at zero."""
    log_root = tmp_path / "pipeline_logs"
    log_root.mkdir()

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    yaml_text = f"corpus:\n pipeline_ingest_dir: \"{log_root}\"\n sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(yaml_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    response = client.post("/api/corpus/pipeline-ingest")

    assert response.status_code == 200
    payload = response.json()
    for counter in ("ingested_files", "skipped_files", "entries_stored"):
        assert payload[counter] == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_ingest_ingests_valid_file(client, tmp_path, monkeypatch):
    """A well-formed JSONL file is ingested and its records show up as corpus entries."""
    log_root = tmp_path / "pipeline_logs"
    log_root.mkdir()
    second_record = {**_PIPELINE_LINE, "msg": "Saved 3 recipes", "level": "INFO"}
    _make_pipeline_file(log_root, "scraper_20260517.jsonl", [_PIPELINE_LINE, second_record])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    yaml_text = f"corpus:\n pipeline_ingest_dir: \"{log_root}\"\n sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(yaml_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200
    payload = response.json()
    assert payload["ingested_files"] == 1
    assert payload["entries_stored"] == 2

    listing = client.get("/api/corpus/entries", params={"limit": 10})
    rows = listing.json()["entries"]
    assert len(rows) == 2
    for row in rows:
        assert row["source_host"] == "pipeline_scrape"
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_ingest_source_id_from_logger(client, tmp_path, monkeypatch):
    """source_id is populated from the 'logger' field of each log line."""
    ingest_dir = tmp_path / "pipeline_logs"
    ingest_dir.mkdir()
    _make_pipeline_file(ingest_dir, "run_20260517.jsonl", [_PIPELINE_LINE])

    config_dir = tmp_path / "config"
    config_dir.mkdir(exist_ok=True)
    (config_dir / "label_tool.yaml").write_text(
        f"corpus:\n pipeline_ingest_dir: \"{ingest_dir}\"\n sources: []\n"
    )
    monkeypatch.setattr(lc, "_CONFIG_DIR", config_dir)

    # Check the ingest itself first, so a failed ingest is reported as such
    # rather than as an IndexError on the empty entries list below.
    resp = client.post("/api/corpus/pipeline-ingest")
    assert resp.status_code == 200

    entries = client.get("/api/corpus/entries", params={"limit": 10}).json()["entries"]
    assert len(entries) == 1
    assert entries[0]["source_id"] == "scripts.pipeline.purple_carrot_scraper"
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_ingest_idempotent(client, tmp_path, monkeypatch):
    """A second call skips the already-ingested file and adds no duplicate entries."""
    log_root = tmp_path / "pipeline_logs"
    log_root.mkdir()
    _make_pipeline_file(log_root, "scraper_20260517.jsonl", [_PIPELINE_LINE])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    yaml_text = f"corpus:\n pipeline_ingest_dir: \"{log_root}\"\n sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(yaml_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    endpoint = "/api/corpus/pipeline-ingest"
    client.post(endpoint)
    second = client.post(endpoint)

    payload = second.json()
    assert payload["ingested_files"] == 0
    assert payload["skipped_files"] == 1
    assert payload["entries_stored"] == 0

    # Only the entry from the first run should exist.
    rows = client.get("/api/corpus/entries", params={"limit": 10}).json()["entries"]
    assert len(rows) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_ingest_skips_non_jsonl(client, tmp_path, monkeypatch):
    """Only the .jsonl file is ingested; other files in the directory are ignored."""
    log_root = tmp_path / "pipeline_logs"
    log_root.mkdir()
    # Two decoys with other extensions, then the single real log file.
    txt_decoy = log_root / "notes.txt"
    txt_decoy.write_text("this is not a log file")
    csv_decoy = log_root / "run.csv"
    csv_decoy.write_text("a,b,c\n1,2,3")
    _make_pipeline_file(log_root, "valid_20260517.jsonl", [_PIPELINE_LINE])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    yaml_text = f"corpus:\n pipeline_ingest_dir: \"{log_root}\"\n sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(yaml_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    payload = client.post("/api/corpus/pipeline-ingest").json()
    assert payload["ingested_files"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_ingest_skips_malformed_lines(client, tmp_path, monkeypatch):
    """A non-JSON line is dropped while the valid lines around it are still stored."""
    log_root = tmp_path / "pipeline_logs"
    log_root.mkdir()

    # valid record, garbage line, valid record — no trailing newline.
    pieces = [
        json.dumps(_PIPELINE_LINE),
        "\n",
        "this is not json\n",
        json.dumps({**_PIPELINE_LINE, "msg": "another valid line"}),
    ]
    mixed_file = log_root / "mixed_20260517.jsonl"
    mixed_file.write_text("".join(pieces), encoding="utf-8")

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    yaml_text = f"corpus:\n pipeline_ingest_dir: \"{log_root}\"\n sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(yaml_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    response = client.post("/api/corpus/pipeline-ingest")
    assert response.status_code == 200
    # Two valid lines stored, the malformed one skipped.
    assert response.json()["entries_stored"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_pipeline_ingest_new_file_after_first_run(client, tmp_path, monkeypatch):
    """A file that appears after the first run is ingested by the next call; the old one is skipped."""
    log_root = tmp_path / "pipeline_logs"
    log_root.mkdir()
    _make_pipeline_file(log_root, "run_a.jsonl", [_PIPELINE_LINE])

    cfg_root = tmp_path / "config"
    cfg_root.mkdir(exist_ok=True)
    yaml_text = f"corpus:\n pipeline_ingest_dir: \"{log_root}\"\n sources: []\n"
    (cfg_root / "label_tool.yaml").write_text(yaml_text)
    monkeypatch.setattr(lc, "_CONFIG_DIR", cfg_root)

    endpoint = "/api/corpus/pipeline-ingest"
    # First run picks up run_a.jsonl.
    client.post(endpoint)

    later_record = {**_PIPELINE_LINE, "msg": "Second run line"}
    _make_pipeline_file(log_root, "run_b.jsonl", [later_record])

    payload = client.post(endpoint).json()
    assert payload["ingested_files"] == 1
    assert payload["skipped_files"] == 1
    assert payload["entries_stored"] == 1
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue