From 60fe1231ceaa99bb20b7b9cb03b7adc4395a9621 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Sun, 15 Mar 2026 16:07:27 -0700 Subject: [PATCH] fix(avocet): _MODELS_DIR overridable in tests; sanitize score paths against path traversal --- app/api.py | 19 +++++++++++++++---- tests/test_api.py | 15 ++++++++------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/app/api.py b/app/api.py index d505bea..a437f2f 100644 --- a/app/api.py +++ b/app/api.py @@ -16,8 +16,9 @@ from fastapi import FastAPI, HTTPException, Query from pydantic import BaseModel _ROOT = Path(__file__).parent.parent -_DATA_DIR: Path = _ROOT / "data" # overridable in tests via set_data_dir() -_CONFIG_DIR: Path | None = None # None = use real path +_DATA_DIR: Path = _ROOT / "data" # overridable in tests via set_data_dir() +_MODELS_DIR: Path = _ROOT / "models" # overridable in tests via set_models_dir() +_CONFIG_DIR: Path | None = None # None = use real path def set_data_dir(path: Path) -> None: @@ -26,6 +27,12 @@ def set_data_dir(path: Path) -> None: _DATA_DIR = path +def set_models_dir(path: Path) -> None: + """Override models directory — used by tests.""" + global _MODELS_DIR + _MODELS_DIR = path + + def set_config_dir(path: Path | None) -> None: """Override config directory — used by tests.""" global _CONFIG_DIR @@ -347,7 +354,7 @@ def run_benchmark(include_slow: bool = False): @app.get("/api/finetune/status") def get_finetune_status(): """Scan models/ for training_info.json files. 
Returns [] if none exist.""" - models_dir = _ROOT / "models" + models_dir = _MODELS_DIR if not models_dir.exists(): return [] results = [] @@ -377,8 +384,12 @@ def run_finetune_endpoint( python_bin = "/devl/miniconda3/envs/job-seeker-classifiers/bin/python" script = str(_ROOT / "scripts" / "finetune_classifier.py") cmd = [python_bin, script, "--model", model, "--epochs", str(epochs)] + data_root = _DATA_DIR.resolve() for score_file in score: - cmd.extend(["--score", score_file]) + resolved = (_DATA_DIR / score_file).resolve() + if not resolved.is_relative_to(data_root): + raise HTTPException(400, f"Invalid score path: {score_file!r}") + cmd.extend(["--score", str(resolved)]) def generate(): try: diff --git a/tests/test_api.py b/tests/test_api.py index acbc630..4f960b5 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -336,13 +336,13 @@ def test_finetune_status_returns_empty_when_no_models_dir(client): assert r.json() == [] -def test_finetune_status_returns_training_info(client): +def test_finetune_status_returns_training_info(client, tmp_path): """GET /api/finetune/status must return one entry per training_info.json found.""" import json as _json from app import api as api_module - models_dir = api_module._ROOT / "models" / "avocet-deberta-small-test" - models_dir.mkdir(parents=True, exist_ok=True) + models_dir = tmp_path / "models" / "avocet-deberta-small" + models_dir.mkdir(parents=True) info = { "name": "avocet-deberta-small", "base_model_id": "cross-encoder/nli-deberta-v3-small", @@ -352,14 +352,14 @@ } (models_dir / "training_info.json").write_text(_json.dumps(info)) + api_module.set_models_dir(tmp_path / "models") try: r = client.get("/api/finetune/status") assert r.status_code == 200 data = r.json() assert any(d["name"] == "avocet-deberta-small" for d in data) finally: - import shutil - shutil.rmtree(models_dir) + api_module.set_models_dir(api_module._ROOT / "models") def 
test_finetune_run_streams_sse_events(client): @@ -427,5 +427,6 @@ def test_finetune_run_passes_score_files_to_subprocess(client): assert "--score" in captured_cmd assert captured_cmd.count("--score") == 2 - assert "run1.jsonl" in captured_cmd - assert "run2.jsonl" in captured_cmd + # Paths are resolved to absolute — check filenames are present as substrings + assert any("run1.jsonl" in arg for arg in captured_cmd) + assert any("run2.jsonl" in arg for arg in captured_cmd)