feat: wire fine-tune UI end-to-end + harden setup.sh

- setup.sh: replace docker-image-based NVIDIA test with nvidia-ctk validate
  (faster, no 100MB pull, no daemon required); add check_docker_running()
  to auto-start the Docker service on Linux or warn on macOS
- prepare_training_data.py: also scan training_data/uploads/*.{md,txt}
  so web-uploaded letters are included in training data
- task_runner.py: add prepare_training task type (calls build_records +
  write_jsonl inline; reports pair count in task result)
- Settings fine-tune tab: Step 1 accepts .md/.txt uploads; Step 2 Extract
  button submits prepare_training background task + shows status; Step 3
  shows make finetune command + live Ollama model status poller
This commit is contained in:
pyr0ball 2026-02-25 16:31:53 -08:00
parent 740b0ea45a
commit bcde4c960e
4 changed files with 118 additions and 15 deletions

View file

@ -1026,9 +1026,10 @@ with tab_finetune:
if ft_step == 1:
st.markdown("**Step 1: Upload Cover Letters**")
st.caption("Accepted formats: `.md` or `.txt`. Convert PDFs to text before uploading.")
uploaded = st.file_uploader(
"Upload cover letters (PDF, DOCX, or TXT)",
type=["pdf", "docx", "txt"],
"Upload cover letters (.md or .txt)",
type=["md", "txt"],
accept_multiple_files=True,
)
if uploaded and st.button("Extract Training Pairs →", type="primary", key="ft_extract"):
@ -1040,18 +1041,45 @@ with tab_finetune:
st.rerun()
elif ft_step == 2:
st.markdown("**Step 2: Preview Training Pairs**")
st.info("Run `python scripts/prepare_training_data.py` to extract pairs, then return here.")
st.markdown("**Step 2: Extract Training Pairs**")
import json as _json
import sqlite3 as _sqlite3
from scripts.db import DEFAULT_DB as _FT_DB
jsonl_path = _profile.docs_dir / "training_data" / "cover_letters.jsonl"
# Show task status
_ft_conn = _sqlite3.connect(_FT_DB)
_ft_conn.row_factory = _sqlite3.Row
_ft_task = _ft_conn.execute(
"SELECT * FROM background_tasks WHERE task_type='prepare_training' ORDER BY id DESC LIMIT 1"
).fetchone()
_ft_conn.close()
if _ft_task:
_ft_status = _ft_task["status"]
if _ft_status == "completed":
st.success(f"{_ft_task['error'] or 'Extraction complete'}")
elif _ft_status in ("running", "queued"):
st.info(f"{_ft_status.capitalize()}… refresh to check progress.")
elif _ft_status == "failed":
st.error(f"Extraction failed: {_ft_task['error']}")
if st.button("⚙️ Extract Training Pairs", type="primary", key="ft_extract2"):
from scripts.task_runner import submit_task as _ft_submit
_ft_submit(_FT_DB, "prepare_training", 0)
st.info("Extracting in the background — refresh in a moment.")
st.rerun()
if jsonl_path.exists():
import json as _json
pairs = [_json.loads(l) for l in jsonl_path.read_text().splitlines() if l.strip()]
st.caption(f"{len(pairs)} training pairs extracted.")
st.caption(f"{len(pairs)} training pairs ready.")
for i, p in enumerate(pairs[:3]):
with st.expander(f"Pair {i+1}"):
st.text(p.get("input", "")[:300])
st.text(p.get("output", p.get("input", ""))[:300])
else:
st.warning("No training pairs found. Run `prepare_training_data.py` first.")
st.caption("No training pairs yet — click Extract above.")
col_back, col_next = st.columns([1, 4])
if col_back.button("← Back", key="ft_back2"):
st.session_state.ft_step = 1
@ -1061,13 +1089,45 @@ with tab_finetune:
st.rerun()
elif ft_step == 3:
st.markdown("**Step 3: Train**")
st.slider("Epochs", 3, 20, 10, key="ft_epochs")
if st.button("🚀 Start Fine-Tune", type="primary", key="ft_start"):
st.info("Fine-tune queued as a background task. Check back in 30–60 minutes.")
if st.button("← Back", key="ft_back3"):
st.markdown("**Step 3: Fine-Tune**")
_ft_profile_name = ((_profile.name.split() or ["cover"])[0].lower()
if _profile else "cover")
_ft_model_name = f"{_ft_profile_name}-cover-writer"
st.info(
"Run the command below from your terminal. Training takes 30–90 min on GPU "
"and registers the model automatically when complete."
)
st.code("make finetune PROFILE=single-gpu", language="bash")
st.caption(
f"Your model will appear as **{_ft_model_name}:latest** in Ollama. "
"Cover letter generation will use it automatically."
)
st.markdown("**Model status:**")
try:
import os as _os
import requests as _ft_req
_ollama_url = _os.environ.get("OLLAMA_URL", "http://localhost:11434")
_tags = _ft_req.get(f"{_ollama_url}/api/tags", timeout=3)
if _tags.status_code == 200:
_model_names = [m["name"] for m in _tags.json().get("models", [])]
if any(_ft_model_name in m for m in _model_names):
st.success(f"✅ `{_ft_model_name}:latest` is ready in Ollama!")
else:
st.warning(f"⏳ `{_ft_model_name}:latest` not registered yet.")
else:
st.caption("Ollama returned an unexpected response.")
except Exception:
st.caption("Could not reach Ollama — ensure services are running with `make start`.")
col_back, col_refresh = st.columns([1, 3])
if col_back.button("← Back", key="ft_back3"):
st.session_state.ft_step = 2
st.rerun()
if col_refresh.button("🔄 Check model status", key="ft_refresh3"):
st.rerun()
# ── Developer tab ─────────────────────────────────────────────────────────────
if _show_dev_tab:

View file

@ -81,6 +81,16 @@ def build_records(letters_dir: Path = LETTERS_DIR) -> list[dict]:
if p not in seen:
seen.add(p)
all_paths.append(p)
# Also scan web-uploaded files (Settings → Fine-tune → Upload)
uploads_dir = letters_dir / "training_data" / "uploads"
if uploads_dir.exists():
for glob in ("*.md", "*.txt"):
for p in uploads_dir.glob(glob):
if p not in seen:
seen.add(p)
all_paths.append(p)
for path in sorted(all_paths):
text = path.read_text(encoding="utf-8", errors="ignore").strip()
if not text or len(text) < 100:

View file

@ -243,6 +243,17 @@ def _run_task(db_path: Path, task_id: int, task_type: str, job_id: int,
)
return
elif task_type == "prepare_training":
from scripts.prepare_training_data import build_records, write_jsonl, DEFAULT_OUTPUT
records = build_records()
write_jsonl(records, DEFAULT_OUTPUT)
n = len(records)
update_task_status(
db_path, task_id, "completed",
error=f"{n} training pair{'s' if n != 1 else ''} extracted",
)
return
else:
raise ValueError(f"Unknown task_type: {task_type!r}")

View file

@ -168,6 +168,27 @@ check_compose() {
fi
}
# ── Docker daemon health check ──────────────────────────────────────────────────
check_docker_running() {
    # Verify the Docker daemon is reachable; on Linux, attempt to start it
    # via systemd, on macOS just tell the user to launch Docker Desktop.
    # Relies on helpers/vars defined elsewhere in this script:
    # success/warn/info, $OS, $SUDO.
    #
    # NOTE: `&>/dev/null` already redirects BOTH stdout and stderr, so the
    # former trailing `2>&1` was redundant and has been dropped.
    if docker info &>/dev/null; then
        success "Docker daemon is running."
        return
    fi
    warn "Docker daemon is not responding."
    if [[ "$OS" == "Linux" ]] && command -v systemctl &>/dev/null; then
        info "Starting Docker service…"
        # Best-effort start: suppress errors and never abort the setup script.
        $SUDO systemctl start docker 2>/dev/null || true
        sleep 2  # give the daemon a moment to come up before re-checking
        if docker info &>/dev/null; then
            success "Docker daemon started."
        else
            warn "Docker failed to start. Run: sudo systemctl start docker"
        fi
    elif [[ "$OS" == "Darwin" ]]; then
        warn "Docker Desktop is not running. Start it, wait for the whale icon, then run 'make start'."
    fi
}
# ── NVIDIA Container Toolkit ───────────────────────────────────────────────────
install_nvidia_toolkit() {
[[ "$OS" != "Linux" ]] && return # macOS has no NVIDIA support
@ -175,8 +196,8 @@ install_nvidia_toolkit() {
info "No NVIDIA GPU detected — skipping Container Toolkit."
return
fi
if docker run --rm --gpus all nvidia/cuda:12.0-base-ubuntu22.04 nvidia-smi &>/dev/null 2>&1; then
success "NVIDIA Container Toolkit already working."
if cmd_exists nvidia-ctk && nvidia-ctk runtime validate --runtime=docker &>/dev/null 2>&1; then
success "NVIDIA Container Toolkit already configured."
return
fi
info "NVIDIA GPU detected. Installing Container Toolkit…"
@ -283,6 +304,7 @@ main() {
# Podman takes precedence if already installed; otherwise install Docker
if ! check_podman; then
install_docker
check_docker_running
check_compose
install_nvidia_toolkit
fi