feat: wire fine-tune UI end-to-end + harden setup.sh

- setup.sh: replace docker-image-based NVIDIA test with nvidia-ctk validate
  (faster, no 100MB pull, no daemon required); add check_docker_running()
  to auto-start the Docker service on Linux or warn on macOS
- prepare_training_data.py: also scan training_data/uploads/*.{md,txt}
  so web-uploaded letters are included in training data
- task_runner.py: add prepare_training task type (calls build_records +
  write_jsonl inline; reports pair count in task result)
- Settings fine-tune tab: Step 1 accepts .md/.txt uploads; Step 2 Extract
  button submits prepare_training background task + shows status; Step 3
  shows make finetune command + live Ollama model status poller
This commit is contained in:
pyr0ball 2026-02-25 16:31:53 -08:00
parent 4d66c04d1e
commit dc4a08c063
4 changed files with 118 additions and 15 deletions

View file

@@ -1026,9 +1026,10 @@ with tab_finetune:
if ft_step == 1: if ft_step == 1:
st.markdown("**Step 1: Upload Cover Letters**") st.markdown("**Step 1: Upload Cover Letters**")
st.caption("Accepted formats: `.md` or `.txt`. Convert PDFs to text before uploading.")
uploaded = st.file_uploader( uploaded = st.file_uploader(
"Upload cover letters (PDF, DOCX, or TXT)", "Upload cover letters (.md or .txt)",
type=["pdf", "docx", "txt"], type=["md", "txt"],
accept_multiple_files=True, accept_multiple_files=True,
) )
if uploaded and st.button("Extract Training Pairs →", type="primary", key="ft_extract"): if uploaded and st.button("Extract Training Pairs →", type="primary", key="ft_extract"):
@@ -1040,18 +1041,45 @@ with tab_finetune:
st.rerun() st.rerun()
elif ft_step == 2: elif ft_step == 2:
st.markdown("**Step 2: Preview Training Pairs**") st.markdown("**Step 2: Extract Training Pairs**")
st.info("Run `python scripts/prepare_training_data.py` to extract pairs, then return here.")
jsonl_path = _profile.docs_dir / "training_data" / "cover_letters.jsonl"
if jsonl_path.exists():
import json as _json import json as _json
import sqlite3 as _sqlite3
from scripts.db import DEFAULT_DB as _FT_DB
jsonl_path = _profile.docs_dir / "training_data" / "cover_letters.jsonl"
# Show task status
_ft_conn = _sqlite3.connect(_FT_DB)
_ft_conn.row_factory = _sqlite3.Row
_ft_task = _ft_conn.execute(
"SELECT * FROM background_tasks WHERE task_type='prepare_training' ORDER BY id DESC LIMIT 1"
).fetchone()
_ft_conn.close()
if _ft_task:
_ft_status = _ft_task["status"]
if _ft_status == "completed":
st.success(f"{_ft_task['error'] or 'Extraction complete'}")
elif _ft_status in ("running", "queued"):
st.info(f"{_ft_status.capitalize()}… refresh to check progress.")
elif _ft_status == "failed":
st.error(f"Extraction failed: {_ft_task['error']}")
if st.button("⚙️ Extract Training Pairs", type="primary", key="ft_extract2"):
from scripts.task_runner import submit_task as _ft_submit
_ft_submit(_FT_DB, "prepare_training", 0)
st.info("Extracting in the background — refresh in a moment.")
st.rerun()
if jsonl_path.exists():
pairs = [_json.loads(l) for l in jsonl_path.read_text().splitlines() if l.strip()] pairs = [_json.loads(l) for l in jsonl_path.read_text().splitlines() if l.strip()]
st.caption(f"{len(pairs)} training pairs extracted.") st.caption(f"{len(pairs)} training pairs ready.")
for i, p in enumerate(pairs[:3]): for i, p in enumerate(pairs[:3]):
with st.expander(f"Pair {i+1}"): with st.expander(f"Pair {i+1}"):
st.text(p.get("input", "")[:300]) st.text(p.get("output", p.get("input", ""))[:300])
else: else:
st.warning("No training pairs found. Run `prepare_training_data.py` first.") st.caption("No training pairs yet — click Extract above.")
col_back, col_next = st.columns([1, 4]) col_back, col_next = st.columns([1, 4])
if col_back.button("← Back", key="ft_back2"): if col_back.button("← Back", key="ft_back2"):
st.session_state.ft_step = 1 st.session_state.ft_step = 1
@@ -1061,13 +1089,45 @@ with tab_finetune:
st.rerun() st.rerun()
elif ft_step == 3: elif ft_step == 3:
st.markdown("**Step 3: Train**") st.markdown("**Step 3: Fine-Tune**")
st.slider("Epochs", 3, 20, 10, key="ft_epochs")
if st.button("🚀 Start Fine-Tune", type="primary", key="ft_start"): _ft_profile_name = ((_profile.name.split() or ["cover"])[0].lower()
st.info("Fine-tune queued as a background task. Check back in 30–60 minutes.") if _profile else "cover")
if st.button("← Back", key="ft_back3"): _ft_model_name = f"{_ft_profile_name}-cover-writer"
st.info(
"Run the command below from your terminal. Training takes 30–90 min on GPU "
"and registers the model automatically when complete."
)
st.code("make finetune PROFILE=single-gpu", language="bash")
st.caption(
f"Your model will appear as **{_ft_model_name}:latest** in Ollama. "
"Cover letter generation will use it automatically."
)
st.markdown("**Model status:**")
try:
import os as _os
import requests as _ft_req
_ollama_url = _os.environ.get("OLLAMA_URL", "http://localhost:11434")
_tags = _ft_req.get(f"{_ollama_url}/api/tags", timeout=3)
if _tags.status_code == 200:
_model_names = [m["name"] for m in _tags.json().get("models", [])]
if any(_ft_model_name in m for m in _model_names):
st.success(f"✅ `{_ft_model_name}:latest` is ready in Ollama!")
else:
st.warning(f"⏳ `{_ft_model_name}:latest` not registered yet.")
else:
st.caption("Ollama returned an unexpected response.")
except Exception:
st.caption("Could not reach Ollama — ensure services are running with `make start`.")
col_back, col_refresh = st.columns([1, 3])
if col_back.button("← Back", key="ft_back3"):
st.session_state.ft_step = 2 st.session_state.ft_step = 2
st.rerun() st.rerun()
if col_refresh.button("🔄 Check model status", key="ft_refresh3"):
st.rerun()
# ── Developer tab ───────────────────────────────────────────────────────────── # ── Developer tab ─────────────────────────────────────────────────────────────
if _show_dev_tab: if _show_dev_tab:

View file

@@ -81,6 +81,16 @@ def build_records(letters_dir: Path = LETTERS_DIR) -> list[dict]:
if p not in seen: if p not in seen:
seen.add(p) seen.add(p)
all_paths.append(p) all_paths.append(p)
# Also scan web-uploaded files (Settings → Fine-tune → Upload)
uploads_dir = letters_dir / "training_data" / "uploads"
if uploads_dir.exists():
for glob in ("*.md", "*.txt"):
for p in uploads_dir.glob(glob):
if p not in seen:
seen.add(p)
all_paths.append(p)
for path in sorted(all_paths): for path in sorted(all_paths):
text = path.read_text(encoding="utf-8", errors="ignore").strip() text = path.read_text(encoding="utf-8", errors="ignore").strip()
if not text or len(text) < 100: if not text or len(text) < 100:

View file

@@ -243,6 +243,17 @@ def _run_task(db_path: Path, task_id: int, task_type: str, job_id: int,
) )
return return
elif task_type == "prepare_training":
from scripts.prepare_training_data import build_records, write_jsonl, DEFAULT_OUTPUT
records = build_records()
write_jsonl(records, DEFAULT_OUTPUT)
n = len(records)
update_task_status(
db_path, task_id, "completed",
error=f"{n} training pair{'s' if n != 1 else ''} extracted",
)
return
else: else:
raise ValueError(f"Unknown task_type: {task_type!r}") raise ValueError(f"Unknown task_type: {task_type!r}")

View file

@@ -168,6 +168,27 @@ check_compose() {
fi fi
} }
# ── Docker daemon health check ──────────────────────────────────────────────────
# Verify the Docker daemon is reachable; on Linux try to start it via
# systemd, on macOS tell the user to launch Docker Desktop manually.
# Relies on helpers (success/warn/info) and globals ($OS, $SUDO) defined
# earlier in this script. Never exits non-zero: setup continues either way.
check_docker_running() {
    # `&>/dev/null` already discards both stdout and stderr in bash;
    # the previous extra `2>&1` was redundant.
    if docker info &>/dev/null; then
        success "Docker daemon is running."
        return
    fi
    warn "Docker daemon is not responding."
    if [[ "$OS" == "Linux" ]] && command -v systemctl &>/dev/null; then
        info "Starting Docker service…"
        # Best-effort start: suppress errors here and re-probe below.
        $SUDO systemctl start docker 2>/dev/null || true
        sleep 2
        if docker info &>/dev/null; then
            success "Docker daemon started."
        else
            warn "Docker failed to start. Run: sudo systemctl start docker"
        fi
    elif [[ "$OS" == "Darwin" ]]; then
        warn "Docker Desktop is not running. Start it, wait for the whale icon, then run 'make start'."
    fi
}
# ── NVIDIA Container Toolkit ─────────────────────────────────────────────────── # ── NVIDIA Container Toolkit ───────────────────────────────────────────────────
install_nvidia_toolkit() { install_nvidia_toolkit() {
[[ "$OS" != "Linux" ]] && return # macOS has no NVIDIA support [[ "$OS" != "Linux" ]] && return # macOS has no NVIDIA support
@@ -175,8 +196,8 @@ install_nvidia_toolkit() {
info "No NVIDIA GPU detected — skipping Container Toolkit." info "No NVIDIA GPU detected — skipping Container Toolkit."
return return
fi fi
if docker run --rm --gpus all nvidia/cuda:12.0-base-ubuntu22.04 nvidia-smi &>/dev/null 2>&1; then if cmd_exists nvidia-ctk && nvidia-ctk runtime validate --runtime=docker &>/dev/null 2>&1; then
success "NVIDIA Container Toolkit already working." success "NVIDIA Container Toolkit already configured."
return return
fi fi
info "NVIDIA GPU detected. Installing Container Toolkit…" info "NVIDIA GPU detected. Installing Container Toolkit…"
@@ -283,6 +304,7 @@ main() {
# Podman takes precedence if already installed; otherwise install Docker # Podman takes precedence if already installed; otherwise install Docker
if ! check_podman; then if ! check_podman; then
install_docker install_docker
check_docker_running
check_compose check_compose
install_nvidia_toolkit install_nvidia_toolkit
fi fi