feat: wire fine-tune UI end-to-end + harden setup.sh

- setup.sh: replace docker-image-based NVIDIA test with nvidia-ctk validate
  (faster, no 100MB pull, no daemon required); add check_docker_running()
  to auto-start the Docker service on Linux or warn on macOS
- prepare_training_data.py: also scan training_data/uploads/*.{md,txt}
  so web-uploaded letters are included in training data
- task_runner.py: add prepare_training task type (calls build_records +
  write_jsonl inline; reports pair count in task result)
- Settings fine-tune tab: Step 1 accepts .md/.txt uploads; Step 2 Extract
  button submits prepare_training background task + shows status; Step 3
  shows make finetune command + live Ollama model status poller
This commit is contained in:
pyr0ball 2026-02-25 16:31:53 -08:00
parent 4d66c04d1e
commit dc4a08c063
4 changed files with 118 additions and 15 deletions

View file

@@ -1026,9 +1026,10 @@ with tab_finetune:
if ft_step == 1: if ft_step == 1:
st.markdown("**Step 1: Upload Cover Letters**") st.markdown("**Step 1: Upload Cover Letters**")
st.caption("Accepted formats: `.md` or `.txt`. Convert PDFs to text before uploading.")
uploaded = st.file_uploader( uploaded = st.file_uploader(
"Upload cover letters (PDF, DOCX, or TXT)", "Upload cover letters (.md or .txt)",
type=["pdf", "docx", "txt"], type=["md", "txt"],
accept_multiple_files=True, accept_multiple_files=True,
) )
if uploaded and st.button("Extract Training Pairs →", type="primary", key="ft_extract"): if uploaded and st.button("Extract Training Pairs →", type="primary", key="ft_extract"):
@@ -1040,18 +1041,45 @@ with tab_finetune:
st.rerun() st.rerun()
elif ft_step == 2: elif ft_step == 2:
st.markdown("**Step 2: Preview Training Pairs**") st.markdown("**Step 2: Extract Training Pairs**")
st.info("Run `python scripts/prepare_training_data.py` to extract pairs, then return here.")
jsonl_path = _profile.docs_dir / "training_data" / "cover_letters.jsonl"
if jsonl_path.exists():
import json as _json import json as _json
import sqlite3 as _sqlite3
from scripts.db import DEFAULT_DB as _FT_DB
jsonl_path = _profile.docs_dir / "training_data" / "cover_letters.jsonl"
# Show task status
_ft_conn = _sqlite3.connect(_FT_DB)
_ft_conn.row_factory = _sqlite3.Row
_ft_task = _ft_conn.execute(
"SELECT * FROM background_tasks WHERE task_type='prepare_training' ORDER BY id DESC LIMIT 1"
).fetchone()
_ft_conn.close()
if _ft_task:
_ft_status = _ft_task["status"]
if _ft_status == "completed":
st.success(f"{_ft_task['error'] or 'Extraction complete'}")
elif _ft_status in ("running", "queued"):
st.info(f"{_ft_status.capitalize()}… refresh to check progress.")
elif _ft_status == "failed":
st.error(f"Extraction failed: {_ft_task['error']}")
if st.button("⚙️ Extract Training Pairs", type="primary", key="ft_extract2"):
from scripts.task_runner import submit_task as _ft_submit
_ft_submit(_FT_DB, "prepare_training", 0)
st.info("Extracting in the background — refresh in a moment.")
st.rerun()
if jsonl_path.exists():
pairs = [_json.loads(l) for l in jsonl_path.read_text().splitlines() if l.strip()] pairs = [_json.loads(l) for l in jsonl_path.read_text().splitlines() if l.strip()]
st.caption(f"{len(pairs)} training pairs extracted.") st.caption(f"{len(pairs)} training pairs ready.")
for i, p in enumerate(pairs[:3]): for i, p in enumerate(pairs[:3]):
with st.expander(f"Pair {i+1}"): with st.expander(f"Pair {i+1}"):
st.text(p.get("input", "")[:300]) st.text(p.get("output", p.get("input", ""))[:300])
else: else:
st.warning("No training pairs found. Run `prepare_training_data.py` first.") st.caption("No training pairs yet — click Extract above.")
col_back, col_next = st.columns([1, 4]) col_back, col_next = st.columns([1, 4])
if col_back.button("← Back", key="ft_back2"): if col_back.button("← Back", key="ft_back2"):
st.session_state.ft_step = 1 st.session_state.ft_step = 1
@@ -1061,13 +1089,45 @@ with tab_finetune:
st.rerun() st.rerun()
elif ft_step == 3: elif ft_step == 3:
st.markdown("**Step 3: Train**") st.markdown("**Step 3: Fine-Tune**")
st.slider("Epochs", 3, 20, 10, key="ft_epochs")
if st.button("🚀 Start Fine-Tune", type="primary", key="ft_start"): _ft_profile_name = ((_profile.name.split() or ["cover"])[0].lower()
st.info("Fine-tune queued as a background task. Check back in 30–60 minutes.") if _profile else "cover")
if st.button("← Back", key="ft_back3"): _ft_model_name = f"{_ft_profile_name}-cover-writer"
st.info(
"Run the command below from your terminal. Training takes 30–90 min on GPU "
"and registers the model automatically when complete."
)
st.code("make finetune PROFILE=single-gpu", language="bash")
st.caption(
f"Your model will appear as **{_ft_model_name}:latest** in Ollama. "
"Cover letter generation will use it automatically."
)
st.markdown("**Model status:**")
try:
import os as _os
import requests as _ft_req
_ollama_url = _os.environ.get("OLLAMA_URL", "http://localhost:11434")
_tags = _ft_req.get(f"{_ollama_url}/api/tags", timeout=3)
if _tags.status_code == 200:
_model_names = [m["name"] for m in _tags.json().get("models", [])]
if any(_ft_model_name in m for m in _model_names):
st.success(f"✅ `{_ft_model_name}:latest` is ready in Ollama!")
else:
st.warning(f"⏳ `{_ft_model_name}:latest` not registered yet.")
else:
st.caption("Ollama returned an unexpected response.")
except Exception:
st.caption("Could not reach Ollama — ensure services are running with `make start`.")
col_back, col_refresh = st.columns([1, 3])
if col_back.button("← Back", key="ft_back3"):
st.session_state.ft_step = 2 st.session_state.ft_step = 2
st.rerun() st.rerun()
if col_refresh.button("🔄 Check model status", key="ft_refresh3"):
st.rerun()
# ── Developer tab ───────────────────────────────────────────────────────────── # ── Developer tab ─────────────────────────────────────────────────────────────
if _show_dev_tab: if _show_dev_tab:

View file

@@ -81,6 +81,16 @@ def build_records(letters_dir: Path = LETTERS_DIR) -> list[dict]:
if p not in seen: if p not in seen:
seen.add(p) seen.add(p)
all_paths.append(p) all_paths.append(p)
# Also scan web-uploaded files (Settings → Fine-tune → Upload)
uploads_dir = letters_dir / "training_data" / "uploads"
if uploads_dir.exists():
for glob in ("*.md", "*.txt"):
for p in uploads_dir.glob(glob):
if p not in seen:
seen.add(p)
all_paths.append(p)
for path in sorted(all_paths): for path in sorted(all_paths):
text = path.read_text(encoding="utf-8", errors="ignore").strip() text = path.read_text(encoding="utf-8", errors="ignore").strip()
if not text or len(text) < 100: if not text or len(text) < 100:

View file

@@ -243,6 +243,17 @@ def _run_task(db_path: Path, task_id: int, task_type: str, job_id: int,
) )
return return
elif task_type == "prepare_training":
from scripts.prepare_training_data import build_records, write_jsonl, DEFAULT_OUTPUT
records = build_records()
write_jsonl(records, DEFAULT_OUTPUT)
n = len(records)
update_task_status(
db_path, task_id, "completed",
error=f"{n} training pair{'s' if n != 1 else ''} extracted",
)
return
else: else:
raise ValueError(f"Unknown task_type: {task_type!r}") raise ValueError(f"Unknown task_type: {task_type!r}")

View file

@@ -168,6 +168,27 @@ check_compose() {
fi fi
} }
# ── Docker daemon health check ──────────────────────────────────────────────────
# Verify the Docker daemon is reachable; on Linux try to start it via
# systemd, on macOS tell the user to launch Docker Desktop manually.
# Relies on helpers (success/warn/info) and globals ($OS, $SUDO) defined
# earlier in this script. Never exits non-zero: setup continues either way.
check_docker_running() {
    # `&>/dev/null` already discards both stdout and stderr in bash;
    # the previous extra `2>&1` was redundant.
    if docker info &>/dev/null; then
        success "Docker daemon is running."
        return
    fi
    warn "Docker daemon is not responding."
    if [[ "$OS" == "Linux" ]] && command -v systemctl &>/dev/null; then
        info "Starting Docker service…"
        # Best-effort start: suppress errors here and re-probe below.
        $SUDO systemctl start docker 2>/dev/null || true
        sleep 2
        if docker info &>/dev/null; then
            success "Docker daemon started."
        else
            warn "Docker failed to start. Run: sudo systemctl start docker"
        fi
    elif [[ "$OS" == "Darwin" ]]; then
        warn "Docker Desktop is not running. Start it, wait for the whale icon, then run 'make start'."
    fi
}
# ── NVIDIA Container Toolkit ─────────────────────────────────────────────────── # ── NVIDIA Container Toolkit ───────────────────────────────────────────────────
install_nvidia_toolkit() { install_nvidia_toolkit() {
[[ "$OS" != "Linux" ]] && return # macOS has no NVIDIA support [[ "$OS" != "Linux" ]] && return # macOS has no NVIDIA support
@@ -175,8 +196,8 @@ install_nvidia_toolkit() {
info "No NVIDIA GPU detected — skipping Container Toolkit." info "No NVIDIA GPU detected — skipping Container Toolkit."
return return
fi fi
if docker run --rm --gpus all nvidia/cuda:12.0-base-ubuntu22.04 nvidia-smi &>/dev/null 2>&1; then if cmd_exists nvidia-ctk && nvidia-ctk runtime validate --runtime=docker &>/dev/null 2>&1; then
success "NVIDIA Container Toolkit already working." success "NVIDIA Container Toolkit already configured."
return return
fi fi
info "NVIDIA GPU detected. Installing Container Toolkit…" info "NVIDIA GPU detected. Installing Container Toolkit…"
@@ -283,6 +304,7 @@ main() {
# Podman takes precedence if already installed; otherwise install Docker # Podman takes precedence if already installed; otherwise install Docker
if ! check_podman; then if ! check_podman; then
install_docker install_docker
check_docker_running
check_compose check_compose
install_nvidia_toolkit install_nvidia_toolkit
fi fi