App: Peregrine Company: Circuit Forge LLC Source: github.com/pyr0ball/job-seeker (personal fork, not linked)
98 lines
2.7 KiB
Python
98 lines
2.7 KiB
Python
"""
|
|
Vision service — moondream2 inference for survey screenshot analysis.
|
|
|
|
Start: bash scripts/manage-vision.sh start
|
|
Or directly: conda run -n job-seeker-vision uvicorn scripts.vision_service.main:app --port 8002
|
|
|
|
First run downloads moondream2 from HuggingFace (~1.8GB).
|
|
Model is loaded lazily on first /analyze request and stays resident.
|
|
GPU is used if available (CUDA); falls back to CPU.
|
|
4-bit quantization on GPU keeps VRAM footprint ~1.5GB.
|
|
"""
|
|
import base64
|
|
import io
|
|
|
|
from fastapi import FastAPI, HTTPException
|
|
from pydantic import BaseModel
|
|
|
|
app = FastAPI(title="Job Seeker Vision Service")
|
|
|
|
# Module-level model state — lazy loaded on first /analyze request
|
|
_model = None
|
|
_tokenizer = None
|
|
_device = "cpu"
|
|
_loading = False
|
|
|
|
|
|
def _load_model() -> None:
|
|
global _model, _tokenizer, _device, _loading
|
|
if _model is not None:
|
|
return
|
|
_loading = True
|
|
print("[vision] Loading moondream2…")
|
|
import torch
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
model_id = "vikhyatk/moondream2"
|
|
revision = "2025-01-09"
|
|
_device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
if _device == "cuda":
|
|
from transformers import BitsAndBytesConfig
|
|
bnb = BitsAndBytesConfig(load_in_4bit=True)
|
|
_model = AutoModelForCausalLM.from_pretrained(
|
|
model_id, revision=revision,
|
|
quantization_config=bnb,
|
|
trust_remote_code=True,
|
|
device_map="auto",
|
|
)
|
|
else:
|
|
_model = AutoModelForCausalLM.from_pretrained(
|
|
model_id, revision=revision,
|
|
trust_remote_code=True,
|
|
)
|
|
_model.to(_device)
|
|
|
|
_tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
|
|
_loading = False
|
|
print(f"[vision] moondream2 ready on {_device}")
|
|
|
|
|
|
class AnalyzeRequest(BaseModel):
|
|
prompt: str
|
|
image_base64: str
|
|
|
|
|
|
class AnalyzeResponse(BaseModel):
|
|
text: str
|
|
|
|
|
|
@app.get("/health")
|
|
def health():
|
|
import torch
|
|
return {
|
|
"status": "loading" if _loading else "ok",
|
|
"model": "moondream2",
|
|
"gpu": torch.cuda.is_available(),
|
|
"loaded": _model is not None,
|
|
}
|
|
|
|
|
|
@app.post("/analyze", response_model=AnalyzeResponse)
|
|
def analyze(req: AnalyzeRequest):
|
|
from PIL import Image
|
|
import torch
|
|
|
|
_load_model()
|
|
|
|
try:
|
|
image_data = base64.b64decode(req.image_base64)
|
|
image = Image.open(io.BytesIO(image_data)).convert("RGB")
|
|
except Exception as e:
|
|
raise HTTPException(status_code=400, detail=f"Invalid image: {e}")
|
|
|
|
with torch.no_grad():
|
|
enc_image = _model.encode_image(image)
|
|
answer = _model.answer_question(enc_image, req.prompt, _tokenizer)
|
|
|
|
return AnalyzeResponse(text=answer)
|