fix: use resume_parser extractors in import endpoint to clean CID glyphs

The import endpoint was doing its own inline PDF/DOCX/ODT extraction
without calling _clean_cid(). Bullet CIDs (127, 149, 183) and other
ATS-reembedded font artifacts were stored raw, surfacing as (cid:127)
in the resume library. Switch to the extract_text_from_pdf/docx/odt helpers in
resume_parser.py, which already handle two-column layouts and CID cleaning.
This commit is contained in:
pyr0ball 2026-04-15 12:23:12 -07:00
parent ec521e14c5
commit fb8b464dd0

View file

@@ -773,32 +773,17 @@ async def import_resume_endpoint(file: UploadFile, name: str = ""):
text = content.decode("utf-8", errors="replace") text = content.decode("utf-8", errors="replace")
elif ext in (".pdf", ".docx", ".odt"): elif ext in (".pdf", ".docx", ".odt"):
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: from scripts.resume_parser import (
tmp.write(content) extract_text_from_pdf as _extract_pdf,
tmp_path = tmp.name extract_text_from_docx as _extract_docx,
try: extract_text_from_odt as _extract_odt,
if ext == ".pdf": )
import pdfplumber if ext == ".pdf":
with pdfplumber.open(tmp_path) as pdf: text = _extract_pdf(content)
text = "\n".join(p.extract_text() or "" for p in pdf.pages) elif ext == ".docx":
elif ext == ".docx": text = _extract_docx(content)
from docx import Document else:
doc = Document(tmp_path) text = _extract_odt(content)
text = "\n".join(p.text for p in doc.paragraphs)
else:
import zipfile
from xml.etree import ElementTree as ET
with zipfile.ZipFile(tmp_path) as z:
xml = z.read("content.xml")
ET_root = ET.fromstring(xml)
text = "\n".join(
el.text or ""
for el in ET_root.iter(
"{urn:oasis:names:tc:opendocument:xmlns:text:1.0}p"
)
)
finally:
os.unlink(tmp_path)
elif ext in (".yaml", ".yml"): elif ext in (".yaml", ".yml"):
import yaml as _yaml import yaml as _yaml