From fb8b464dd0199e1b8a0c7e60cdef888861ce9da9 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Wed, 15 Apr 2026 12:23:12 -0700 Subject: [PATCH] fix: use resume_parser extractors in import endpoint to clean CID glyphs The import endpoint was doing its own inline PDF/DOCX/ODT extraction without calling _clean_cid(). Bullet CIDs (127, 149, 183) and other ATS-reembedded font artifacts were stored raw, surfacing as (cid:127) in the resume library. Switch to extract_text_from_pdf/docx/odt from resume_parser.py which already handle two-column layouts and CID cleaning. --- dev-api.py | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/dev-api.py b/dev-api.py index b1ac500..a143af0 100644 --- a/dev-api.py +++ b/dev-api.py @@ -773,32 +773,17 @@ async def import_resume_endpoint(file: UploadFile, name: str = ""): text = content.decode("utf-8", errors="replace") elif ext in (".pdf", ".docx", ".odt"): - with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: - tmp.write(content) - tmp_path = tmp.name - try: - if ext == ".pdf": - import pdfplumber - with pdfplumber.open(tmp_path) as pdf: - text = "\n".join(p.extract_text() or "" for p in pdf.pages) - elif ext == ".docx": - from docx import Document - doc = Document(tmp_path) - text = "\n".join(p.text for p in doc.paragraphs) - else: - import zipfile - from xml.etree import ElementTree as ET - with zipfile.ZipFile(tmp_path) as z: - xml = z.read("content.xml") - ET_root = ET.fromstring(xml) - text = "\n".join( - el.text or "" - for el in ET_root.iter( - "{urn:oasis:names:tc:opendocument:xmlns:text:1.0}p" - ) - ) - finally: - os.unlink(tmp_path) + from scripts.resume_parser import ( + extract_text_from_pdf as _extract_pdf, + extract_text_from_docx as _extract_docx, + extract_text_from_odt as _extract_odt, + ) + if ext == ".pdf": + text = _extract_pdf(content) + elif ext == ".docx": + text = _extract_docx(content) + else: + text = _extract_odt(content) elif ext in (".yaml", ".yml"): import yaml as _yaml