fix: use resume_parser extractors in import endpoint to clean CID glyphs
The import endpoint was doing its own inline PDF/DOCX/ODT extraction without calling _clean_cid(). As a result, bullet CIDs (127, 149, 183) and other ATS-reembedded font artifacts were stored raw and surfaced as (cid:127) in the resume library. Switch to extract_text_from_pdf, extract_text_from_docx, and extract_text_from_odt from resume_parser.py, which already handle two-column layouts and CID cleaning.
This commit is contained in:
parent
ec521e14c5
commit
fb8b464dd0
1 changed file with 11 additions and 26 deletions
37
dev-api.py
37
dev-api.py
|
|
@ -773,32 +773,17 @@ async def import_resume_endpoint(file: UploadFile, name: str = ""):
|
||||||
text = content.decode("utf-8", errors="replace")
|
text = content.decode("utf-8", errors="replace")
|
||||||
|
|
||||||
elif ext in (".pdf", ".docx", ".odt"):
|
elif ext in (".pdf", ".docx", ".odt"):
|
||||||
with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
|
from scripts.resume_parser import (
|
||||||
tmp.write(content)
|
extract_text_from_pdf as _extract_pdf,
|
||||||
tmp_path = tmp.name
|
extract_text_from_docx as _extract_docx,
|
||||||
try:
|
extract_text_from_odt as _extract_odt,
|
||||||
if ext == ".pdf":
|
)
|
||||||
import pdfplumber
|
if ext == ".pdf":
|
||||||
with pdfplumber.open(tmp_path) as pdf:
|
text = _extract_pdf(content)
|
||||||
text = "\n".join(p.extract_text() or "" for p in pdf.pages)
|
elif ext == ".docx":
|
||||||
elif ext == ".docx":
|
text = _extract_docx(content)
|
||||||
from docx import Document
|
else:
|
||||||
doc = Document(tmp_path)
|
text = _extract_odt(content)
|
||||||
text = "\n".join(p.text for p in doc.paragraphs)
|
|
||||||
else:
|
|
||||||
import zipfile
|
|
||||||
from xml.etree import ElementTree as ET
|
|
||||||
with zipfile.ZipFile(tmp_path) as z:
|
|
||||||
xml = z.read("content.xml")
|
|
||||||
ET_root = ET.fromstring(xml)
|
|
||||||
text = "\n".join(
|
|
||||||
el.text or ""
|
|
||||||
for el in ET_root.iter(
|
|
||||||
"{urn:oasis:names:tc:opendocument:xmlns:text:1.0}p"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
finally:
|
|
||||||
os.unlink(tmp_path)
|
|
||||||
|
|
||||||
elif ext in (".yaml", ".yml"):
|
elif ext in (".yaml", ".yml"):
|
||||||
import yaml as _yaml
|
import yaml as _yaml
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue