mirror of
https://github.com/rendercv/rendercv.git
synced 2026-04-18 05:52:54 -04:00
244 lines
7.8 KiB
Python
244 lines
7.8 KiB
Python
"""Analyze rendered PDFs: structural checks and text extraction.
|
|
|
|
Structural: pdftotext/pdfinfo via Poppler.
|
|
Extraction: pdftotext (Poppler) and PyMuPDF — two independent tools that
|
|
cover the standard extraction path used by most ATS platforms.
|
|
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
from collections.abc import Callable
|
|
from pathlib import Path
|
|
|
|
import fitz # PyMuPDF
|
|
from common import (
|
|
RENDERED_DIR,
|
|
RESULTS_DIR,
|
|
check_garbled,
|
|
compute_accuracy,
|
|
find_rendered_pdfs,
|
|
find_yaml_for_pdf,
|
|
get_expected_strings,
|
|
normalize_quotes,
|
|
write_json,
|
|
)
|
|
from ruamel.yaml import YAML
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Extractors
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def extract_pdftotext(pdf_path: Path) -> str:
|
|
"""Extract text using Poppler's pdftotext."""
|
|
result = subprocess.run(
|
|
["pdftotext", "-layout", str(pdf_path), "-"],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=30,
|
|
check=False,
|
|
)
|
|
return result.stdout
|
|
|
|
|
|
def extract_pymupdf(pdf_path: Path) -> str:
|
|
"""Extract text using PyMuPDF."""
|
|
doc = fitz.open(str(pdf_path))
|
|
text = "".join(page.get_text() for page in doc)
|
|
doc.close()
|
|
return text
|
|
|
|
|
|
EXTRACTORS: dict[str, Callable[[Path], str]] = {
|
|
"pdftotext": extract_pdftotext,
|
|
"pymupdf": extract_pymupdf,
|
|
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Structural checks
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def check_poppler_installed() -> bool:
|
|
"""Check if pdftotext is available."""
|
|
try:
|
|
subprocess.run(
|
|
["pdftotext", "-v"], capture_output=True, timeout=10, check=False
|
|
)
|
|
return True
|
|
except FileNotFoundError:
|
|
return False
|
|
|
|
|
|
def check_reading_order(extracted: str, name: str) -> dict:
|
|
"""Check that the CV name appears near the top of extracted text."""
|
|
lines = [line.strip() for line in extracted.split("\n") if line.strip()]
|
|
result: dict = {"correct": True, "issues": []}
|
|
|
|
if not name:
|
|
return result
|
|
|
|
for line in lines[:10]:
|
|
if normalize_quotes(name) in normalize_quotes(line):
|
|
return result
|
|
|
|
result["correct"] = False
|
|
result["issues"].append(f"Name not found in first 10 lines: {name}")
|
|
return result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Single-PDF analysis
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def analyze_pdf(pdf_path: Path) -> dict:
|
|
"""Run all extractors and structural checks on a single PDF."""
|
|
yaml_path = find_yaml_for_pdf(pdf_path)
|
|
expected = get_expected_strings(yaml_path) if yaml_path else []
|
|
|
|
# Get CV name for reading order check
|
|
name = ""
|
|
if yaml_path:
|
|
yaml = YAML()
|
|
with yaml_path.open(encoding="utf-8") as f:
|
|
data = yaml.load(f)
|
|
name = data.get("cv", {}).get("name", "")
|
|
|
|
result: dict = {
|
|
"pdf": str(pdf_path.relative_to(RENDERED_DIR)),
|
|
"extractors": {},
|
|
"structural": {
|
|
"text_extractable": False,
|
|
"no_garbled": True,
|
|
"reading_order": True,
|
|
},
|
|
}
|
|
|
|
for ext_name, extractor in EXTRACTORS.items():
|
|
try:
|
|
text = extractor(pdf_path)
|
|
accuracy = compute_accuracy(text, expected)
|
|
garbled = check_garbled(text)
|
|
|
|
result["extractors"][ext_name] = {
|
|
"success": True,
|
|
"text_length": len(text),
|
|
"accuracy": accuracy["accuracy"],
|
|
"fields_found": accuracy["found"],
|
|
"fields_total": accuracy["total"],
|
|
"missing": accuracy["missing"],
|
|
"garbled": garbled,
|
|
}
|
|
|
|
# Use pdftotext for structural checks
|
|
if ext_name == "pdftotext":
|
|
result["structural"]["text_extractable"] = len(text.strip()) > 0
|
|
result["structural"]["no_garbled"] = len(garbled) == 0
|
|
reading = check_reading_order(text, name)
|
|
result["structural"]["reading_order"] = reading["correct"]
|
|
result["structural"]["reading_issues"] = reading["issues"]
|
|
|
|
except Exception as e:
|
|
result["extractors"][ext_name] = {"success": False, "error": str(e)}
|
|
|
|
return result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main() -> None:
|
|
if not check_poppler_installed():
|
|
print("ERROR: poppler-utils not found.") # noqa: T201
|
|
print(" macOS: brew install poppler") # noqa: T201
|
|
print(" Ubuntu: apt install poppler-utils") # noqa: T201
|
|
sys.exit(1)
|
|
|
|
pdfs = find_rendered_pdfs()
|
|
if not pdfs:
|
|
print("No PDFs found in rendered/. Run render_pdfs.py first.") # noqa: T201
|
|
sys.exit(1)
|
|
|
|
print(f"Analyzing {len(pdfs)} PDFs with {len(EXTRACTORS)} extractors...") # noqa: T201
|
|
|
|
all_results: list[dict] = []
|
|
structural_pass = 0
|
|
extractor_stats: dict[str, dict] = {
|
|
name: {"total": 0, "accuracy_sum": 0.0, "garbled": 0} for name in EXTRACTORS
|
|
}
|
|
|
|
for pdf_path in pdfs:
|
|
result = analyze_pdf(pdf_path)
|
|
all_results.append(result)
|
|
|
|
# Structural pass/fail
|
|
s = result["structural"]
|
|
passed = s["text_extractable"] and s["no_garbled"] and s["reading_order"]
|
|
if passed:
|
|
structural_pass += 1
|
|
|
|
# Extractor stats
|
|
statuses: list[str] = []
|
|
for ext_name, ext_result in result["extractors"].items():
|
|
if ext_result.get("success"):
|
|
extractor_stats[ext_name]["total"] += 1
|
|
extractor_stats[ext_name]["accuracy_sum"] += ext_result["accuracy"]
|
|
if ext_result.get("garbled"):
|
|
extractor_stats[ext_name]["garbled"] += 1
|
|
statuses.append(f"{ext_name}={ext_result['accuracy']:.0%}")
|
|
else:
|
|
statuses.append(f"{ext_name}=FAIL")
|
|
|
|
status = "PASS" if passed else "FAIL"
|
|
print(f" {status}: {result['pdf']} ({', '.join(statuses)})") # noqa: T201
|
|
|
|
# Write structural results
|
|
structural_summary = {
|
|
"total": len(pdfs),
|
|
"passed": structural_pass,
|
|
"failed": len(pdfs) - structural_pass,
|
|
"pass_rate": f"{structural_pass / len(pdfs) * 100:.1f}%",
|
|
}
|
|
write_json(RESULTS_DIR / "structural" / "structural_results.json", all_results)
|
|
write_json(
|
|
RESULTS_DIR / "structural" / "structural_summary.json", structural_summary
|
|
)
|
|
|
|
# Write extraction summary
|
|
extraction_summary: dict = {"total_pdfs": len(pdfs), "extractors": {}}
|
|
for name, stats in extractor_stats.items():
|
|
total = stats["total"]
|
|
avg = stats["accuracy_sum"] / total if total > 0 else 0
|
|
extraction_summary["extractors"][name] = {
|
|
"pdfs_tested": total,
|
|
"average_accuracy": f"{avg:.1%}",
|
|
"garbled_count": stats["garbled"],
|
|
}
|
|
write_json(RESULTS_DIR / "opensource" / "extraction_results.json", all_results)
|
|
write_json(
|
|
RESULTS_DIR / "opensource" / "extraction_summary.json", extraction_summary
|
|
)
|
|
|
|
# Print summary
|
|
print( # noqa: T201
|
|
f"\nStructural: {structural_pass}/{len(pdfs)} passed ({structural_summary['pass_rate']})"
|
|
)
|
|
for name, s in extraction_summary["extractors"].items():
|
|
print( # noqa: T201
|
|
f"Extraction ({name}): {s['average_accuracy']} avg accuracy, {s['garbled_count']} garbled"
|
|
)
|
|
|
|
if structural_pass < len(pdfs):
|
|
print( # noqa: T201
|
|
f"\nWARNING: {len(pdfs) - structural_pass} PDFs failed structural analysis."
|
|
)
|
|
sys.exit(1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|