"""Analyze rendered PDFs: structural checks and text extraction. Structural: pdftotext/pdfinfo via Poppler. Extraction: pdftotext (Poppler) and PyMuPDF — two independent tools that cover the standard extraction path used by most ATS platforms. """ import subprocess import sys from collections.abc import Callable from pathlib import Path import fitz # PyMuPDF from common import ( RENDERED_DIR, RESULTS_DIR, check_garbled, compute_accuracy, find_rendered_pdfs, find_yaml_for_pdf, get_expected_strings, normalize_quotes, write_json, ) from ruamel.yaml import YAML # --------------------------------------------------------------------------- # Extractors # --------------------------------------------------------------------------- def extract_pdftotext(pdf_path: Path) -> str: """Extract text using Poppler's pdftotext.""" result = subprocess.run( ["pdftotext", "-layout", str(pdf_path), "-"], capture_output=True, text=True, timeout=30, check=False, ) return result.stdout def extract_pymupdf(pdf_path: Path) -> str: """Extract text using PyMuPDF.""" doc = fitz.open(str(pdf_path)) text = "".join(page.get_text() for page in doc) doc.close() return text EXTRACTORS: dict[str, Callable[[Path], str]] = { "pdftotext": extract_pdftotext, "pymupdf": extract_pymupdf, } # --------------------------------------------------------------------------- # Structural checks # --------------------------------------------------------------------------- def check_poppler_installed() -> bool: """Check if pdftotext is available.""" try: subprocess.run( ["pdftotext", "-v"], capture_output=True, timeout=10, check=False ) return True except FileNotFoundError: return False def check_reading_order(extracted: str, name: str) -> dict: """Check that the CV name appears near the top of extracted text.""" lines = [line.strip() for line in extracted.split("\n") if line.strip()] result: dict = {"correct": True, "issues": []} if not name: return result for line in lines[:10]: if normalize_quotes(name) in normalize_quotes(line): return result result["correct"] = False result["issues"].append(f"Name not found in first 10 lines: {name}") return result # --------------------------------------------------------------------------- # Single-PDF analysis # --------------------------------------------------------------------------- def analyze_pdf(pdf_path: Path) -> dict: """Run all extractors and structural checks on a single PDF.""" yaml_path = find_yaml_for_pdf(pdf_path) expected = get_expected_strings(yaml_path) if yaml_path else [] # Get CV name for reading order check name = "" if yaml_path: yaml = YAML() with yaml_path.open(encoding="utf-8") as f: data = yaml.load(f) name = data.get("cv", {}).get("name", "") result: dict = { "pdf": str(pdf_path.relative_to(RENDERED_DIR)), "extractors": {}, "structural": { "text_extractable": False, "no_garbled": True, "reading_order": True, }, } for ext_name, extractor in EXTRACTORS.items(): try: text = extractor(pdf_path) accuracy = compute_accuracy(text, expected) garbled = check_garbled(text) result["extractors"][ext_name] = { "success": True, "text_length": len(text), "accuracy": accuracy["accuracy"], "fields_found": accuracy["found"], "fields_total": accuracy["total"], "missing": accuracy["missing"], "garbled": garbled, } # Use pdftotext for structural checks if ext_name == "pdftotext": result["structural"]["text_extractable"] = len(text.strip()) > 0 result["structural"]["no_garbled"] = len(garbled) == 0 reading = check_reading_order(text, name) result["structural"]["reading_order"] = reading["correct"] result["structural"]["reading_issues"] = reading["issues"] except Exception as e: result["extractors"][ext_name] = {"success": False, "error": str(e)} return result # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- def main() -> None: if not check_poppler_installed(): print("ERROR: poppler-utils not found.") # noqa: T201 print(" macOS: brew install poppler") # noqa: T201 print(" Ubuntu: apt install poppler-utils") # noqa: T201 sys.exit(1) pdfs = find_rendered_pdfs() if not pdfs: print("No PDFs found in rendered/. Run render_pdfs.py first.") # noqa: T201 sys.exit(1) print(f"Analyzing {len(pdfs)} PDFs with {len(EXTRACTORS)} extractors...") # noqa: T201 all_results: list[dict] = [] structural_pass = 0 extractor_stats: dict[str, dict] = { name: {"total": 0, "accuracy_sum": 0.0, "garbled": 0} for name in EXTRACTORS } for pdf_path in pdfs: result = analyze_pdf(pdf_path) all_results.append(result) # Structural pass/fail s = result["structural"] passed = s["text_extractable"] and s["no_garbled"] and s["reading_order"] if passed: structural_pass += 1 # Extractor stats statuses: list[str] = [] for ext_name, ext_result in result["extractors"].items(): if ext_result.get("success"): extractor_stats[ext_name]["total"] += 1 extractor_stats[ext_name]["accuracy_sum"] += ext_result["accuracy"] if ext_result.get("garbled"): extractor_stats[ext_name]["garbled"] += 1 statuses.append(f"{ext_name}={ext_result['accuracy']:.0%}") else: statuses.append(f"{ext_name}=FAIL") status = "PASS" if passed else "FAIL" print(f" {status}: {result['pdf']} ({', '.join(statuses)})") # noqa: T201 # Write structural results structural_summary = { "total": len(pdfs), "passed": structural_pass, "failed": len(pdfs) - structural_pass, "pass_rate": f"{structural_pass / len(pdfs) * 100:.1f}%", } write_json(RESULTS_DIR / "structural" / "structural_results.json", all_results) write_json( RESULTS_DIR / "structural" / "structural_summary.json", structural_summary ) # Write extraction summary extraction_summary: dict = {"total_pdfs": len(pdfs), "extractors": {}} for name, stats in extractor_stats.items(): total = stats["total"] avg = stats["accuracy_sum"] / total if total > 0 else 0 extraction_summary["extractors"][name] = { "pdfs_tested": total, "average_accuracy": f"{avg:.1%}", "garbled_count": stats["garbled"], } write_json(RESULTS_DIR / "opensource" / "extraction_results.json", all_results) write_json( RESULTS_DIR / "opensource" / "extraction_summary.json", extraction_summary ) # Print summary print( # noqa: T201 f"\nStructural: {structural_pass}/{len(pdfs)} passed ({structural_summary['pass_rate']})" ) for name, s in extraction_summary["extractors"].items(): print( # noqa: T201 f"Extraction ({name}): {s['average_accuracy']} avg accuracy, {s['garbled_count']} garbled" ) if structural_pass < len(pdfs): print( # noqa: T201 f"\nWARNING: {len(pdfs) - structural_pass} PDFs failed structural analysis." ) sys.exit(1) if __name__ == "__main__": main()