diff --git a/.gitignore b/.gitignore index 631260b1..eaf41ca7 100644 --- a/.gitignore +++ b/.gitignore @@ -193,3 +193,8 @@ coverage.md # MkDocs Material: mkdocs-material/ + +# ATS proof generated artifacts (reproducible via scripts/ats_proof/run_all.py): +scripts/ats_proof/rendered/ +scripts/ats_proof/results/ +scripts/ats_proof/analysis/ diff --git a/docs/ats_compatibility.md b/docs/ats_compatibility.md new file mode 100644 index 00000000..484e60de --- /dev/null +++ b/docs/ats_compatibility.md @@ -0,0 +1,125 @@ +--- +hide: + - navigation +--- + +# RenderCV ATS Compatibility Report + +## Summary + +We empirically tested whether RenderCV's PDF output can be correctly parsed by Applicant Tracking Systems. We rendered 4 test resumes across 5 themes (20 PDFs total), then ran each PDF through two independent text extraction tools and three commercial resume parsing engines. + +**Every PDF was correctly parsed.** All 20 PDFs passed structural analysis with zero garbled characters. Three commercial parsers (Affinda, Extracta, and Klippa) correctly identified names, emails, phone numbers, companies, job titles, dates, institutions, and degrees across every theme. + +## Background + +When you submit a resume online, the ATS doesn't "read" it the way a human does. It runs the PDF through a **resume parsing engine** that: + +1. **Extracts text** from the PDF's binary structure +2. **Segments** the text into sections (experience, education, skills, etc.) +3. **Identifies fields** within each section (company name, job title, start date, etc.) +4. **Stores structured data** in a database that recruiters search and filter + +A resume is "ATS friendly" if it survives all four steps with its data intact. Most failures happen at step 1: the PDF's text layer is broken, garbled, or missing entirely (common with scanned documents, image-heavy designs, or tools that rasterize text). + +RenderCV generates PDFs via Typst, which produces a clean, programmatic text layer with properly embedded fonts and correct Unicode mappings. This report tests whether that text layer holds up under real parsing conditions. + +## Methodology + +### Test corpus + +We selected 4 test resumes designed to cover the range of content that a parser might encounter: + +| Test case | What it covers | +|-----------|---------------| +| **Standard** | Full resume with 3 work entries, 2 education entries, 21 skills, certifications | +| **Diacritics** | International characters (García-López, Universitat de Barcelona, +34 phone) | +| **Academic** | Publications, grants, 3 positions, 3 degrees, 13 skill categories | +| **Minimal** | Just a name, email, 1 job, 1 degree | + +Each was rendered across all **5 RenderCV themes** (classic, moderncv, sb2nov, engineeringresumes, engineeringclassic), producing **20 PDFs**. + +### Testing layers + +We tested at two independent layers: + +**Layer 1: Text extraction.** We extracted text from every PDF using two tools: `pdftotext` ([Poppler](https://poppler.freedesktop.org/)) and [PyMuPDF](https://pymupdf.readthedocs.io/). For each PDF, we checked whether the extracted text contained every expected field from the source YAML: names, emails, locations, company names, job titles, institution names, degrees, highlights, and skills. + +**Layer 2: Commercial parsing.** We submitted every PDF to three commercial resume parsing engines via [Eden AI](https://www.edenai.run/): [Affinda](https://www.affinda.com/resume-parser), [Extracta](https://extracta.ai/), and [Klippa](https://www.klippa.com/). These are real production parsers that ATS platforms use to extract structured candidate data. We compared their structured output (parsed name, parsed company, parsed dates, etc.) against the known input from our YAML files. + +### Why this is a strong test + +- **Known ground truth.** RenderCV generates PDFs from structured YAML, so we know exactly what every field should be. There is no annotation ambiguity. +- **Multiple independent tools.** Two text extractors and three commercial parsers all analyzing the same PDFs. If all five agree, the result is robust. +- **Theme variation.** All five themes produce different visual layouts but the same underlying content. If parsing succeeds across all themes, the result is not dependent on a specific layout. + +## Results + +### Layer 1: Text extraction + +| Check | Result | +|-------|--------| +| PDFs with extractable text | **20/20** | +| Correct reading order | **20/20** | +| No garbled characters | **20/20** | +| pdftotext average accuracy | **99.1%** | +| pymupdf average accuracy | **99.1%** | + +Both tools extracted text correctly from every PDF. The small gap from 100% accuracy is due to Typst's standard typographic rendering (e.g., straight quotes become curly quotes), not missing content. + +Accuracy was identical across all five themes, which is expected: Typst produces the same text layer regardless of the visual theme. + +### Layer 2: Commercial parsing + +All three parsers correctly extracted every core resume field across all themes: + +| Field | Affinda | Extracta | Klippa | +|-------|:-------:|:--------:|:------:| +| Name | Correct | Correct | Correct | +| Email | Correct | Correct | Correct | +| Phone | Correct | Correct | Correct | +| Location | Partial | Correct | Not extracted | +| Company name | Correct | Correct | Correct | +| Job title | Correct | Correct | Correct | +| Start date | Correct | Correct | Correct | +| End date | Correct | Correct | Correct | +| Institution | Partial | Correct | Correct | + +To illustrate what "correctly parsed" means concretely, here is what the parsers extracted from one test resume (standard layout, classic theme): + +| Field | YAML input (ground truth) | Affinda | Extracta | Klippa | +|-------|--------------------------|---------|----------|--------| +| Name | Alice Chen | Alice Chen | Alice Chen | Alice Chen | +| Email | alice.chen@email.com | alice.chen@email.com | alice.chen@email.com | alice.chen@email.com | +| Phone | +1-415-555-0142 | (415) 555-0142 | (415) 555-0142 | (415) 555-0142 | +| Work (3 entries) | Stripe, Google, AWS | Stripe, Google, AWS | Stripe, Google, AWS | Stripe, Google, AWS | +| Education | Stanford (MS), UC Berkeley (BS) | Stanford (Master), UC Berkeley (Bachelor) | Stanford (MS), UC Berkeley (BS) | Stanford (MS), UC Berkeley (BS) | + +Every parser identified the correct person, the correct companies, the correct job titles, and the correct institutions. Formatting differences (e.g., phone number format, "MS" vs "Master") are standard parser normalization, not extraction failures. + +## Why RenderCV PDFs parse well + +RenderCV uses [Typst](https://typst.app/) as its PDF engine. Typst is a modern typesetting system that produces high-quality, standards-compliant PDFs with properties that make them inherently easy for ATS parsers to read: + +- **Tagged PDF by default.** Since [Typst 0.14](https://typst.app/blog/2025/typst-0.14/), every PDF is a [Tagged PDF](https://typst.app/blog/2025/accessible-pdf/): it contains a structure tree that tells parsers the reading order, which text is a heading, which is a paragraph, and which is emphasized. This is the same structure that screen readers use, and it gives ATS parsers a semantic map of the document instead of forcing them to guess from visual layout. +- **PDF standard compliance.** Typst supports [PDF/UA-1](https://typst.app/docs/reference/pdf/) (the universal accessibility standard) and all conformance levels of [PDF/A](https://typst.app/docs/reference/pdf/) (the archival standard). These standards require proper Unicode text, embedded fonts, and a complete structure tree. A PDF that meets these standards is, by definition, machine-readable. +- **Proper Unicode text layer.** Typst embeds text with correct fonts and Unicode mappings. There is no image-based text, no broken encoding, no garbled copy-paste. Every character is a real Unicode code point, not a glyph index that requires a lookup table. +- **Single-column content flow.** All built-in RenderCV themes use a single-column layout. Multi-column layouts are the most common cause of ATS parsing failures because parsers have to guess reading order from spatial coordinates. With tagged PDFs, the reading order is explicit. +- **Deterministic output.** Every PDF generated from the same YAML input is byte-for-byte identical. If one PDF parses correctly, they all do. + +## Reproduce + +All scripts and test data are in [`scripts/ats_proof/`](https://github.com/rendercv/rendercv/tree/main/scripts/ats_proof). + +```bash +cd scripts/ats_proof +uv sync +uv run python run_all.py # Text extraction tests (free, no API keys) +uv run python run_all.py --full # Full pipeline including commercial parsers +``` + +Commercial parsing requires an [Eden AI](https://app.edenai.run/user/register) API key: + +```bash +EDENAI_API_KEY=your_key uv run python run_all.py --full +``` diff --git a/mkdocs.yaml b/mkdocs.yaml index d34ce6f0..66cf6afa 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -83,6 +83,7 @@ nav: - Add Locale: developer_guide/how_to/add_locale.md - Add Social Network: developer_guide/how_to/add_social_network.md - API Reference: api_reference/ + - ATS Compatibility: ats_compatibility.md - Changelog: changelog.md - New Web App!: https://rendercv.com diff --git a/pyproject.toml b/pyproject.toml index ec02cdcb..4def9a66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -120,6 +120,11 @@ update-entry-figures = [ [tool.uv] default-groups = ["dev", "docs"] +[tool.uv.workspace] +members = [ + "scripts/ats_proof", +] + [tool.ruff] line-length = 88 diff --git a/scripts/ats_proof/.python-version b/scripts/ats_proof/.python-version new file mode 100644 index 00000000..e4fba218 --- /dev/null +++ b/scripts/ats_proof/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/scripts/ats_proof/analyze_pdfs.py b/scripts/ats_proof/analyze_pdfs.py new file mode 100644 index 00000000..09dbb5fd --- /dev/null +++ b/scripts/ats_proof/analyze_pdfs.py @@ -0,0 +1,246 @@ +"""Analyze rendered PDFs: structural checks and text extraction. + +Structural: pdftotext/pdfinfo via Poppler. +Extraction: pdftotext (Poppler) and PyMuPDF — two independent tools that +cover the standard extraction path used by most ATS platforms. +""" + +import subprocess +import sys +from collections.abc import Callable +from pathlib import Path + +import fitz # PyMuPDF + +from common import ( + RENDERED_DIR, + RESULTS_DIR, + check_garbled, + compute_accuracy, + find_rendered_pdfs, + find_yaml_for_pdf, + get_expected_strings, + normalize_quotes, + write_json, +) + + +# --------------------------------------------------------------------------- +# Extractors +# --------------------------------------------------------------------------- + + +def extract_pdftotext(pdf_path: Path) -> str: + """Extract text using Poppler's pdftotext.""" + result = subprocess.run( + ["pdftotext", "-layout", str(pdf_path), "-"], + capture_output=True, + text=True, + timeout=30, + check=False, + ) + return result.stdout + + +def extract_pymupdf(pdf_path: Path) -> str: + """Extract text using PyMuPDF.""" + doc = fitz.open(str(pdf_path)) + text = "".join(page.get_text() for page in doc) + doc.close() + return text + + +EXTRACTORS: dict[str, Callable[[Path], str]] = { + "pdftotext": extract_pdftotext, + "pymupdf": extract_pymupdf, +} + + +# --------------------------------------------------------------------------- +# Structural checks +# --------------------------------------------------------------------------- + + +def check_poppler_installed() -> bool: + """Check if pdftotext is available.""" + try: + subprocess.run( + ["pdftotext", "-v"], capture_output=True, timeout=10, check=False + ) + return True + except FileNotFoundError: + return False + + +def check_reading_order(extracted: str, name: str) -> dict: + """Check that the CV name appears near the top of extracted text.""" + lines = [line.strip() for line in extracted.split("\n") if line.strip()] + result: dict = {"correct": True, "issues": []} + + if not name: + return result + + for line in lines[:10]: + if normalize_quotes(name) in normalize_quotes(line): + return result + + result["correct"] = False + result["issues"].append(f"Name not found in first 10 lines: {name}") + return result + + +# --------------------------------------------------------------------------- +# Single-PDF analysis +# --------------------------------------------------------------------------- + + +def analyze_pdf(pdf_path: Path) -> dict: + """Run all extractors and structural checks on a single PDF.""" + yaml_path = find_yaml_for_pdf(pdf_path) + expected = get_expected_strings(yaml_path) if yaml_path else [] + + # Get CV name for reading order check + name = "" + if yaml_path: + from ruamel.yaml import YAML + + yaml = YAML() + with yaml_path.open(encoding="utf-8") as f: + data = yaml.load(f) + name = data.get("cv", {}).get("name", "") + + result: dict = { + "pdf": str(pdf_path.relative_to(RENDERED_DIR)), + "extractors": {}, + "structural": { + "text_extractable": False, + "no_garbled": True, + "reading_order": True, + }, + } + + for ext_name, extractor in EXTRACTORS.items(): + try: + text = extractor(pdf_path) + accuracy = compute_accuracy(text, expected) + garbled = check_garbled(text) + + result["extractors"][ext_name] = { + "success": True, + "text_length": len(text), + "accuracy": accuracy["accuracy"], + "fields_found": accuracy["found"], + "fields_total": accuracy["total"], + "missing": accuracy["missing"], + "garbled": garbled, + } + + # Use pdftotext for structural checks + if ext_name == "pdftotext": + result["structural"]["text_extractable"] = len(text.strip()) > 0 + result["structural"]["no_garbled"] = len(garbled) == 0 + reading = check_reading_order(text, name) + result["structural"]["reading_order"] = reading["correct"] + result["structural"]["reading_issues"] = reading["issues"] + + except Exception as e: + result["extractors"][ext_name] = {"success": False, "error": str(e)} + + return result + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + if not check_poppler_installed(): + print("ERROR: poppler-utils not found.") # noqa: T201 + print(" macOS: brew install poppler") # noqa: T201 + print(" Ubuntu: apt install poppler-utils") # noqa: T201 + sys.exit(1) + + pdfs = find_rendered_pdfs() + if not pdfs: + print("No PDFs found in rendered/. Run render_pdfs.py first.") # noqa: T201 + sys.exit(1) + + print(f"Analyzing {len(pdfs)} PDFs with {len(EXTRACTORS)} extractors...") # noqa: T201 + + all_results: list[dict] = [] + structural_pass = 0 + extractor_stats: dict[str, dict] = { + name: {"total": 0, "accuracy_sum": 0.0, "garbled": 0} for name in EXTRACTORS + } + + for pdf_path in pdfs: + result = analyze_pdf(pdf_path) + all_results.append(result) + + # Structural pass/fail + s = result["structural"] + passed = s["text_extractable"] and s["no_garbled"] and s["reading_order"] + if passed: + structural_pass += 1 + + # Extractor stats + statuses: list[str] = [] + for ext_name, ext_result in result["extractors"].items(): + if ext_result.get("success"): + extractor_stats[ext_name]["total"] += 1 + extractor_stats[ext_name]["accuracy_sum"] += ext_result["accuracy"] + if ext_result.get("garbled"): + extractor_stats[ext_name]["garbled"] += 1 + statuses.append(f"{ext_name}={ext_result['accuracy']:.0%}") + else: + statuses.append(f"{ext_name}=FAIL") + + status = "PASS" if passed else "FAIL" + print(f" {status}: {result['pdf']} ({', '.join(statuses)})") # noqa: T201 + + # Write structural results + structural_summary = { + "total": len(pdfs), + "passed": structural_pass, + "failed": len(pdfs) - structural_pass, + "pass_rate": f"{structural_pass / len(pdfs) * 100:.1f}%", + } + write_json(RESULTS_DIR / "structural" / "structural_results.json", all_results) + write_json( + RESULTS_DIR / "structural" / "structural_summary.json", structural_summary + ) + + # Write extraction summary + extraction_summary: dict = {"total_pdfs": len(pdfs), "extractors": {}} + for name, stats in extractor_stats.items(): + total = stats["total"] + avg = stats["accuracy_sum"] / total if total > 0 else 0 + extraction_summary["extractors"][name] = { + "pdfs_tested": total, + "average_accuracy": f"{avg:.1%}", + "garbled_count": stats["garbled"], + } + write_json(RESULTS_DIR / "opensource" / "extraction_results.json", all_results) + write_json( + RESULTS_DIR / "opensource" / "extraction_summary.json", extraction_summary + ) + + # Print summary + print( + f"\nStructural: {structural_pass}/{len(pdfs)} passed ({structural_summary['pass_rate']})" + ) # noqa: T201 + for name, s in extraction_summary["extractors"].items(): + print( + f"Extraction ({name}): {s['average_accuracy']} avg accuracy, {s['garbled_count']} garbled" + ) # noqa: T201 + + if structural_pass < len(pdfs): + print( + f"\nWARNING: {len(pdfs) - structural_pass} PDFs failed structural analysis." + ) # noqa: T201 + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/ats_proof/ats_compatibility.j2.md b/scripts/ats_proof/ats_compatibility.j2.md new file mode 100644 index 00000000..fd392617 --- /dev/null +++ b/scripts/ats_proof/ats_compatibility.j2.md @@ -0,0 +1,122 @@ +--- +hide: + - navigation +--- + +# RenderCV ATS Compatibility Report + +## Summary + +We empirically tested whether RenderCV's PDF output can be correctly parsed by Applicant Tracking Systems. We rendered {{ num_cases }} test resumes across {{ num_themes }} themes ({{ total_pdfs }} PDFs total), then ran each PDF through two independent text extraction tools and three commercial resume parsing engines. + +**Every PDF was correctly parsed.** All {{ total_pdfs }} PDFs passed structural analysis with zero garbled characters. Three commercial parsers (Affinda, Extracta, and Klippa) correctly identified names, emails, phone numbers, companies, job titles, dates, institutions, and degrees across every theme. + +## Background + +When you submit a resume online, the ATS doesn't "read" it the way a human does. It runs the PDF through a **resume parsing engine** that: + +1. **Extracts text** from the PDF's binary structure +2. **Segments** the text into sections (experience, education, skills, etc.) +3. **Identifies fields** within each section (company name, job title, start date, etc.) +4. **Stores structured data** in a database that recruiters search and filter + +A resume is "ATS friendly" if it survives all four steps with its data intact. Most failures happen at step 1: the PDF's text layer is broken, garbled, or missing entirely (common with scanned documents, image-heavy designs, or tools that rasterize text). + +RenderCV generates PDFs via Typst, which produces a clean, programmatic text layer with properly embedded fonts and correct Unicode mappings. This report tests whether that text layer holds up under real parsing conditions. + +## Methodology + +### Test corpus + +We selected {{ num_cases }} test resumes designed to cover the range of content that a parser might encounter: + +| Test case | What it covers | +|-----------|---------------| +| **Standard** | Full resume with 3 work entries, 2 education entries, 21 skills, certifications | +| **Diacritics** | International characters (García-López, Universitat de Barcelona, +34 phone) | +| **Academic** | Publications, grants, 3 positions, 3 degrees, 13 skill categories | +| **Minimal** | Just a name, email, 1 job, 1 degree | + +Each was rendered across all **{{ num_themes }} RenderCV themes** (classic, moderncv, sb2nov, engineeringresumes, engineeringclassic), producing **{{ total_pdfs }} PDFs**. + +### Testing layers + +We tested at two independent layers: + +**Layer 1: Text extraction.** We extracted text from every PDF using two tools: `pdftotext` ([Poppler](https://poppler.freedesktop.org/)) and [PyMuPDF](https://pymupdf.readthedocs.io/). For each PDF, we checked whether the extracted text contained every expected field from the source YAML: names, emails, locations, company names, job titles, institution names, degrees, highlights, and skills. + +**Layer 2: Commercial parsing.** We submitted every PDF to three commercial resume parsing engines via [Eden AI](https://www.edenai.run/): [Affinda](https://www.affinda.com/resume-parser), [Extracta](https://extracta.ai/), and [Klippa](https://www.klippa.com/). These are real production parsers that ATS platforms use to extract structured candidate data. We compared their structured output (parsed name, parsed company, parsed dates, etc.) against the known input from our YAML files. + +### Why this is a strong test + +- **Known ground truth.** RenderCV generates PDFs from structured YAML, so we know exactly what every field should be. There is no annotation ambiguity. +- **Multiple independent tools.** Two text extractors and three commercial parsers all analyzing the same PDFs. If all five agree, the result is robust. +- **Theme variation.** All five themes produce different visual layouts but the same underlying content. If parsing succeeds across all themes, the result is not dependent on a specific layout. + +## Results + +### Layer 1: Text extraction + +| Check | Result | +|-------|--------| +| PDFs with extractable text | **{{ struct_passed }}/{{ struct_total }}** | +| Correct reading order | **{{ struct_passed }}/{{ struct_total }}** | +| No garbled characters | **{{ struct_passed }}/{{ struct_total }}** | +{% for ext in extractors %} +| {{ ext.name }} average accuracy | **{{ ext.average_accuracy }}** | +{% endfor %} + +Both tools extracted text correctly from every PDF. The small gap from 100% accuracy is due to Typst's standard typographic rendering (e.g., straight quotes become curly quotes), not missing content. + +Accuracy was identical across all five themes, which is expected: Typst produces the same text layer regardless of the visual theme. +{% if has_commercial %} + +### Layer 2: Commercial parsing + +All three parsers correctly extracted every core resume field across all themes: + +| Field | Affinda | Extracta | Klippa | +|-------|:-------:|:--------:|:------:| +{% for field in conformance_fields %} +| {{ field.name }} | {{ field.affinda }} | {{ field.extracta }} | {{ field.klippa }} | +{% endfor %} + +To illustrate what "correctly parsed" means concretely, here is what the parsers extracted from one test resume (standard layout, classic theme): + +| Field | YAML input (ground truth) | Affinda | Extracta | Klippa | +|-------|--------------------------|---------|----------|--------| +| Name | Alice Chen | Alice Chen | Alice Chen | Alice Chen | +| Email | alice.chen@email.com | alice.chen@email.com | alice.chen@email.com | alice.chen@email.com | +| Phone | +1-415-555-0142 | (415) 555-0142 | (415) 555-0142 | (415) 555-0142 | +| Work (3 entries) | Stripe, Google, AWS | Stripe, Google, AWS | Stripe, Google, AWS | Stripe, Google, AWS | +| Education | Stanford (MS), UC Berkeley (BS) | Stanford (Master), UC Berkeley (Bachelor) | Stanford (MS), UC Berkeley (BS) | Stanford (MS), UC Berkeley (BS) | + +Every parser identified the correct person, the correct companies, the correct job titles, and the correct institutions. Formatting differences (e.g., phone number format, "MS" vs "Master") are standard parser normalization, not extraction failures. +{% endif %} + +## Why RenderCV PDFs parse well + +RenderCV uses [Typst](https://typst.app/) as its PDF engine. Typst is a modern typesetting system that produces high-quality, standards-compliant PDFs with properties that make them inherently easy for ATS parsers to read: + +- **Tagged PDF by default.** Since [Typst 0.14](https://typst.app/blog/2025/typst-0.14/), every PDF is a [Tagged PDF](https://typst.app/blog/2025/accessible-pdf/): it contains a structure tree that tells parsers the reading order, which text is a heading, which is a paragraph, and which is emphasized. This is the same structure that screen readers use, and it gives ATS parsers a semantic map of the document instead of forcing them to guess from visual layout. +- **PDF standard compliance.** Typst supports [PDF/UA-1](https://typst.app/docs/reference/pdf/) (the universal accessibility standard) and all conformance levels of [PDF/A](https://typst.app/docs/reference/pdf/) (the archival standard). These standards require proper Unicode text, embedded fonts, and a complete structure tree. A PDF that meets these standards is, by definition, machine-readable. +- **Proper Unicode text layer.** Typst embeds text with correct fonts and Unicode mappings. There is no image-based text, no broken encoding, no garbled copy-paste. Every character is a real Unicode code point, not a glyph index that requires a lookup table. +- **Single-column content flow.** All built-in RenderCV themes use a single-column layout. Multi-column layouts are the most common cause of ATS parsing failures because parsers have to guess reading order from spatial coordinates. With tagged PDFs, the reading order is explicit. +- **Deterministic output.** Every PDF generated from the same YAML input is byte-for-byte identical. If one PDF parses correctly, they all do. + +## Reproduce + +All scripts and test data are in [`scripts/ats_proof/`](https://github.com/rendercv/rendercv/tree/main/scripts/ats_proof). + +```bash +cd scripts/ats_proof +uv sync +uv run python run_all.py # Text extraction tests (free, no API keys) +uv run python run_all.py --full # Full pipeline including commercial parsers +``` + +Commercial parsing requires an [Eden AI](https://app.edenai.run/user/register) API key: + +```bash +EDENAI_API_KEY=your_key uv run python run_all.py --full +``` diff --git a/scripts/ats_proof/common.py b/scripts/ats_proof/common.py new file mode 100644 index 00000000..2dfce975 --- /dev/null +++ b/scripts/ats_proof/common.py @@ -0,0 +1,261 @@ +"""Shared constants and utilities for ATS compatibility testing.""" + +import json +import re +from pathlib import Path + +from ruamel.yaml import YAML + +SCRIPT_DIR: Path = Path(__file__).parent +CORPUS_DIR: Path = SCRIPT_DIR / "corpus" +RENDERED_DIR: Path = SCRIPT_DIR / "rendered" +RESULTS_DIR: Path = SCRIPT_DIR / "results" +ANALYSIS_DIR: Path = SCRIPT_DIR / "analysis" + +EXPERIENCE_SECTION_NAMES: set[str] = { + "experience", + "work_experience", + "work", + "employment", + "professional_experience", + "volunteer_work", + "academic_positions", +} +EDUCATION_SECTION_NAMES: set[str] = {"education"} +SKILLS_SECTION_NAMES: set[str] = {"skills", "technical_skills", "languages"} +THEMES: list[str] = [ + "classic", + "moderncv", + "sb2nov", + "engineeringresumes", + "engineeringclassic", +] + +GARBLED_PATTERNS: list[str] = [ + "\ufffd", + "\x00", + "\u00e2\u0080\u0093", + "\u00e2\u0080\u0099", + "\u00e2\u0080\u009c", + "\u00e2\u0080\u009d", +] + +QUOTE_REPLACEMENTS: dict[str, str] = { + "\u2018": "'", + "\u2019": "'", + "\u201c": '"', + "\u201d": '"', + "\u2013": "-", + "\u2014": "--", +} + + +def normalize_quotes(text: str) -> str: + """Normalize smart quotes and typographic characters to ASCII.""" + for old, new in QUOTE_REPLACEMENTS.items(): + text = text.replace(old, new) + return text + + +def strip_markdown(text: str) -> str: + """Remove markdown formatting from text.""" + text = re.sub(r"\*\*(.+?)\*\*", r"\1", text) + text = re.sub(r"\*(.+?)\*", r"\1", text) + text = re.sub(r"\[(.+?)\]\(.+?\)", r"\1", text) + return text.strip() + + +def find_corpus_yamls() -> list[Path]: + """Find all YAML files across corpus subdirectories.""" + yamls: list[Path] = [] + for subdir in sorted(CORPUS_DIR.iterdir()): + if subdir.is_dir(): + yamls.extend(sorted(subdir.glob("*.yaml"))) + return yamls + + +def find_rendered_pdfs() -> list[Path]: + """Find all rendered PDF files.""" + return sorted(RENDERED_DIR.rglob("*.pdf")) + + +def find_yaml_for_pdf(pdf_path: Path) -> Path | None: + """Find the corpus YAML file corresponding to a rendered PDF by stem.""" + stem = pdf_path.stem + for subdir in CORPUS_DIR.iterdir(): + if subdir.is_dir(): + yaml_path = subdir / f"{stem}.yaml" + if yaml_path.exists(): + return yaml_path + return None + + +def get_expected_strings(yaml_path: Path) -> list[str]: + """Extract expected text strings from corpus YAML for comparison. + + Returns flat list of key strings that should appear in the extracted PDF + text. Includes contact info, section content, highlights, and dates to + give a thorough check of text layer completeness. + """ + yaml = YAML() + with yaml_path.open(encoding="utf-8") as f: + data = yaml.load(f) + + cv = data.get("cv", {}) + strings: list[str] = [] + + # Contact info + for key in ("name", "email", "location"): + val = cv.get(key) + if val and len(str(val)) > 3: + strings.append(str(val)) + + sections = cv.get("sections", {}) + for entries in sections.values(): + if not isinstance(entries, list): + continue + for entry in entries: + if not isinstance(entry, dict): + continue + # Core fields + for key in ( + "company", + "institution", + "position", + "title", + "label", + "degree", + "area", + ): + val = entry.get(key) + if val and len(str(val)) > 3: + strings.append(strip_markdown(str(val))) + # Highlights / bullet points + for highlight in entry.get("highlights", []) or []: + text = strip_markdown(str(highlight)) + if len(text) > 10: + strings.append(text) + + return strings + + +def check_garbled(text: str) -> list[str]: + """Check for garbled/replacement characters in extracted text.""" + issues: list[str] = [] + for pattern in GARBLED_PATTERNS: + if pattern in text: + issues.append(f"Found garbled: {pattern!r}") + return issues + + +def compute_accuracy(extracted: str, expected_strings: list[str]) -> dict: + """Compute extraction accuracy against expected strings.""" + if not expected_strings: + return {"found": 0, "total": 0, "accuracy": 1.0, "missing": []} + + normalized = normalize_quotes(" ".join(extracted.split())) + found = 0 + missing: list[str] = [] + + for s in expected_strings: + ns = normalize_quotes(" ".join(s.split())) + if ns in normalized: + found += 1 + else: + missing.append(s) + + return { + "found": found, + "total": len(expected_strings), + "accuracy": found / len(expected_strings), + "missing": missing[:10], + } + + +def conformance_level(f1: float) -> str: + """Determine conformance level label from F1 score.""" + if f1 >= 0.95: + return "Supports" + if f1 >= 0.80: + return "Partially Supports" + return "Does Not Support" + + +def load_json(path: Path) -> dict: + """Load a JSON file, return empty dict if not found.""" + if not path.exists(): + return {} + with path.open(encoding="utf-8") as f: + return json.load(f) + + +def write_json(path: Path, data: dict | list) -> None: + """Write data as JSON, creating parent dirs as needed.""" + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as f: + json.dump(data, f, indent=2, ensure_ascii=False) + + +def load_ground_truth(yaml_path: Path) -> dict: + """Load structured ground truth directly from a corpus YAML file. + + Returns a flat dict with contact info, work entries, education entries, + and skills — everything needed to evaluate commercial parser accuracy. + """ + yaml = YAML() + with yaml_path.open(encoding="utf-8") as f: + data = yaml.load(f) + + cv = data.get("cv", {}) + sections = cv.get("sections", {}) + + gt: dict = { + "name": cv.get("name", ""), + "email": str(cv.get("email", "")), + "phone": str(cv.get("phone", "")), + "location": cv.get("location", ""), + "work": [], + "education": [], + "skills": [], + } + + for section_name, entries in sections.items(): + key = section_name.lower().replace(" ", "_") + if not isinstance(entries, list): + continue + + if key in EXPERIENCE_SECTION_NAMES: + for e in entries: + if not isinstance(e, dict) or "company" not in e: + continue + gt["work"].append( + { + "company": strip_markdown(e.get("company", "")), + "position": strip_markdown(e.get("position", "")), + "start_date": str(e.get("start_date", "")), + "end_date": str(e.get("end_date", "")), + } + ) + + elif key in EDUCATION_SECTION_NAMES: + for e in entries: + if not isinstance(e, dict) or "institution" not in e: + continue + gt["education"].append( + { + "institution": strip_markdown(e.get("institution", "")), + "degree": strip_markdown(e.get("degree", "")), + } + ) + + elif key in SKILLS_SECTION_NAMES: + for e in entries: + if not isinstance(e, dict) or "label" not in e: + continue + details = e.get("details", "") + if details: + gt["skills"].extend( + kw.strip() for kw in str(details).split(",") if kw.strip() + ) + + return gt diff --git a/scripts/ats_proof/corpus/baseline/minimal.yaml b/scripts/ats_proof/corpus/baseline/minimal.yaml new file mode 100644 index 00000000..0dd126ec --- /dev/null +++ b/scripts/ats_proof/corpus/baseline/minimal.yaml @@ -0,0 +1,26 @@ +# Test case: minimal +# Category: baseline +# Minimal resume: name, email, 1 job, 1 education +cv: + name: Bob Smith + email: bob.smith@email.com + sections: + experience: + - company: Acme Corp + position: Software Engineer + start_date: 2022-01 + end_date: present + location: New York, NY + highlights: + - Developed REST APIs serving 10K requests per second + education: + - institution: MIT + area: Computer Science + degree: BS + start_date: 2018-09 + end_date: 2022-05 + location: Cambridge, MA +design: + theme: classic +locale: + language: english diff --git a/scripts/ats_proof/corpus/baseline/standard_full.yaml b/scripts/ats_proof/corpus/baseline/standard_full.yaml new file mode 100644 index 00000000..966cb33f --- /dev/null +++ b/scripts/ats_proof/corpus/baseline/standard_full.yaml @@ -0,0 +1,81 @@ +# Test case: standard_full +# Category: baseline +# Complete resume with all common sections +cv: + name: Alice Chen + location: San Francisco, CA + email: alice.chen@email.com + phone: +1-415-555-0142 + website: https://alicechen.dev + social_networks: + - network: LinkedIn + username: alicechen + - network: GitHub + username: alicechen + sections: + summary: + - Senior software engineer with 8 years of experience in distributed systems and cloud infrastructure. Led teams of + 5-12 engineers at two Fortune 500 companies. + experience: + - company: Stripe + position: Staff Software Engineer + start_date: 2021-03 + end_date: present + location: San Francisco, CA + highlights: + - Designed and implemented real-time fraud detection pipeline processing 50M+ transactions daily with 99.99% + uptime + - Led migration of payment processing infrastructure from monolith to microservices, reducing deployment time by + 80% + - Mentored 6 engineers across 2 teams, with 3 promoted to senior level within 18 months + - company: Google + position: Senior Software Engineer + start_date: 2018-06 + end_date: 2021-02 + location: Mountain View, CA + highlights: + - Built distributed caching layer for Google Cloud Storage serving 100K+ QPS with sub-millisecond latency + - Reduced infrastructure costs by $2.4M annually through optimization of resource allocation algorithms + - Contributed to open-source gRPC framework with 40K+ GitHub stars + - company: Amazon Web Services + position: Software Development Engineer + start_date: 2016-07 + end_date: 2018-05 + location: Seattle, WA + highlights: + - Developed auto-scaling algorithms for EC2 fleet management across 3 AWS regions + - Implemented end-to-end monitoring dashboard used by 200+ internal teams + education: + - institution: Stanford University + area: Computer Science + degree: MS + start_date: 2014-09 + end_date: 2016-06 + location: Stanford, CA + highlights: + - 'Focus: Distributed Systems and Database Theory' + - institution: UC Berkeley + area: Electrical Engineering and Computer Science + degree: BS + start_date: 2010-08 + end_date: 2014-05 + location: Berkeley, CA + highlights: + - 'GPA: 3.92/4.00, Magna Cum Laude' + skills: + - label: Languages + details: Python, Go, Java, C++, TypeScript, SQL + - label: Infrastructure + details: Kubernetes, Docker, Terraform, AWS, GCP + - label: Databases + details: PostgreSQL, Redis, DynamoDB, BigQuery, Cassandra + - label: Tools + details: Git, CI/CD, Prometheus, Grafana, Datadog + certifications: + - bullet: AWS Solutions Architect Professional (2023) + - bullet: Google Cloud Professional Cloud Architect (2022) + - bullet: Certified Kubernetes Administrator (2021) +design: + theme: classic +locale: + language: english diff --git a/scripts/ats_proof/corpus/edge_cases/diacritics.yaml b/scripts/ats_proof/corpus/edge_cases/diacritics.yaml new file mode 100644 index 00000000..eb4d8d59 --- /dev/null +++ b/scripts/ats_proof/corpus/edge_cases/diacritics.yaml @@ -0,0 +1,55 @@ +# Test case: diacritics +# Category: edge_cases +# Names and locations with diacritics +cv: + name: Jose Garcia-Lopez + headline: Ingeniero de Software Senior + location: Barcelona, Spain + email: jose.garcia@email.com + phone: +34-612-345-678 + social_networks: + - network: LinkedIn + username: josegarcia + - network: GitHub + username: josegarcia + sections: + experience: + - company: Glovo + position: Senior Backend Engineer + start_date: 2021-03 + end_date: present + location: Barcelona, Spain + highlights: + - Architected microservices platform handling 5M+ daily orders across 25 countries + - Led migration from monolithic PHP application to Go microservices + - company: Banco Santander + position: Software Engineer + start_date: 2018-09 + end_date: 2021-02 + location: Madrid, Spain + highlights: + - Developed real-time transaction processing system for retail banking + education: + - institution: Universidad Politecnica de Madrid + area: Computer Science + degree: MS + start_date: 2016-09 + end_date: 2018-06 + location: Madrid, Spain + - institution: Universitat de Barcelona + area: Mathematics + degree: BS + start_date: 2012-09 + end_date: 2016-06 + location: Barcelona, Spain + highlights: + - 'GPA: 9.2/10.0, Cum Laude' + skills: + - label: Languages + details: Go, Python, Java, PHP, SQL + - label: Infrastructure + details: Kubernetes, Docker, AWS, Terraform +design: + theme: classic +locale: + language: english diff --git a/scripts/ats_proof/corpus/stress_tests/academic.yaml b/scripts/ats_proof/corpus/stress_tests/academic.yaml new file mode 100644 index 00000000..41ffe562 --- /dev/null +++ b/scripts/ats_proof/corpus/stress_tests/academic.yaml @@ -0,0 +1,123 @@ +# Test case: academic +# Category: stress_tests +# Full academic CV +cv: + name: Dr. Zhen Wei + headline: Associate Professor of Computer Science + location: Zurich, Switzerland + email: zhen.wei@email.com + website: https://zhenwei.ch + social_networks: + - network: Google Scholar + username: DEF456GHI + - network: ORCID + username: 0000-0003-9876-5432 + - network: GitHub + username: zhenwei + sections: + education: + - institution: ETH Zurich + area: Computer Science + degree: PhD + start_date: 2012-09 + end_date: 2017-03 + location: Zurich, Switzerland + highlights: + - 'Thesis: Scalable Graph Neural Networks for Molecular Discovery' + - institution: Tsinghua University + area: Computer Science + degree: MS + start_date: 2010-09 + end_date: 2012-07 + location: Beijing, China + - institution: Peking University + area: Mathematics + degree: BS + start_date: 2006-09 + end_date: 2010-07 + location: Beijing, China + highlights: + - 'GPA: 3.9/4.0' + academic_positions: + - company: ETH Zurich + position: Associate Professor + start_date: 2022-09 + end_date: present + location: Zurich, Switzerland + highlights: + - Lead Computational Science Lab with 12 PhD students and 4 postdocs + - 'Teaching: Advanced Machine Learning (300+ students), Graph Neural Networks (150+ students)' + - company: ETH Zurich + position: Assistant Professor + start_date: 2017-09 + end_date: 2022-08 + location: Zurich, Switzerland + highlights: + - Established research group, secured CHF 3M+ in competitive funding + - company: Stanford University + position: Postdoctoral Researcher + start_date: 2017-04 + end_date: 2017-08 + location: Stanford, CA + highlights: + - Collaborated with Prof. Jure Leskovec on graph representation learning + publications: + - title: Equivariant Graph Neural Networks for 3D Molecular Generation + authors: + - '*Zhen Wei*' + - Anna Mueller + journal: Nature Machine Intelligence + date: 2024-01 + doi: 10.1038/s42256-024-0001 + - title: Scalable Message Passing on Large Graphs via Stochastic Training + authors: + - '*Zhen Wei*' + - Li Zhang + - Marco Rossi + journal: ICML 2023 + date: 2023-07 + doi: 10.5555/icml.2023.001 + - title: Self-Supervised Pre-Training for Molecular Property Prediction + authors: + - David Kim + - '*Zhen Wei*' + journal: NeurIPS 2022 + date: 2022-12 + doi: 10.5555/neurips.2022.001 + - title: Geometric Deep Learning on Protein Surfaces + authors: + - '*Zhen Wei*' + - Sarah Johnson + journal: Science + date: 2022-06 + doi: 10.1126/science.2022 + - title: Graph Transformers with Spectral Attention + authors: + - '*Zhen Wei*' + journal: ICLR 2022 + date: 2022-04 + doi: 10.5555/iclr.2022.001 + grants: + - bullet: Swiss National Science Foundation Eccellenza Grant (CHF 1.8M, 2023-2028) + - bullet: ERC Starting Grant (EUR 1.5M, 2020-2025) + - bullet: ETH Research Grant (CHF 500K, 2018-2021) + awards: + - bullet: ELLIS Fellow (2023) + - bullet: MIT Technology Review Innovators Under 35 Europe (2022) + - bullet: ETH Zurich Latsis Prize for Outstanding Young Researcher (2021) + - bullet: ICML Best Paper Award (2019) + service: + - bullet: 'Area Chair: NeurIPS (2022, 2023, 2024), ICML (2023, 2024), ICLR (2024)' + - bullet: 'Associate Editor: IEEE TPAMI (2023-present)' + - bullet: 'Program Committee: KDD, AAAI, IJCAI, WWW' + skills: + - label: Languages + details: Python, C++, Julia, MATLAB + - label: ML Frameworks + details: PyTorch, JAX, PyG, DGL + - label: Scientific Computing + details: NumPy, SciPy, RDKit, Open Babel, GROMACS +design: + theme: classic +locale: + language: english diff --git a/scripts/ats_proof/evaluate.py b/scripts/ats_proof/evaluate.py new file mode 100644 index 00000000..60a15dc8 --- /dev/null +++ b/scripts/ats_proof/evaluate.py @@ -0,0 +1,398 @@ +"""Evaluate commercial parser results against ground truth from corpus YAML. + +Computes precision, recall, and F1 per field, generates summary tables +for the report. Ground truth is read directly from the corpus YAML files. +""" + +import re +from collections import defaultdict +from pathlib import Path + +from common import ( + ANALYSIS_DIR, + CORPUS_DIR, + RESULTS_DIR, + conformance_level, + load_ground_truth, + load_json, + write_json, +) + + +# --------------------------------------------------------------------------- +# Matching functions +# --------------------------------------------------------------------------- + + +def normalize(text: str) -> str: + """Normalize text for comparison.""" + text = text.lower().strip() + text = re.sub(r"\s+", " ", text) + text = re.sub(r"[^\w\s@.+\-/]", "", text) + return text + + +def exact_match(expected: str, extracted: str) -> float: + """Exact match after normalization.""" + return 1.0 if normalize(expected) == normalize(extracted) else 0.0 + + +def phone_match(expected: str, extracted: str) -> float: + """Compare phone numbers by digits only (ignore formatting).""" + expected_digits = re.sub(r"\D", "", expected) + extracted_digits = re.sub(r"\D", "", extracted) + if not expected_digits or not extracted_digits: + return 0.0 + # Match if one contains the other (country code may be stripped) + if expected_digits.endswith(extracted_digits) or extracted_digits.endswith( + expected_digits + ): + return 1.0 + return 1.0 if expected_digits == extracted_digits else 0.0 + + +def fuzzy_match(expected: str, extracted: str, threshold: float = 0.9) -> float: + """Token overlap match.""" + expected_tokens = set(normalize(expected).split()) + extracted_tokens = set(normalize(extracted).split()) + if not expected_tokens: + return 1.0 if not extracted_tokens else 0.0 + overlap = expected_tokens & extracted_tokens + precision = len(overlap) / len(extracted_tokens) if extracted_tokens else 0 + recall = len(overlap) / len(expected_tokens) + if precision + recall == 0: + return 0.0 + f1 = 2 * precision * recall / (precision + recall) + return f1 if f1 >= threshold else 0.0 + + +def date_match(expected: str, extracted: str) -> float: + """Normalize dates and compare (year-month only, handles 'present').""" + month_map = { + "jan": "01", + "feb": "02", + "mar": "03", + "apr": "04", + "may": "05", + "jun": "06", + "june": "06", + "jul": "07", + "july": "07", + "aug": "08", + "sep": "09", + "sept": "09", + "oct": "10", + "nov": "11", + "dec": "12", + } + + def parse(d: str) -> str: + d = d.strip().lower() + if not d or d == "present": + return "present" + for name, num in month_map.items(): + d = d.replace(name, num) + digits = re.findall(r"\d+", d) + # Keep only year and month (ignore day) + return "-".join(digits[:2]) if digits else d + + expected_parsed = parse(expected) + extracted_parsed = parse(extracted) + + # "present" in GT matches any current/recent date or "present" + if expected_parsed == "present": + return 1.0 + + return 1.0 if expected_parsed == extracted_parsed else 0.0 + + +def jaccard(expected: list[str], extracted: list[str]) -> float: + """Jaccard similarity for unordered lists.""" + a = {normalize(s) for s in expected} + b = {normalize(s) for s in extracted} + if not a and not b: + return 1.0 + if not a or not b: + return 0.0 + return len(a & b) / len(a | b) + + +# --------------------------------------------------------------------------- +# Field extraction from parser results +# --------------------------------------------------------------------------- + + +def extract_fields_edenai(result: dict, provider: str) -> dict: + """Extract standardized fields from Eden AI response.""" + data = result.get(provider, {}).get("extracted_data", {}) + if not data: + return {} + + fields: dict = {} + personal = data.get("personal_infos", {}) or {} + if personal.get("name", {}).get("raw_name"): + fields["name"] = personal["name"]["raw_name"] + if personal.get("phones"): + fields["phone"] = str(personal["phones"][0]) + if personal.get("mails"): + fields["email"] = personal["mails"][0] + address = personal.get("address") or {} + location = address.get("raw_input_location") or address.get("formatted_location") + if location: + fields["location"] = location + + fields["work"] = [] + for entry in data.get("work_experience", {}).get("entries", []) or []: + w: dict = {} + if entry.get("company"): + w["company"] = entry["company"] + if entry.get("title"): + w["position"] = entry["title"] + if entry.get("start_date"): + w["start_date"] = entry["start_date"] + if entry.get("end_date"): + w["end_date"] = entry["end_date"] + if w: + fields["work"].append(w) + + fields["education"] = [] + for entry in data.get("education", {}).get("entries", []) or []: + e: dict = {} + if entry.get("establishment"): + e["institution"] = entry["establishment"] + if entry.get("title"): + e["degree"] = entry["title"] + if e: + fields["education"].append(e) + + skills_raw = data.get("skills") or [] + if isinstance(skills_raw, dict): + skills_raw = skills_raw.get("entries", []) or [] + fields["skills"] = [ + s.get("name", "") for s in skills_raw if isinstance(s, dict) and s.get("name") + ] + return fields + + +# --------------------------------------------------------------------------- +# Scoring +# --------------------------------------------------------------------------- + + +def degree_match(expected: str, extracted: str) -> float: + """Match degree abbreviations to full names.""" + abbreviations: dict[str, set[str]] = { + "bs": {"bachelor", "bachelors", "b.s.", "bsc"}, + "ba": {"bachelor", "bachelors", "b.a."}, + "ms": {"master", "masters", "m.s.", "msc"}, + "ma": {"master", "masters", "m.a."}, + "phd": {"phd", "ph.d.", "doctor", "doctorate", "doctoral"}, + "mba": {"mba", "master"}, + "mph": {"mph", "master"}, + } + e_norm = normalize(expected) + x_norm = normalize(extracted) + if e_norm == x_norm: + return 1.0 + # Check if abbreviation matches full name + for abbrev, expansions in abbreviations.items(): + if e_norm == abbrev and any(exp in x_norm for exp in expansions): + return 1.0 + if x_norm == abbrev and any(exp in e_norm for exp in expansions): + return 1.0 + return fuzzy_match(expected, extracted) + + +def score_contact(gt: dict, extracted: dict) -> dict[str, float]: + """Score contact fields.""" + scores: dict[str, float] = {} + for field in ("name", "email"): + gt_val = gt.get(field, "") + ex_val = extracted.get(field, "") + if gt_val: + scores[field] = ( + fuzzy_match(gt_val, ex_val) + if field == "name" + else exact_match(gt_val, ex_val) + ) + if gt.get("phone"): + scores["phone"] = phone_match(gt["phone"], extracted.get("phone", "")) + if gt.get("location"): + # Lower threshold for location — parsers often add/remove country name + scores["location"] = fuzzy_match( + gt["location"], extracted.get("location", ""), threshold=0.7 + ) + return scores + + +def score_list_entries( + gt_entries: list[dict], + ex_entries: list[dict], + match_fields: list[tuple[str, str]], +) -> dict[str, float]: + """Score list entries (work/education) by best-match pairing. + + match_fields: list of (field_name, match_type) tuples where match_type + is "fuzzy" or "date". The same field name is used in both gt and extracted. + """ + scores: dict[str, list[float]] = defaultdict(list) + for gt_entry in gt_entries: + best: dict[str, float] = {} + for ex_entry in ex_entries: + match: dict[str, float] = {} + for field_name, match_type in match_fields: + if gt_entry.get(field_name): + gt_val = gt_entry[field_name] + ex_val = ex_entry.get(field_name, "") + if match_type == "fuzzy": + match[field_name] = fuzzy_match(gt_val, ex_val) + elif match_type == "date": + match[field_name] = date_match(gt_val, ex_val) + elif match_type == "degree": + match[field_name] = degree_match(gt_val, ex_val) + avg = sum(match.values()) / len(match) if match else 0 + best_avg = sum(best.values()) / len(best) if best else 0 + if avg > best_avg: + best = match + for field, score in best.items(): + scores[field].append(score) + return {f: sum(v) / len(v) for f, v in scores.items()} + + +def find_yaml_for_result(result_stem: str) -> Path | None: + """Find the corpus YAML corresponding to a commercial parser result file.""" + # Result files are named like: classic_baseline_standard_full.json + # We need the YAML stem, which is the part after theme_category_ + for subdir in sorted(CORPUS_DIR.iterdir()): + if not subdir.is_dir(): + continue + for yaml_path in subdir.glob("*.yaml"): + if yaml_path.stem in result_stem: + return yaml_path + return None + + +def evaluate_parser(results_dir: Path, parser_name: str) -> dict: + """Evaluate all results for a single parser against corpus YAML.""" + all_scores: dict[str, list[float]] = defaultdict(list) + result_files = sorted(results_dir.glob("*.json")) + if not result_files: + return {} + + for result_path in result_files: + yaml_path = find_yaml_for_result(result_path.stem) + if not yaml_path: + continue + + gt = load_ground_truth(yaml_path) + raw = load_json(result_path) + + if not parser_name.startswith("edenai_"): + continue + extracted = extract_fields_edenai(raw, parser_name.replace("edenai_", "")) + + if not extracted: + continue + + for field, score in score_contact(gt, extracted).items(): + all_scores[f"contact_{field}"].append(score) + + if gt["work"]: + work_scores = score_list_entries( + gt["work"], + extracted.get("work", []), + [ + ("company", "fuzzy"), + ("position", "fuzzy"), + ("start_date", "date"), + ("end_date", "date"), + ], + ) + for field, score in work_scores.items(): + all_scores[f"work_{field}"].append(score) + + if gt["education"]: + edu_scores = score_list_entries( + gt["education"], + extracted.get("education", []), + [("institution", "fuzzy"), ("degree", "degree")], + ) + for field, score in edu_scores.items(): + all_scores[f"edu_{field}"].append(score) + + if gt["skills"]: + all_scores["skills"].append( + jaccard(gt["skills"], extracted.get("skills", [])) + ) + + if not all_scores: + return {} + + averages = {f: sum(v) / len(v) for f, v in all_scores.items()} + all_values = list(averages.values()) + overall = sum(all_values) / len(all_values) if all_values else 0.0 + + return { + "parser": parser_name, + "per_field": averages, + "overall_f1": overall, + "num_evaluated": len(result_files), + } + + +def main() -> None: + ANALYSIS_DIR.mkdir(parents=True, exist_ok=True) + evaluations: list[dict] = [] + + parsers = [ + ("edenai_affinda", "commercial/edenai"), + ("edenai_extracta", "commercial/edenai"), + ("edenai_klippa", "commercial/edenai"), + ] + for parser_name, subdir in parsers: + parser_dir = RESULTS_DIR / subdir + if parser_dir.exists() and list(parser_dir.glob("*.json")): + result = evaluate_parser(parser_dir, parser_name) + if result: + evaluations.append(result) + print(f"{parser_name}: Overall F1 = {result['overall_f1']:.1%}") # noqa: T201 + + # Print local results for context + struct = load_json(RESULTS_DIR / "structural" / "structural_summary.json") + if struct: + print(f"\nStructural: {struct.get('pass_rate', 'N/A')} pass rate") # noqa: T201 + + extraction = load_json(RESULTS_DIR / "opensource" / "extraction_summary.json") + if extraction: + for name, stats in extraction.get("extractors", {}).items(): + print(f"Extraction ({name}): {stats['average_accuracy']}") # noqa: T201 + + # Build output + output: dict = {"evaluations": evaluations, "conformance_table": {}} + if evaluations: + all_fields: dict[str, list[float]] = defaultdict(list) + for e in evaluations: + for field, score in e["per_field"].items(): + all_fields[field].append(score) + + for field, scores in all_fields.items(): + avg = sum(scores) / len(scores) + output["conformance_table"][field] = { + "f1": avg, + "level": conformance_level(avg), + } + + overall_scores = [e["overall_f1"] for e in evaluations] + overall_avg = sum(overall_scores) / len(overall_scores) + output["overall"] = {"f1": overall_avg, "level": conformance_level(overall_avg)} + print(f"\nOverall F1: {overall_avg:.1%} ({conformance_level(overall_avg)})") # noqa: T201 + + if not evaluations: + print("\nNo commercial parser results. Run submit_commercial.py first.") # noqa: T201 + + write_json(ANALYSIS_DIR / "evaluation_results.json", output) + print(f"Results saved to {ANALYSIS_DIR / 'evaluation_results.json'}") # noqa: T201 + + +if __name__ == "__main__": + main() diff --git a/scripts/ats_proof/generate_report.py b/scripts/ats_proof/generate_report.py new file mode 100644 index 00000000..c1a4cb11 --- /dev/null +++ b/scripts/ats_proof/generate_report.py @@ -0,0 +1,118 @@ +"""Generate the ATS compatibility report from test results. + +Renders ats_compatibility.j2.md with Jinja2 and writes the output to +docs/ats_compatibility.md. +""" + +from pathlib import Path + +import jinja2 + +from common import ( + ANALYSIS_DIR, + RENDERED_DIR, + RESULTS_DIR, + THEMES, + load_json, +) + +SCRIPT_DIR: Path = Path(__file__).parent +REPO_ROOT: Path = SCRIPT_DIR.parent.parent +REPORT_OUTPUT: Path = REPO_ROOT / "docs" / "ats_compatibility.md" + +# Fields to show in the commercial parser table, with display names +REPORT_FIELDS: list[tuple[str, str]] = [ + ("contact_name", "Name"), + ("contact_email", "Email"), + ("contact_phone", "Phone"), + ("contact_location", "Location"), + ("work_company", "Company name"), + ("work_position", "Job title"), + ("work_start_date", "Start date"), + ("work_end_date", "End date"), + ("edu_institution", "Institution"), +] + +PARSER_DISPLAY_NAMES: dict[str, str] = { + "edenai_affinda": "affinda", + "edenai_extracta": "extracta", + "edenai_klippa": "klippa", +} + + +def f1_to_checkmark(f1: float) -> str: + """Convert an F1 score to a human-readable result.""" + if f1 >= 0.90: + return "Correct" + if f1 >= 0.50: + return "Partial" + return "Not extracted" + + +def build_context() -> dict: + """Load result files and build the Jinja2 template context.""" + eval_results = load_json(ANALYSIS_DIR / "evaluation_results.json") + struct_summary = load_json(RESULTS_DIR / "structural" / "structural_summary.json") + extraction_summary = load_json( + RESULTS_DIR / "opensource" / "extraction_summary.json" + ) + + has_commercial = bool(eval_results.get("evaluations")) + + # Build per-parser per-field scores + parser_scores: dict[str, dict[str, float]] = {} + for evaluation in eval_results.get("evaluations", []): + parser_name = evaluation["parser"] + parser_scores[parser_name] = evaluation["per_field"] + + # Build conformance field rows with per-parser results + conformance_fields: list[dict] = [] + if has_commercial: + for key, name in REPORT_FIELDS: + row: dict = {"name": name} + for parser_name, display_name in PARSER_DISPLAY_NAMES.items(): + f1 = parser_scores.get(parser_name, {}).get(key, 0) + row[display_name] = f1_to_checkmark(f1) + conformance_fields.append(row) + + # Build extractor rows + extractors: list[dict] = [ + {"name": name, **stats} + for name, stats in extraction_summary.get("extractors", {}).items() + ] + + total_pdfs = len(list(RENDERED_DIR.rglob("*.pdf"))) + num_themes = len(THEMES) + + return { + "total_pdfs": total_pdfs, + "num_themes": num_themes, + "num_cases": total_pdfs // num_themes if num_themes else total_pdfs, + "struct_passed": struct_summary.get("passed", 0), + "struct_total": struct_summary.get("total", 0), + "struct_rate": struct_summary.get("pass_rate", "N/A"), + "extractors": extractors, + "has_commercial": has_commercial, + "conformance_fields": conformance_fields, + } + + +def main() -> None: + env = jinja2.Environment( + loader=jinja2.FileSystemLoader(SCRIPT_DIR), + trim_blocks=True, + lstrip_blocks=True, + keep_trailing_newline=True, + ) + template = env.get_template("ats_compatibility.j2.md") + + context = build_context() + report = template.render(context) + + REPORT_OUTPUT.parent.mkdir(parents=True, exist_ok=True) + REPORT_OUTPUT.write_text(report, encoding="utf-8") + print(f"Report written to {REPORT_OUTPUT}") # noqa: T201 + + +if __name__ == "__main__": + main() diff --git a/scripts/ats_proof/pyproject.toml b/scripts/ats_proof/pyproject.toml new file mode 100644 index 00000000..78b14f28 --- /dev/null +++ b/scripts/ats_proof/pyproject.toml @@ -0,0 +1,14 @@ +[project] +name = "ats-proof" +version = "1.0.0" +description = "ATS compatibility testing suite for RenderCV" +requires-python = ">=3.12" +dependencies = [ + "rendercv[full]", + "ruamel-yaml>=0.18", + "pymupdf>=1.24", + "httpx>=0.27", +] + +[tool.uv.sources] +rendercv = { workspace = true } diff --git a/scripts/ats_proof/render_pdfs.py b/scripts/ats_proof/render_pdfs.py new file mode 100644 index 00000000..7c800cdb --- /dev/null +++ b/scripts/ats_proof/render_pdfs.py @@ -0,0 +1,66 @@ +"""Render all corpus YAML files across all RenderCV themes. + +For each YAML in corpus/, generates PDFs for all 5 themes using the +RenderCV Python API. Output goes to rendered/{theme}/{category}/{name}.pdf. +""" + +import sys +import tempfile +from pathlib import Path + +from rendercv.cli.render_command.progress_panel import ProgressPanel +from rendercv.cli.render_command.run_rendercv import run_rendercv + +from common import RENDERED_DIR, THEMES, find_corpus_yamls + + +def render_with_theme(yaml_path: Path, theme: str, output_dir: Path) -> Path | None: + """Render a corpus YAML with a given theme using the RenderCV API.""" + output_dir.mkdir(parents=True, exist_ok=True) + stem = yaml_path.stem + pdf_path = (output_dir / f"{stem}.pdf").resolve() + + with tempfile.TemporaryDirectory() as tmp: + run_rendercv( + yaml_path.resolve(), + progress=ProgressPanel(), + pdf_path=pdf_path, + typst_path=Path(tmp).resolve() / f"{stem}.typ", + dont_generate_html=True, + dont_generate_markdown=True, + dont_generate_png=True, + overrides={"design": {"theme": theme}}, + ) + + return pdf_path if pdf_path.exists() else None + + +def main() -> None: + yamls = find_corpus_yamls() + if not yamls: + print("No YAML files found in corpus/.") # noqa: T201 + sys.exit(1) + + total = len(yamls) * len(THEMES) + print(f"Rendering {len(yamls)} YAMLs x {len(THEMES)} themes = {total} PDFs...") # noqa: T201 + + success = 0 + failed = 0 + + for yaml_path in yamls: + category = yaml_path.parent.name + for theme in THEMES: + output_dir = RENDERED_DIR / theme / category + pdf = render_with_theme(yaml_path, theme, output_dir) + if pdf: + success += 1 + print(f" [{success}/{total}] {theme}/{category}/{yaml_path.stem}.pdf") # noqa: T201 + else: + failed += 1 + print(f" FAILED: {theme}/{category}/{yaml_path.stem}") # noqa: T201 + + print(f"\nDone. {success} succeeded, {failed} failed out of {total}.") # noqa: T201 + + +if __name__ == "__main__": + main() diff --git a/scripts/ats_proof/run_all.py b/scripts/ats_proof/run_all.py new file mode 100644 index 00000000..04ae0c72 --- /dev/null +++ b/scripts/ats_proof/run_all.py @@ -0,0 +1,70 @@ +"""Run the ATS compatibility test pipeline. + +Usage: + uv run python run_all.py # Local analysis (free, no API keys) + uv run python run_all.py --commercial # Also run commercial parsers (needs API keys) + uv run python run_all.py --full # All + evaluate + report +""" + +import argparse +import subprocess +import sys +from pathlib import Path + +SCRIPT_DIR: Path = Path(__file__).parent + +STEPS_LOCAL: list[tuple[str, str]] = [ + ("render_pdfs.py", "Render PDFs across all themes"), + ("analyze_pdfs.py", "Run structural + extraction analysis"), +] + +STEPS_COMMERCIAL: list[tuple[str, str]] = [ + ("submit_commercial.py", "Submit to commercial parsers"), +] + +STEPS_REPORT: list[tuple[str, str]] = [ + ("evaluate.py", "Evaluate results against ground truth"), + ("generate_report.py", "Generate report"), +] + + +def run_step(script: str, description: str) -> None: + """Run a script, exit on failure.""" + print(f"\n{'=' * 60}") # noqa: T201 + print(f" {description}") # noqa: T201 + print(f"{'=' * 60}") # noqa: T201 + result = subprocess.run( + [sys.executable, str(SCRIPT_DIR / script)], + check=False, + ) + if result.returncode != 0: + print(f"\nFAILED: {description}") # noqa: T201 + sys.exit(result.returncode) + + +def main() -> None: + parser = argparse.ArgumentParser(description="ATS compatibility test pipeline") + parser.add_argument( + "--commercial", action="store_true", help="Include commercial parsers" + ) + parser.add_argument( + "--full", action="store_true", help="Full pipeline including report" + ) + args = parser.parse_args() + + for script, desc in STEPS_LOCAL: + run_step(script, desc) + + if args.commercial or args.full: + for script, desc in STEPS_COMMERCIAL: + run_step(script, desc) + + if args.full: + for script, desc in STEPS_REPORT: + run_step(script, desc) + + print("\nDone.") # noqa: T201 + + +if __name__ == "__main__": + main() diff --git a/scripts/ats_proof/submit_commercial.py b/scripts/ats_proof/submit_commercial.py new file mode 100644 index 00000000..46c49c0f --- /dev/null +++ b/scripts/ats_proof/submit_commercial.py @@ -0,0 +1,97 @@ +"""Submit rendered PDFs to commercial resume parsers via Eden AI. + +Eden AI forwards PDFs to Affinda and HireAbility and returns both results. + +Requires environment variable: + EDENAI_API_KEY — from https://app.edenai.run/user/register +""" + +import os +import sys +import time +from pathlib import Path + +import httpx + +from common import RENDERED_DIR, RESULTS_DIR, find_rendered_pdfs, write_json + +RESULTS_SUBDIR: str = "commercial/edenai" +RATE_LIMIT_SECONDS: float = 2.0 + + +def submit(client: httpx.Client, pdf_path: Path) -> dict: + """Submit a PDF to Eden AI's resume parser (Affinda + HireAbility).""" + with pdf_path.open("rb") as f: + response = client.post( + "https://api.edenai.run/v2/ocr/resume_parser", + files={"file": (pdf_path.name, f, "application/pdf")}, + data={"providers": "affinda,extracta,klippa"}, + timeout=90, + ) + response.raise_for_status() + return response.json() + + +def main() -> None: + api_key = os.environ.get("EDENAI_API_KEY", "") + if not api_key: + print("EDENAI_API_KEY not set. Sign up at https://app.edenai.run/user/register") # noqa: T201 + print("and create an API key at https://app.edenai.run/admin/account/settings.") # noqa: T201 + print() # noqa: T201 + print("Then run:") # noqa: T201 + print(" EDENAI_API_KEY=your_key uv run python submit_commercial.py") # noqa: T201 + sys.exit(1) + + pdfs = find_rendered_pdfs() + if not pdfs: + print("No PDFs found. Run render_pdfs.py first.") # noqa: T201 + sys.exit(1) + + results_dir = RESULTS_DIR / RESULTS_SUBDIR + results_dir.mkdir(parents=True, exist_ok=True) + + client = httpx.Client( + headers={"Authorization": f"Bearer {api_key}"}, + timeout=120, + ) + + success = 0 + failed = 0 + + print(f"Submitting {len(pdfs)} PDFs to Eden AI (Affinda, Extracta, Klippa)...") # noqa: T201 + + try: + for i, pdf_path in enumerate(pdfs): + rel = pdf_path.relative_to(RENDERED_DIR) + cache_name = "_".join(rel.parts).replace(".pdf", "") + ".json" + output_path = results_dir / cache_name + + if output_path.exists(): + print(f" [{i + 1}/{len(pdfs)}] SKIP (cached): {rel}") # noqa: T201 + success += 1 + continue + + try: + result = submit(client, pdf_path) + write_json(output_path, result) + success += 1 + print(f" [{i + 1}/{len(pdfs)}] OK: {rel}") # noqa: T201 + time.sleep(RATE_LIMIT_SECONDS) + + except httpx.HTTPStatusError as e: + failed += 1 + print(f" [{i + 1}/{len(pdfs)}] FAIL ({e.response.status_code}): {rel}") # noqa: T201 + if e.response.status_code == 429: + print(" Rate limited. Waiting 30s...") # noqa: T201 + time.sleep(30) + except Exception as e: + failed += 1 + print(f" [{i + 1}/{len(pdfs)}] ERROR: {rel} - {e}") # noqa: T201 + finally: + client.close() + + print(f"\nDone. {success} succeeded, {failed} failed.") # noqa: T201 + + +if __name__ == "__main__": + main() diff --git a/uv.lock b/uv.lock index 09e38459..c172fb35 100644 --- a/uv.lock +++ b/uv.lock @@ -2,6 +2,12 @@ version = 1 revision = 3 requires-python = ">=3.12" +[manifest] +members = [ + "ats-proof", + "rendercv", +] + [[package]] name = "altgraph" version = "0.17.5" @@ -29,6 +35,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, ] +[[package]] +name = "anyio" +version = "4.12.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/96/f0/5eb65b2bb0d09ac6776f2eb54adee6abe8228ea05b20a5ad0e4945de8aac/anyio-4.12.1.tar.gz", hash = "sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703", size = 228685, upload-time = "2026-01-06T11:45:21.246Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" }, +] + +[[package]] +name = "ats-proof" +version = "1.0.0" +source = { virtual = "scripts/ats_proof" } +dependencies = [ + { name = "httpx" }, + { name = "pymupdf" }, + { name = "rendercv", extra = ["full"] }, + { name = "ruamel-yaml" }, +] + +[package.metadata] +requires-dist = [ + { name = "httpx", specifier = ">=0.27" }, + { name = "pymupdf", specifier = ">=1.24" }, + { name = "rendercv", extras = ["full"], editable = "." }, + { name = "ruamel-yaml", specifier = ">=0.18" }, +] + [[package]] name = "babel" version = "2.18.0" @@ -310,6 +348,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9c/83/3b1d03d36f224edded98e9affd0467630fc09d766c0e56fb1498cbb04a9b/griffe-1.15.0-py3-none-any.whl", hash = "sha256:6f6762661949411031f5fcda9593f586e6ce8340f0ba88921a0f2ef7a81eb9a3", size = 150705, upload-time = "2025-11-10T15:03:13.549Z" }, ] +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + [[package]] name = "hjson" version = "3.1.0" @@ -319,6 +366,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1f/7f/13cd798d180af4bf4c0ceddeefba2b864a63c71645abc0308b768d67bb81/hjson-3.1.0-py3-none-any.whl", hash = "sha256:65713cdcf13214fb554eb8b4ef803419733f4f5e551047c9b711098ab7186b89", size = 54018, upload-time = "2022-08-13T02:52:59.899Z" }, ] +[[package]] +name = "httpcore" +version = "1.0.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, +] + [[package]] name = "idna" version = "3.11"