Add 'auto' output type for best-effort PDF/A without Ghostscript

- Add new '--output-type auto' option (now the default) that produces best-effort PDF/A without requiring Ghostscript
- When verapdf is available, use speculative PDF/A conversion
- Without verapdf, pass through as PDF/A if safe (input claims PDF/A or --force-ocr was used), otherwise output as regular PDF
- Make Ghostscript check conditional - only required for pdfa* output types
- Update soft error tests to explicitly use --output-type pdfa since they exercise Ghostscript failure modes
- Fix Tesseract OSD error handling to check both stdout and stderr for known non-fatal messages like "Too few characters"
2026-05-24 22:46:07 -04:00 · 2026-01-09 00:56:00 -08:00
parent bdc50e9470
commit 0c4ee5af4e
7 changed files with 153 additions and 51 deletions
--- a/src/ocrmypdf/_exec/tesseract.py
+++ b/src/ocrmypdf/_exec/tesseract.py
@@ -202,9 +202,11 @@ def get_orientation(
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
+        # Check both stdout (e.output) and stderr for known non-fatal messages
+        all_output = (e.output or b'') + (e.stderr or b'')
        if (
-            b'Too few characters. Skipping this page' in e.output
-            or b'Image too large' in e.output
+            b'Too few characters. Skipping this page' in all_output
+            or b'Image too large' in all_output
        ):
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e
--- a/src/ocrmypdf/_options.py
+++ b/src/ocrmypdf/_options.py
@@ -88,7 +88,7 @@ class OCROptions(BaseModel):

    # Core OCR options
    languages: list[str] = Field(default_factory=lambda: [DEFAULT_LANGUAGE])
-    output_type: str = 'pdfa'
+    output_type: str = 'auto'
    force_ocr: bool = False
    skip_text: bool = False
    redo_ocr: bool = False
@@ -190,7 +190,7 @@ class OCROptions(BaseModel):
    @classmethod
    def validate_output_type(cls, v):
        """Validate output type is one of the allowed values."""
-        valid_types = {'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'}
+        valid_types = {'auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'}
        if v not in valid_types:
            raise ValueError(f"output_type must be one of {valid_types}")
        return v
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -36,7 +36,11 @@ from ocrmypdf.exceptions import (
    UnsupportedImageFormatError,
 )
 from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution, safe_symlink
-from ocrmypdf.pdfa import generate_pdfa_ps, speculative_pdfa_conversion
+from ocrmypdf.pdfa import (
+    file_claims_pdfa,
+    generate_pdfa_ps,
+    speculative_pdfa_conversion,
+)
 from ocrmypdf.pdfinfo import Colorspace, Encoding, FloatRect, PageInfo, PdfInfo
 from ocrmypdf.pluginspec import OrientationConfidence

@@ -991,6 +995,73 @@ def try_speculative_pdfa(input_pdf: Path, context: PdfContext) -> Path | None:
        return None


+def try_auto_pdfa(input_pdf: Path, context: PdfContext) -> tuple[Path, str]:
+    """Best-effort PDF/A for 'auto' output type.
+
+    This function attempts to produce PDF/A without requiring Ghostscript:
+    1. If verapdf is available, tries speculative conversion with validation;
+       when that yields no result, the file is returned as regular PDF
+    2. Without verapdf, passes the file through as PDF/A if that is
+       considered safe (the file already claims PDF/A, or force-ocr was used)
+    3. Otherwise falls back to regular PDF
+
+    Args:
+        input_pdf: Path to the PDF to convert
+        context: The PDF context
+
+    Returns:
+        Tuple of (output_path, actual_output_type) where actual_output_type
+        is 'pdfa' if PDF/A was achieved, 'pdf' otherwise. output_path is
+        input_pdf itself unless speculative conversion produced a new file.
+    """
+    from ocrmypdf._exec import verapdf
+
+    # If verapdf available, try speculative conversion with validation
+    if verapdf.available():
+        result = try_speculative_pdfa(input_pdf, context)
+        if result is not None:
+            return (result, 'pdfa')
+        # Speculative conversion gave no usable result. Return regular PDF
+        # here rather than attempting the unvalidated pass-through below.
+        log.info(
+            'Auto mode: speculative PDF/A validation failed, outputting regular PDF'
+        )
+        return (input_pdf, 'pdf')
+
+    # Without verapdf, check if we can pass through as PDF/A
+    if _is_safe_pdfa(input_pdf, context.options):
+        # The file is returned unmodified. The message names both conditions
+        # because --force-ocr alone is enough to reach this branch.
+        log.info(
+            'Auto mode: passing through as PDF/A '
+            '(input claims PDF/A or --force-ocr was used)'
+        )
+        return (input_pdf, 'pdfa')
+
+    # Neither validation nor a safe pass-through is possible: regular PDF
+    log.info('Auto mode: no verapdf available and input is not PDF/A, outputting PDF')
+    return (input_pdf, 'pdf')
+
+
+def _is_safe_pdfa(input_pdf: Path, options) -> bool:
+    """Decide whether a file may be labelled PDF/A without validating it.
+
+    Two situations are treated as safe:
+    1. The file at ``input_pdf`` already claims PDF/A, as reported by
+       ``file_claims_pdfa`` (a claim, not a validation result)
+    2. ``options.force_ocr`` is set
+
+    Args:
+        input_pdf: Path to the PDF to check
+        options: OCR options; only ``force_ocr`` is read
+
+    Returns:
+        True if either condition holds, False otherwise
+    """
+    # The PDF/A claim is evaluated first; force_ocr is only consulted when
+    # the file does not claim PDF/A.
+    claims_pdfa = bool(file_claims_pdfa(input_pdf)['pass'])
+    return claims_pdfa or bool(options.force_ocr)
+
+
 def should_linearize(working_file: Path, context: PdfContext) -> bool:
    """Determine whether the PDF should be linearized.

--- a/src/ocrmypdf/_pipelines/_common.py
+++ b/src/ocrmypdf/_pipelines/_common.py
@@ -45,6 +45,7 @@ from ocrmypdf._pipeline import (
    rasterize_preview,
    should_linearize,
    should_visible_page_image_use_jpg,
+    try_auto_pdfa,
    try_speculative_pdfa,
 )
 from ocrmypdf._plugin_manager import OcrmypdfPluginManager
@@ -469,8 +470,13 @@ def postprocess(
            pdf_out = fix_annots
        else:
            pdf_out = pdf_file
-    if context.options.output_type.startswith('pdfa'):
-        # Try speculative PDF/A conversion first (fast path using pikepdf + verapdf)
+    if context.options.output_type == 'auto':
+        # Best effort PDF/A - never uses Ghostscript
+        pdf_out, actual_type = try_auto_pdfa(pdf_out, context)
+        # Store actual output type for reporting
+        context.options.extra_attrs['_actual_output_type'] = actual_type
+    elif context.options.output_type.startswith('pdfa'):
+        # Required PDF/A - uses Ghostscript as fallback
        speculative_result = try_speculative_pdfa(pdf_out, context)
        if speculative_result is not None:
            pdf_out = speculative_result
@@ -495,7 +501,22 @@ def report_output_pdf(options, start_input_file, optimize_messages) -> ExitCode:
    elif samefile(options.output_file, Path(os.devnull)):
        pass  # Say nothing when sending to dev null
    else:
-        if options.output_type.startswith('pdfa'):
+        if options.output_type == 'auto':
+            # For 'auto' mode, check what we actually produced
+            actual_type = options.extra_attrs.get('_actual_output_type', 'pdf')
+            pdfa_info = file_claims_pdfa(options.output_file)
+            if actual_type == 'pdfa' and pdfa_info['pass']:
+                log.info(
+                    "Output file is a %s (auto mode achieved PDF/A)",
+                    pdfa_info['conformance'],
+                )
+            elif pdfa_info['pass']:
+                # Unexpectedly got PDF/A
+                log.info("Output file is a %s", pdfa_info['conformance'])
+            else:
+                # Regular PDF - this is expected for auto mode fallback
+                log.info("Output file is a PDF (auto mode)")
+        elif options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                log.info("Output file is a %s (as expected)", pdfa_info['conformance'])
--- a/src/ocrmypdf/builtin_plugins/ghostscript.py
+++ b/src/ocrmypdf/builtin_plugins/ghostscript.py
@@ -101,38 +101,41 @@ def add_options(parser):
@hookimpl
 def check_options(options):
    """Check that the options are valid for this plugin."""
-    check_external_program(
-        program='gs',
-        package='ghostscript',
-        version_checker=ghostscript.version,
-        need_version='9.54',  # RHEL 9's version; Ubuntu 22.04 has 9.55
-    )
-    gs_version = ghostscript.version()
-    if gs_version in BLACKLISTED_GS_VERSIONS:
-        raise MissingDependencyError(
-            f"Ghostscript {gs_version} contains serious regressions and is not "
-            "supported. Please upgrade to a newer version."
+    # Only require Ghostscript for pdfa* output types (not 'auto' or 'pdf')
+    # 'auto' mode uses best-effort PDF/A without Ghostscript fallback
+    if options.output_type.startswith('pdfa'):
+        check_external_program(
+            program='gs',
+            package='ghostscript',
+            version_checker=ghostscript.version,
+            need_version='9.54',  # RHEL 9's version; Ubuntu 22.04 has 9.55
        )
-    if Version('10.0.0') <= gs_version < Version('10.02.1') and (
-        options.skip_text or options.redo_ocr
-    ):
-        raise MissingDependencyError(
-            f"Ghostscript 10.0.0 through 10.02.0 (your version: {gs_version}) "
-            "contain serious regressions that corrupt PDFs with existing text, "
-            "such as those processed using --skip-text or --redo-ocr. "
-            "Please upgrade to a "
-            "newer version, or use --output-type pdf to avoid Ghostscript, or "
-            "use --force-ocr to discard existing text."
-        )
-    if gs_version >= Version('10.6.0') and options.output_type.startswith('pdfa'):
-        log.warning(
-            "Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
-            "images. OCRmyPDF will attempt to mitigate, but this version is "
-            "strongly not recommended. Please upgrade to a newer version. "
-            "As of 2025-12, 10.6.0 is the latest version of Ghostscript."
-        )
-    if options.output_type == 'pdfa':
-        options.output_type = 'pdfa-2'
+        gs_version = ghostscript.version()
+        if gs_version in BLACKLISTED_GS_VERSIONS:
+            raise MissingDependencyError(
+                f"Ghostscript {gs_version} contains serious regressions and is not "
+                "supported. Please upgrade to a newer version."
+            )
+        if Version('10.0.0') <= gs_version < Version('10.02.1') and (
+            options.skip_text or options.redo_ocr
+        ):
+            raise MissingDependencyError(
+                f"Ghostscript 10.0.0 through 10.02.0 (your version: {gs_version}) "
+                "contain serious regressions that corrupt PDFs with existing text, "
+                "such as those processed using --skip-text or --redo-ocr. "
+                "Please upgrade to a "
+                "newer version, or use --output-type pdf to avoid Ghostscript, or "
+                "use --force-ocr to discard existing text."
+            )
+        if gs_version >= Version('10.6.0'):
+            log.warning(
+                "Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
+                "images. OCRmyPDF will attempt to mitigate, but this version is "
+                "strongly not recommended. Please upgrade to a newer version. "
+                "As of 2025-12, 10.6.0 is the latest version of Ghostscript."
+            )
+        if options.output_type == 'pdfa':
+            options.output_type = 'pdfa-2'

    if (
        options.ghostscript.color_conversion_strategy
@@ -144,11 +147,11 @@ def check_options(options):
        )
    if (
        options.ghostscript.pdfa_image_compression != 'auto'
-        and not options.output_type.startswith('pdfa')
+        and options.output_type not in ('auto', 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3')
    ):
        log.warning(
            "--pdfa-image-compression argument only applies when "
-            "--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
+            "--output-type is 'auto' or one of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'"
        )


--- a/src/ocrmypdf/cli.py
+++ b/src/ocrmypdf/cli.py
@@ -159,16 +159,17 @@ Online documentation is located at:
    )
    parser.add_argument(
        '--output-type',
-        choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'],
-        default='pdfa',
-        help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
-        "long term archiving (default, recommended) but may not suitable "
-        "for users who want their file altered as little as possible. 'pdfa' "
-        "also has problems with full Unicode text. 'pdf' minimizes changes "
-        "to the input file. 'pdf-a1' creates a "
-        "PDF/A-1b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
-        "PDF/A-3b file. 'none' will produce no output, which may be helpful if "
-        "only the --sidecar is desired.",
+        choices=['auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'],
+        default='auto',
+        help="Choose output type. 'auto' (default) produces best-effort PDF/A "
+        "without requiring Ghostscript - uses verapdf validation when available, "
+        "otherwise passes through as PDF/A if safe (input already PDF/A or "
+        "force-ocr was used), or falls back to regular PDF. 'pdfa' creates a "
+        "PDF/A-2b compliant file for long term archiving (requires Ghostscript "
+        "as fallback). 'pdf' minimizes changes to the input file. 'pdfa-1' "
+        "creates a PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' "
+        "creates a PDF/A-3b file. 'none' will produce no output, which may be "
+        "helpful if only the --sidecar is desired.",
    )

    # Use null string '\0' as sentinel to indicate the user supplied no argument,
--- a/tests/test_soft_error.py
+++ b/tests/test_soft_error.py
@@ -41,6 +41,8 @@ def test_render_continue_on_soft_error(resources, outpdf):
    exitcode = run_ocrmypdf_api(
        resources / 'francais.pdf',
        outpdf,
+        '--output-type',
+        'pdfa',  # Required to trigger Ghostscript PDF/A generation
        '--continue-on-soft-render-error',
        '--plugin',
        'tests/plugins/tesseract_noop.py',
@@ -55,6 +57,8 @@ def test_render_stop_on_soft_error(resources, outpdf):
    exitcode = run_ocrmypdf_api(
        resources / 'francais.pdf',
        outpdf,
+        '--output-type',
+        'pdfa',  # Required to trigger Ghostscript PDF/A generation
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        '--plugin',