From 0c4ee5af4e85ee06f27f3b40bdb6ace5d0785ada Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Fri, 9 Jan 2026 00:56:00 -0800
Subject: [PATCH] Add 'auto' output type for best-effort PDF/A without Ghostscript

- Add new '--output-type auto' option (now the default) that produces
  best-effort PDF/A without requiring Ghostscript
- When verapdf is available, use speculative PDF/A conversion
- Without verapdf, pass through as PDF/A if safe (input claims PDF/A or
  --force-ocr was used), otherwise output as regular PDF
- Make Ghostscript check conditional - only required for pdfa* output types
- Update soft error tests to explicitly use --output-type pdfa since they
  exercise Ghostscript failure modes
- Fix Tesseract OSD error handling to check both stdout and stderr for
  known non-fatal messages like "Too few characters"
---
 src/ocrmypdf/_exec/tesseract.py             |  6 +-
 src/ocrmypdf/_options.py                    |  4 +-
 src/ocrmypdf/_pipeline.py                   | 73 ++++++++++++++++++++-
 src/ocrmypdf/_pipelines/_common.py          | 27 +++++++-
 src/ocrmypdf/builtin_plugins/ghostscript.py | 69 +++++++++----------
 src/ocrmypdf/cli.py                         | 21 +++---
 tests/test_soft_error.py                    |  4 ++
 7 files changed, 153 insertions(+), 51 deletions(-)

diff --git a/src/ocrmypdf/_exec/tesseract.py b/src/ocrmypdf/_exec/tesseract.py
index 47d31e3e..87109dca 100644
--- a/src/ocrmypdf/_exec/tesseract.py
+++ b/src/ocrmypdf/_exec/tesseract.py
@@ -202,9 +202,11 @@ def get_orientation(
     except CalledProcessError as e:
         tesseract_log_output(e.stdout)
         tesseract_log_output(e.stderr)
+        # Check both stdout (e.output) and stderr for known non-fatal messages
+        all_output = (e.output or b'') + (e.stderr or b'')
         if (
-            b'Too few characters. Skipping this page' in e.output
-            or b'Image too large' in e.output
+            b'Too few characters. Skipping this page' in all_output
+            or b'Image too large' in all_output
         ):
             return OrientationConfidence(0, 0)
         raise SubprocessOutputError() from e
diff --git a/src/ocrmypdf/_options.py b/src/ocrmypdf/_options.py
index 338a7736..aec663fb 100644
--- a/src/ocrmypdf/_options.py
+++ b/src/ocrmypdf/_options.py
@@ -88,7 +88,7 @@ class OCROptions(BaseModel):
 
     # Core OCR options
     languages: list[str] = Field(default_factory=lambda: [DEFAULT_LANGUAGE])
-    output_type: str = 'pdfa'
+    output_type: str = 'auto'
     force_ocr: bool = False
     skip_text: bool = False
     redo_ocr: bool = False
@@ -190,7 +190,7 @@ class OCROptions(BaseModel):
     @classmethod
     def validate_output_type(cls, v):
         """Validate output type is one of the allowed values."""
-        valid_types = {'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'}
+        valid_types = {'auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'}
         if v not in valid_types:
             raise ValueError(f"output_type must be one of {valid_types}")
         return v
diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
index 56b6e788..33781a93 100644
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -36,7 +36,11 @@ from ocrmypdf.exceptions import (
     UnsupportedImageFormatError,
 )
 from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution, safe_symlink
-from ocrmypdf.pdfa import generate_pdfa_ps, speculative_pdfa_conversion
+from ocrmypdf.pdfa import (
+    file_claims_pdfa,
+    generate_pdfa_ps,
+    speculative_pdfa_conversion,
+)
 from ocrmypdf.pdfinfo import Colorspace, Encoding, FloatRect, PageInfo, PdfInfo
 from ocrmypdf.pluginspec import OrientationConfidence
 
@@ -991,6 +995,73 @@ def try_speculative_pdfa(input_pdf: Path, context: PdfContext) -> Path | None:
     return None
 
 
+def try_auto_pdfa(input_pdf: Path, context: PdfContext) -> tuple[Path, str]:
+    """Best-effort PDF/A for 'auto' output type.
+
+    This function attempts to produce PDF/A without requiring Ghostscript:
+    1. If verapdf is available, tries speculative conversion with validation
+    2. Without verapdf, passes through as PDF/A if safe (input already PDF/A
+       or force-ocr was used)
+    3. Falls back to regular PDF if neither condition is met
+
+    Args:
+        input_pdf: Path to the PDF to convert
+        context: The PDF context
+
+    Returns:
+        Tuple of (output_path, actual_output_type) where actual_output_type
+        is 'pdfa' if PDF/A was achieved, 'pdf' otherwise
+    """
+    from ocrmypdf._exec import verapdf
+
+    # If verapdf available, try speculative conversion with validation
+    if verapdf.available():
+        result = try_speculative_pdfa(input_pdf, context)
+        if result is not None:
+            return (result, 'pdfa')
+        # verapdf validation failed - fall through to regular PDF
+        log.info(
+            'Auto mode: speculative PDF/A validation failed, outputting regular PDF'
+        )
+        return (input_pdf, 'pdf')
+
+    # Without verapdf, check if we can pass through as PDF/A
+    if _is_safe_pdfa(input_pdf, context.options):
+        # Pass through as-is (no modifications needed)
+        log.info('Auto mode: passing through as PDF/A (input already compliant)')
+        return (input_pdf, 'pdfa')
+
+    # Fall through to regular PDF
+    log.info('Auto mode: no verapdf available and input is not PDF/A, outputting PDF')
+    return (input_pdf, 'pdf')
+
+
+def _is_safe_pdfa(input_pdf: Path, options) -> bool:
+    """Check if file can be considered PDF/A without validation.
+
+    These are cases where our modifications don't break PDF/A compliance:
+    1. Input already claims PDF/A (we just grafted OCR text onto it)
+    2. We used force-ocr (we rewrote the entire PDF from scratch)
+
+    Args:
+        input_pdf: Path to the PDF to check
+        options: OCR options
+
+    Returns:
+        True if file can safely be considered PDF/A
+    """
+    # Safe if input already claims PDF/A
+    pdfa_status = file_claims_pdfa(input_pdf)
+    if pdfa_status['pass']:
+        return True
+
+    # Safe if we rewrote the PDF with force-ocr
+    if options.force_ocr:
+        return True
+
+    return False
+
+
 def should_linearize(working_file: Path, context: PdfContext) -> bool:
     """Determine whether the PDF should be linearized.
 
diff --git a/src/ocrmypdf/_pipelines/_common.py b/src/ocrmypdf/_pipelines/_common.py
index 6a6f40a2..7df92ca7 100644
--- a/src/ocrmypdf/_pipelines/_common.py
+++ b/src/ocrmypdf/_pipelines/_common.py
@@ -45,6 +45,7 @@ from ocrmypdf._pipeline import (
     rasterize_preview,
     should_linearize,
     should_visible_page_image_use_jpg,
+    try_auto_pdfa,
     try_speculative_pdfa,
 )
 from ocrmypdf._plugin_manager import OcrmypdfPluginManager
@@ -469,8 +470,13 @@ def postprocess(
             pdf_out = fix_annots
         else:
             pdf_out = pdf_file
-    if context.options.output_type.startswith('pdfa'):
-        # Try speculative PDF/A conversion first (fast path using pikepdf + verapdf)
+    if context.options.output_type == 'auto':
+        # Best effort PDF/A - never uses Ghostscript
+        pdf_out, actual_type = try_auto_pdfa(pdf_out, context)
+        # Store actual output type for reporting
+        context.options.extra_attrs['_actual_output_type'] = actual_type
+    elif context.options.output_type.startswith('pdfa'):
+        # Required PDF/A - uses Ghostscript as fallback
         speculative_result = try_speculative_pdfa(pdf_out, context)
         if speculative_result is not None:
             pdf_out = speculative_result
@@ -495,7 +501,22 @@ def report_output_pdf(options, start_input_file, optimize_messages) -> ExitCode:
     elif samefile(options.output_file, Path(os.devnull)):
         pass  # Say nothing when sending to dev null
     else:
-        if options.output_type.startswith('pdfa'):
+        if options.output_type == 'auto':
+            # For 'auto' mode, check what we actually produced
+            actual_type = options.extra_attrs.get('_actual_output_type', 'pdf')
+            pdfa_info = file_claims_pdfa(options.output_file)
+            if actual_type == 'pdfa' and pdfa_info['pass']:
+                log.info(
+                    "Output file is a %s (auto mode achieved PDF/A)",
+                    pdfa_info['conformance'],
+                )
+            elif pdfa_info['pass']:
+                # Unexpectedly got PDF/A
+                log.info("Output file is a %s", pdfa_info['conformance'])
+            else:
+                # Regular PDF - this is expected for auto mode fallback
+                log.info("Output file is a PDF (auto mode)")
+        elif options.output_type.startswith('pdfa'):
             pdfa_info = file_claims_pdfa(options.output_file)
             if pdfa_info['pass']:
                 log.info("Output file is a %s (as expected)", pdfa_info['conformance'])
diff --git a/src/ocrmypdf/builtin_plugins/ghostscript.py b/src/ocrmypdf/builtin_plugins/ghostscript.py
index ea915773..40abf682 100644
--- a/src/ocrmypdf/builtin_plugins/ghostscript.py
+++ b/src/ocrmypdf/builtin_plugins/ghostscript.py
@@ -101,38 +101,41 @@ def add_options(parser):
 @hookimpl
 def check_options(options):
     """Check that the options are valid for this plugin."""
-    check_external_program(
-        program='gs',
-        package='ghostscript',
-        version_checker=ghostscript.version,
-        need_version='9.54',  # RHEL 9's version; Ubuntu 22.04 has 9.55
-    )
-    gs_version = ghostscript.version()
-    if gs_version in BLACKLISTED_GS_VERSIONS:
-        raise MissingDependencyError(
-            f"Ghostscript {gs_version} contains serious regressions and is not "
-            "supported. Please upgrade to a newer version."
+    # Only require Ghostscript for pdfa* output types (not 'auto' or 'pdf')
+    # 'auto' mode uses best-effort PDF/A without Ghostscript fallback
+    if options.output_type.startswith('pdfa'):
+        check_external_program(
+            program='gs',
+            package='ghostscript',
+            version_checker=ghostscript.version,
+            need_version='9.54',  # RHEL 9's version; Ubuntu 22.04 has 9.55
         )
-    if Version('10.0.0') <= gs_version < Version('10.02.1') and (
-        options.skip_text or options.redo_ocr
-    ):
-        raise MissingDependencyError(
-            f"Ghostscript 10.0.0 through 10.02.0 (your version: {gs_version}) "
-            "contain serious regressions that corrupt PDFs with existing text, "
-            "such as those processed using --skip-text or --redo-ocr. "
-            "Please upgrade to a "
-            "newer version, or use --output-type pdf to avoid Ghostscript, or "
-            "use --force-ocr to discard existing text."
-        )
-    if gs_version >= Version('10.6.0') and options.output_type.startswith('pdfa'):
-        log.warning(
-            "Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
-            "images. OCRmyPDF will attempt to mitigate, but this version is "
-            "strongly not recommended. Please upgrade to a newer version. "
-            "As of 2025-12, 10.6.0 is the latest version of Ghostscript."
-        )
-    if options.output_type == 'pdfa':
-        options.output_type = 'pdfa-2'
+        gs_version = ghostscript.version()
+        if gs_version in BLACKLISTED_GS_VERSIONS:
+            raise MissingDependencyError(
+                f"Ghostscript {gs_version} contains serious regressions and is not "
+                "supported. Please upgrade to a newer version."
+            )
+        if Version('10.0.0') <= gs_version < Version('10.02.1') and (
+            options.skip_text or options.redo_ocr
+        ):
+            raise MissingDependencyError(
+                f"Ghostscript 10.0.0 through 10.02.0 (your version: {gs_version}) "
+                "contain serious regressions that corrupt PDFs with existing text, "
+                "such as those processed using --skip-text or --redo-ocr. "
+                "Please upgrade to a "
+                "newer version, or use --output-type pdf to avoid Ghostscript, or "
+                "use --force-ocr to discard existing text."
+            )
+        if gs_version >= Version('10.6.0'):
+            log.warning(
+                "Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
+                "images. OCRmyPDF will attempt to mitigate, but this version is "
+                "strongly not recommended. Please upgrade to a newer version. "
+                "As of 2025-12, 10.6.0 is the latest version of Ghostscript."
+            )
+        if options.output_type == 'pdfa':
+            options.output_type = 'pdfa-2'
 
     if (
         options.ghostscript.color_conversion_strategy
@@ -144,11 +147,11 @@ def check_options(options):
         )
     if (
         options.ghostscript.pdfa_image_compression != 'auto'
-        and not options.output_type.startswith('pdfa')
+        and options.output_type not in ('auto', 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3')
     ):
         log.warning(
             "--pdfa-image-compression argument only applies when "
-            "--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
+            "--output-type is 'auto' or one of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'"
         )
 
 
diff --git a/src/ocrmypdf/cli.py b/src/ocrmypdf/cli.py
index 74b34fdc..97ca156a 100644
--- a/src/ocrmypdf/cli.py
+++ b/src/ocrmypdf/cli.py
@@ -159,16 +159,17 @@ Online documentation is located at:
     )
     parser.add_argument(
         '--output-type',
-        choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'],
-        default='pdfa',
-        help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
-        "long term archiving (default, recommended) but may not suitable "
-        "for users who want their file altered as little as possible. 'pdfa' "
-        "also has problems with full Unicode text. 'pdf' minimizes changes "
-        "to the input file. 'pdf-a1' creates a "
-        "PDF/A-1b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
-        "PDF/A-3b file. 'none' will produce no output, which may be helpful if "
-        "only the --sidecar is desired.",
+        choices=['auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'],
+        default='auto',
+        help="Choose output type. 'auto' (default) produces best-effort PDF/A "
+        "without requiring Ghostscript - uses verapdf validation when available, "
+        "otherwise passes through as PDF/A if safe (input already PDF/A or "
+        "force-ocr was used), or falls back to regular PDF. 'pdfa' creates a "
+        "PDF/A-2b compliant file for long term archiving (requires Ghostscript "
+        "as fallback). 'pdf' minimizes changes to the input file. 'pdfa-1' "
+        "creates a PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' "
+        "creates a PDF/A-3b file. 'none' will produce no output, which may be "
+        "helpful if only the --sidecar is desired.",
     )
 
     # Use null string '\0' as sentinel to indicate the user supplied no argument,
diff --git a/tests/test_soft_error.py b/tests/test_soft_error.py
index 01138c8c..5f5291f3 100644
--- a/tests/test_soft_error.py
+++ b/tests/test_soft_error.py
@@ -41,6 +41,8 @@ def test_render_continue_on_soft_error(resources, outpdf):
     exitcode = run_ocrmypdf_api(
         resources / 'francais.pdf',
         outpdf,
+        '--output-type',
+        'pdfa',  # Required to trigger Ghostscript PDF/A generation
         '--continue-on-soft-render-error',
         '--plugin',
         'tests/plugins/tesseract_noop.py',
@@ -55,6 +57,8 @@ def test_render_stop_on_soft_error(resources, outpdf):
     exitcode = run_ocrmypdf_api(
         resources / 'francais.pdf',
         outpdf,
+        '--output-type',
+        'pdfa',  # Required to trigger Ghostscript PDF/A generation
         '--plugin',
         'tests/plugins/tesseract_noop.py',
         '--plugin',