Add 'auto' output type for best-effort PDF/A without Ghostscript

- Add new '--output-type auto' option (now the default) that produces
  best-effort PDF/A without requiring Ghostscript
- When verapdf is available, use speculative PDF/A conversion
- Without verapdf, pass through as PDF/A if safe (input claims PDF/A
  or --force-ocr was used), otherwise output as regular PDF
- Make Ghostscript check conditional - only required for pdfa* output types
- Update soft error tests to explicitly use --output-type pdfa since they
  exercise Ghostscript failure modes
- Fix Tesseract OSD error handling to check both stdout and stderr for
  known non-fatal messages like "Too few characters"
This commit is contained in:
James R. Barlow
2026-01-09 00:56:00 -08:00
parent bdc50e9470
commit 0c4ee5af4e
7 changed files with 153 additions and 51 deletions

View File

@@ -202,9 +202,11 @@ def get_orientation(
except CalledProcessError as e:
tesseract_log_output(e.stdout)
tesseract_log_output(e.stderr)
# Check both stdout (e.output) and stderr for known non-fatal messages
all_output = (e.output or b'') + (e.stderr or b'')
if (
b'Too few characters. Skipping this page' in e.output
or b'Image too large' in e.output
b'Too few characters. Skipping this page' in all_output
or b'Image too large' in all_output
):
return OrientationConfidence(0, 0)
raise SubprocessOutputError() from e

View File

@@ -88,7 +88,7 @@ class OCROptions(BaseModel):
# Core OCR options
languages: list[str] = Field(default_factory=lambda: [DEFAULT_LANGUAGE])
output_type: str = 'pdfa'
output_type: str = 'auto'
force_ocr: bool = False
skip_text: bool = False
redo_ocr: bool = False
@@ -190,7 +190,7 @@ class OCROptions(BaseModel):
@classmethod
def validate_output_type(cls, v):
"""Validate output type is one of the allowed values."""
valid_types = {'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'}
valid_types = {'auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'}
if v not in valid_types:
raise ValueError(f"output_type must be one of {valid_types}")
return v

View File

@@ -36,7 +36,11 @@ from ocrmypdf.exceptions import (
UnsupportedImageFormatError,
)
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution, safe_symlink
from ocrmypdf.pdfa import generate_pdfa_ps, speculative_pdfa_conversion
from ocrmypdf.pdfa import (
file_claims_pdfa,
generate_pdfa_ps,
speculative_pdfa_conversion,
)
from ocrmypdf.pdfinfo import Colorspace, Encoding, FloatRect, PageInfo, PdfInfo
from ocrmypdf.pluginspec import OrientationConfidence
@@ -991,6 +995,73 @@ def try_speculative_pdfa(input_pdf: Path, context: PdfContext) -> Path | None:
return None
def try_auto_pdfa(input_pdf: Path, context: PdfContext) -> tuple[Path, str]:
    """Best-effort PDF/A for the 'auto' output type.

    This function attempts to produce PDF/A without requiring Ghostscript:

    1. If verapdf is available, try speculative conversion with validation.
       When that attempt yields no result the file is emitted as regular
       PDF; the pass-through check below is *not* consulted in that case.
    2. Without verapdf, pass the file through labelled as PDF/A when
       ``_is_safe_pdfa`` considers it safe (the file claims PDF/A, or
       force-ocr was used).
    3. Otherwise fall back to regular PDF.

    Args:
        input_pdf: Path to the PDF to convert.
        context: The PDF context; ``context.options`` is consulted for the
            pass-through decision.

    Returns:
        Tuple of ``(output_path, actual_output_type)`` where
        ``actual_output_type`` is ``'pdfa'`` if PDF/A was achieved and
        ``'pdf'`` otherwise. ``output_path`` is ``input_pdf`` itself unless
        speculative conversion produced a new file.
    """
    # Imported at call time rather than at module level.
    from ocrmypdf._exec import verapdf

    if verapdf.available():
        # A validator is present: trust its verdict in both directions.
        converted = try_speculative_pdfa(input_pdf, context)
        if converted is not None:
            return converted, 'pdfa'
        log.info(
            'Auto mode: speculative PDF/A validation failed, outputting regular PDF'
        )
        return input_pdf, 'pdf'

    # No validator: only label the file as PDF/A when that is known to be safe.
    if _is_safe_pdfa(input_pdf, context.options):
        # The file is handed on unmodified. The message names both reasons
        # because _is_safe_pdfa also accepts files rewritten by --force-ocr,
        # not only files that already claim PDF/A.
        log.info(
            'Auto mode: passing through as PDF/A '
            '(input claims PDF/A or --force-ocr was used)'
        )
        return input_pdf, 'pdfa'

    log.info('Auto mode: no verapdf available and input is not PDF/A, outputting PDF')
    return input_pdf, 'pdf'
def _is_safe_pdfa(input_pdf: Path, options) -> bool:
    """Check if a file can be considered PDF/A without validation.

    These are cases where our modifications don't break PDF/A compliance:

    1. We used force-ocr (we rewrote the entire PDF from scratch).
    2. The file already claims PDF/A (we just grafted OCR text onto it).

    Args:
        input_pdf: Path to the PDF to check. Only inspected when
            ``options.force_ocr`` is false.
        options: OCR options; only the ``force_ocr`` attribute is read.

    Returns:
        True if the file can safely be considered PDF/A.
    """
    # Test the in-memory flag first: when it is set the answer is already
    # known, so there is no need to inspect the file at all.
    if options.force_ocr:
        return True

    # Otherwise rely on the file's own PDF/A claim.
    pdfa_status = file_claims_pdfa(input_pdf)
    return bool(pdfa_status['pass'])
def should_linearize(working_file: Path, context: PdfContext) -> bool:
"""Determine whether the PDF should be linearized.

View File

@@ -45,6 +45,7 @@ from ocrmypdf._pipeline import (
rasterize_preview,
should_linearize,
should_visible_page_image_use_jpg,
try_auto_pdfa,
try_speculative_pdfa,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
@@ -469,8 +470,13 @@ def postprocess(
pdf_out = fix_annots
else:
pdf_out = pdf_file
if context.options.output_type.startswith('pdfa'):
# Try speculative PDF/A conversion first (fast path using pikepdf + verapdf)
if context.options.output_type == 'auto':
# Best effort PDF/A - never uses Ghostscript
pdf_out, actual_type = try_auto_pdfa(pdf_out, context)
# Store actual output type for reporting
context.options.extra_attrs['_actual_output_type'] = actual_type
elif context.options.output_type.startswith('pdfa'):
# Required PDF/A - uses Ghostscript as fallback
speculative_result = try_speculative_pdfa(pdf_out, context)
if speculative_result is not None:
pdf_out = speculative_result
@@ -495,7 +501,22 @@ def report_output_pdf(options, start_input_file, optimize_messages) -> ExitCode:
elif samefile(options.output_file, Path(os.devnull)):
pass # Say nothing when sending to dev null
else:
if options.output_type.startswith('pdfa'):
if options.output_type == 'auto':
# For 'auto' mode, check what we actually produced
actual_type = options.extra_attrs.get('_actual_output_type', 'pdf')
pdfa_info = file_claims_pdfa(options.output_file)
if actual_type == 'pdfa' and pdfa_info['pass']:
log.info(
"Output file is a %s (auto mode achieved PDF/A)",
pdfa_info['conformance'],
)
elif pdfa_info['pass']:
# Unexpectedly got PDF/A
log.info("Output file is a %s", pdfa_info['conformance'])
else:
# Regular PDF - this is expected for auto mode fallback
log.info("Output file is a PDF (auto mode)")
elif options.output_type.startswith('pdfa'):
pdfa_info = file_claims_pdfa(options.output_file)
if pdfa_info['pass']:
log.info("Output file is a %s (as expected)", pdfa_info['conformance'])

View File

@@ -101,38 +101,41 @@ def add_options(parser):
@hookimpl
def check_options(options):
"""Check that the options are valid for this plugin."""
check_external_program(
program='gs',
package='ghostscript',
version_checker=ghostscript.version,
need_version='9.54', # RHEL 9's version; Ubuntu 22.04 has 9.55
)
gs_version = ghostscript.version()
if gs_version in BLACKLISTED_GS_VERSIONS:
raise MissingDependencyError(
f"Ghostscript {gs_version} contains serious regressions and is not "
"supported. Please upgrade to a newer version."
# Only require Ghostscript for pdfa* output types (not 'auto' or 'pdf')
# 'auto' mode uses best-effort PDF/A without Ghostscript fallback
if options.output_type.startswith('pdfa'):
check_external_program(
program='gs',
package='ghostscript',
version_checker=ghostscript.version,
need_version='9.54', # RHEL 9's version; Ubuntu 22.04 has 9.55
)
if Version('10.0.0') <= gs_version < Version('10.02.1') and (
options.skip_text or options.redo_ocr
):
raise MissingDependencyError(
f"Ghostscript 10.0.0 through 10.02.0 (your version: {gs_version}) "
"contain serious regressions that corrupt PDFs with existing text, "
"such as those processed using --skip-text or --redo-ocr. "
"Please upgrade to a "
"newer version, or use --output-type pdf to avoid Ghostscript, or "
"use --force-ocr to discard existing text."
)
if gs_version >= Version('10.6.0') and options.output_type.startswith('pdfa'):
log.warning(
"Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
"images. OCRmyPDF will attempt to mitigate, but this version is "
"strongly not recommended. Please upgrade to a newer version. "
"As of 2025-12, 10.6.0 is the latest version of Ghostscript."
)
if options.output_type == 'pdfa':
options.output_type = 'pdfa-2'
gs_version = ghostscript.version()
if gs_version in BLACKLISTED_GS_VERSIONS:
raise MissingDependencyError(
f"Ghostscript {gs_version} contains serious regressions and is not "
"supported. Please upgrade to a newer version."
)
if Version('10.0.0') <= gs_version < Version('10.02.1') and (
options.skip_text or options.redo_ocr
):
raise MissingDependencyError(
f"Ghostscript 10.0.0 through 10.02.0 (your version: {gs_version}) "
"contain serious regressions that corrupt PDFs with existing text, "
"such as those processed using --skip-text or --redo-ocr. "
"Please upgrade to a "
"newer version, or use --output-type pdf to avoid Ghostscript, or "
"use --force-ocr to discard existing text."
)
if gs_version >= Version('10.6.0'):
log.warning(
"Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
"images. OCRmyPDF will attempt to mitigate, but this version is "
"strongly not recommended. Please upgrade to a newer version. "
"As of 2025-12, 10.6.0 is the latest version of Ghostscript."
)
if options.output_type == 'pdfa':
options.output_type = 'pdfa-2'
if (
options.ghostscript.color_conversion_strategy
@@ -144,11 +147,11 @@ def check_options(options):
)
if (
options.ghostscript.pdfa_image_compression != 'auto'
and not options.output_type.startswith('pdfa')
and options.output_type not in ('auto', 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3')
):
log.warning(
"--pdfa-image-compression argument only applies when "
"--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
"--output-type is 'auto' or one of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'"
)

View File

@@ -159,16 +159,17 @@ Online documentation is located at:
)
parser.add_argument(
'--output-type',
choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'],
default='pdfa',
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
"long term archiving (default, recommended) but may not suitable "
"for users who want their file altered as little as possible. 'pdfa' "
"also has problems with full Unicode text. 'pdf' minimizes changes "
"to the input file. 'pdf-a1' creates a "
"PDF/A-1b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
"PDF/A-3b file. 'none' will produce no output, which may be helpful if "
"only the --sidecar is desired.",
choices=['auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'],
default='auto',
help="Choose output type. 'auto' (default) produces best-effort PDF/A "
"without requiring Ghostscript - uses verapdf validation when available, "
"otherwise passes through as PDF/A if safe (input already PDF/A or "
"force-ocr was used), or falls back to regular PDF. 'pdfa' creates a "
"PDF/A-2b compliant file for long term archiving (requires Ghostscript "
"as fallback). 'pdf' minimizes changes to the input file. 'pdfa-1' "
"creates a PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' "
"creates a PDF/A-3b file. 'none' will produce no output, which may be "
"helpful if only the --sidecar is desired.",
)
# Use null string '\0' as sentinel to indicate the user supplied no argument,

View File

@@ -41,6 +41,8 @@ def test_render_continue_on_soft_error(resources, outpdf):
exitcode = run_ocrmypdf_api(
resources / 'francais.pdf',
outpdf,
'--output-type',
'pdfa', # Required to trigger Ghostscript PDF/A generation
'--continue-on-soft-render-error',
'--plugin',
'tests/plugins/tesseract_noop.py',
@@ -55,6 +57,8 @@ def test_render_stop_on_soft_error(resources, outpdf):
exitcode = run_ocrmypdf_api(
resources / 'francais.pdf',
outpdf,
'--output-type',
'pdfa', # Required to trigger Ghostscript PDF/A generation
'--plugin',
'tests/plugins/tesseract_noop.py',
'--plugin',