mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-18 19:47:48 -04:00
Add 'auto' output type for best-effort PDF/A without Ghostscript
- Add new '--output-type auto' option (now the default) that produces best-effort PDF/A without requiring Ghostscript - When verapdf is available, use speculative PDF/A conversion - Without verapdf, pass through as PDF/A if safe (input claims PDF/A or --force-ocr was used), otherwise output as regular PDF - Make Ghostscript check conditional - only required for pdfa* output types - Update soft error tests to explicitly use --output-type pdfa since they exercise Ghostscript failure modes - Fix Tesseract OSD error handling to check both stdout and stderr for known non-fatal messages like "Too few characters"
This commit is contained in:
@@ -202,9 +202,11 @@ def get_orientation(
|
||||
except CalledProcessError as e:
|
||||
tesseract_log_output(e.stdout)
|
||||
tesseract_log_output(e.stderr)
|
||||
# Check both stdout (e.output) and stderr for known non-fatal messages
|
||||
all_output = (e.output or b'') + (e.stderr or b'')
|
||||
if (
|
||||
b'Too few characters. Skipping this page' in e.output
|
||||
or b'Image too large' in e.output
|
||||
b'Too few characters. Skipping this page' in all_output
|
||||
or b'Image too large' in all_output
|
||||
):
|
||||
return OrientationConfidence(0, 0)
|
||||
raise SubprocessOutputError() from e
|
||||
|
||||
@@ -88,7 +88,7 @@ class OCROptions(BaseModel):
|
||||
|
||||
# Core OCR options
|
||||
languages: list[str] = Field(default_factory=lambda: [DEFAULT_LANGUAGE])
|
||||
output_type: str = 'pdfa'
|
||||
output_type: str = 'auto'
|
||||
force_ocr: bool = False
|
||||
skip_text: bool = False
|
||||
redo_ocr: bool = False
|
||||
@@ -190,7 +190,7 @@ class OCROptions(BaseModel):
|
||||
@classmethod
|
||||
def validate_output_type(cls, v):
|
||||
"""Validate output type is one of the allowed values."""
|
||||
valid_types = {'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'}
|
||||
valid_types = {'auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'}
|
||||
if v not in valid_types:
|
||||
raise ValueError(f"output_type must be one of {valid_types}")
|
||||
return v
|
||||
|
||||
@@ -36,7 +36,11 @@ from ocrmypdf.exceptions import (
|
||||
UnsupportedImageFormatError,
|
||||
)
|
||||
from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution, safe_symlink
|
||||
from ocrmypdf.pdfa import generate_pdfa_ps, speculative_pdfa_conversion
|
||||
from ocrmypdf.pdfa import (
|
||||
file_claims_pdfa,
|
||||
generate_pdfa_ps,
|
||||
speculative_pdfa_conversion,
|
||||
)
|
||||
from ocrmypdf.pdfinfo import Colorspace, Encoding, FloatRect, PageInfo, PdfInfo
|
||||
from ocrmypdf.pluginspec import OrientationConfidence
|
||||
|
||||
@@ -991,6 +995,73 @@ def try_speculative_pdfa(input_pdf: Path, context: PdfContext) -> Path | None:
|
||||
return None
|
||||
|
||||
|
||||
def try_auto_pdfa(input_pdf: Path, context: PdfContext) -> tuple[Path, str]:
    """Produce a best-effort PDF/A for the 'auto' output type.

    Ghostscript is never invoked here. The decision depends on whether
    verapdf is available:

    * verapdf available: run the speculative conversion. If it yields a
      file, that file is returned as PDF/A; if it yields ``None``, the
      input is returned unchanged as a regular PDF.
    * verapdf not available: return the input unchanged, labelled PDF/A
      when ``_is_safe_pdfa`` accepts it and regular PDF otherwise.

    Args:
        input_pdf: Path to the PDF to convert.
        context: The PDF context; its ``options`` are consulted when
            verapdf is not available.

    Returns:
        A ``(output_path, actual_output_type)`` tuple, where
        ``actual_output_type`` is ``'pdfa'`` if PDF/A was achieved and
        ``'pdf'`` otherwise.
    """
    from ocrmypdf._exec import verapdf

    if not verapdf.available():
        # No validator on hand: the file itself is never modified on this
        # path, only the label attached to it is decided.
        if _is_safe_pdfa(input_pdf, context.options):
            log.info('Auto mode: passing through as PDF/A (input already compliant)')
            produced_type = 'pdfa'
        else:
            log.info(
                'Auto mode: no verapdf available and input is not PDF/A, outputting PDF'
            )
            produced_type = 'pdf'
        return input_pdf, produced_type

    # Validator present: attempt the speculative conversion.
    converted = try_speculative_pdfa(input_pdf, context)
    if converted is None:
        # The speculative attempt did not produce a file; hand back the
        # untouched input as a regular PDF.
        log.info(
            'Auto mode: speculative PDF/A validation failed, outputting regular PDF'
        )
        return input_pdf, 'pdf'
    return converted, 'pdfa'
|
||||
|
||||
|
||||
def _is_safe_pdfa(input_pdf: Path, options) -> bool:
    """Decide whether a file may be labelled PDF/A without validation.

    Two situations are treated as safe:

    1. ``file_claims_pdfa`` reports a passing PDF/A claim for the file.
    2. ``options.force_ocr`` is set.

    Args:
        input_pdf: Path to the PDF to check.
        options: OCR options; only ``force_ocr`` is read.

    Returns:
        True if the file can safely be considered PDF/A, False otherwise.
    """
    # The claim is looked up first and unconditionally, so any error raised
    # while inspecting the file surfaces regardless of force_ocr.
    claim = file_claims_pdfa(input_pdf)
    if claim['pass']:
        return True

    # Otherwise the answer rests entirely on whether force-ocr was used.
    return bool(options.force_ocr)
|
||||
|
||||
|
||||
def should_linearize(working_file: Path, context: PdfContext) -> bool:
|
||||
"""Determine whether the PDF should be linearized.
|
||||
|
||||
|
||||
@@ -45,6 +45,7 @@ from ocrmypdf._pipeline import (
|
||||
rasterize_preview,
|
||||
should_linearize,
|
||||
should_visible_page_image_use_jpg,
|
||||
try_auto_pdfa,
|
||||
try_speculative_pdfa,
|
||||
)
|
||||
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
|
||||
@@ -469,8 +470,13 @@ def postprocess(
|
||||
pdf_out = fix_annots
|
||||
else:
|
||||
pdf_out = pdf_file
|
||||
if context.options.output_type.startswith('pdfa'):
|
||||
# Try speculative PDF/A conversion first (fast path using pikepdf + verapdf)
|
||||
if context.options.output_type == 'auto':
|
||||
# Best effort PDF/A - never uses Ghostscript
|
||||
pdf_out, actual_type = try_auto_pdfa(pdf_out, context)
|
||||
# Store actual output type for reporting
|
||||
context.options.extra_attrs['_actual_output_type'] = actual_type
|
||||
elif context.options.output_type.startswith('pdfa'):
|
||||
# Required PDF/A - uses Ghostscript as fallback
|
||||
speculative_result = try_speculative_pdfa(pdf_out, context)
|
||||
if speculative_result is not None:
|
||||
pdf_out = speculative_result
|
||||
@@ -495,7 +501,22 @@ def report_output_pdf(options, start_input_file, optimize_messages) -> ExitCode:
|
||||
elif samefile(options.output_file, Path(os.devnull)):
|
||||
pass # Say nothing when sending to dev null
|
||||
else:
|
||||
if options.output_type.startswith('pdfa'):
|
||||
if options.output_type == 'auto':
|
||||
# For 'auto' mode, check what we actually produced
|
||||
actual_type = options.extra_attrs.get('_actual_output_type', 'pdf')
|
||||
pdfa_info = file_claims_pdfa(options.output_file)
|
||||
if actual_type == 'pdfa' and pdfa_info['pass']:
|
||||
log.info(
|
||||
"Output file is a %s (auto mode achieved PDF/A)",
|
||||
pdfa_info['conformance'],
|
||||
)
|
||||
elif pdfa_info['pass']:
|
||||
# Unexpectedly got PDF/A
|
||||
log.info("Output file is a %s", pdfa_info['conformance'])
|
||||
else:
|
||||
# Regular PDF - this is expected for auto mode fallback
|
||||
log.info("Output file is a PDF (auto mode)")
|
||||
elif options.output_type.startswith('pdfa'):
|
||||
pdfa_info = file_claims_pdfa(options.output_file)
|
||||
if pdfa_info['pass']:
|
||||
log.info("Output file is a %s (as expected)", pdfa_info['conformance'])
|
||||
|
||||
@@ -101,38 +101,41 @@ def add_options(parser):
|
||||
@hookimpl
|
||||
def check_options(options):
|
||||
"""Check that the options are valid for this plugin."""
|
||||
check_external_program(
|
||||
program='gs',
|
||||
package='ghostscript',
|
||||
version_checker=ghostscript.version,
|
||||
need_version='9.54', # RHEL 9's version; Ubuntu 22.04 has 9.55
|
||||
)
|
||||
gs_version = ghostscript.version()
|
||||
if gs_version in BLACKLISTED_GS_VERSIONS:
|
||||
raise MissingDependencyError(
|
||||
f"Ghostscript {gs_version} contains serious regressions and is not "
|
||||
"supported. Please upgrade to a newer version."
|
||||
# Only require Ghostscript for pdfa* output types (not 'auto' or 'pdf')
|
||||
# 'auto' mode uses best-effort PDF/A without Ghostscript fallback
|
||||
if options.output_type.startswith('pdfa'):
|
||||
check_external_program(
|
||||
program='gs',
|
||||
package='ghostscript',
|
||||
version_checker=ghostscript.version,
|
||||
need_version='9.54', # RHEL 9's version; Ubuntu 22.04 has 9.55
|
||||
)
|
||||
if Version('10.0.0') <= gs_version < Version('10.02.1') and (
|
||||
options.skip_text or options.redo_ocr
|
||||
):
|
||||
raise MissingDependencyError(
|
||||
f"Ghostscript 10.0.0 through 10.02.0 (your version: {gs_version}) "
|
||||
"contain serious regressions that corrupt PDFs with existing text, "
|
||||
"such as those processed using --skip-text or --redo-ocr. "
|
||||
"Please upgrade to a "
|
||||
"newer version, or use --output-type pdf to avoid Ghostscript, or "
|
||||
"use --force-ocr to discard existing text."
|
||||
)
|
||||
if gs_version >= Version('10.6.0') and options.output_type.startswith('pdfa'):
|
||||
log.warning(
|
||||
"Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
|
||||
"images. OCRmyPDF will attempt to mitigate, but this version is "
|
||||
"strongly not recommended. Please upgrade to a newer version. "
|
||||
"As of 2025-12, 10.6.0 is the latest version of Ghostscript."
|
||||
)
|
||||
if options.output_type == 'pdfa':
|
||||
options.output_type = 'pdfa-2'
|
||||
gs_version = ghostscript.version()
|
||||
if gs_version in BLACKLISTED_GS_VERSIONS:
|
||||
raise MissingDependencyError(
|
||||
f"Ghostscript {gs_version} contains serious regressions and is not "
|
||||
"supported. Please upgrade to a newer version."
|
||||
)
|
||||
if Version('10.0.0') <= gs_version < Version('10.02.1') and (
|
||||
options.skip_text or options.redo_ocr
|
||||
):
|
||||
raise MissingDependencyError(
|
||||
f"Ghostscript 10.0.0 through 10.02.0 (your version: {gs_version}) "
|
||||
"contain serious regressions that corrupt PDFs with existing text, "
|
||||
"such as those processed using --skip-text or --redo-ocr. "
|
||||
"Please upgrade to a "
|
||||
"newer version, or use --output-type pdf to avoid Ghostscript, or "
|
||||
"use --force-ocr to discard existing text."
|
||||
)
|
||||
if gs_version >= Version('10.6.0'):
|
||||
log.warning(
|
||||
"Ghostscript 10.6.x contains JPEG encoding errors that may corrupt "
|
||||
"images. OCRmyPDF will attempt to mitigate, but this version is "
|
||||
"strongly not recommended. Please upgrade to a newer version. "
|
||||
"As of 2025-12, 10.6.0 is the latest version of Ghostscript."
|
||||
)
|
||||
if options.output_type == 'pdfa':
|
||||
options.output_type = 'pdfa-2'
|
||||
|
||||
if (
|
||||
options.ghostscript.color_conversion_strategy
|
||||
@@ -144,11 +147,11 @@ def check_options(options):
|
||||
)
|
||||
if (
|
||||
options.ghostscript.pdfa_image_compression != 'auto'
|
||||
and not options.output_type.startswith('pdfa')
|
||||
and options.output_type not in ('auto', 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3')
|
||||
):
|
||||
log.warning(
|
||||
"--pdfa-image-compression argument only applies when "
|
||||
"--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
|
||||
"--output-type is 'auto' or one of 'pdfa', 'pdfa-1', 'pdfa-2', 'pdfa-3'"
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -159,16 +159,17 @@ Online documentation is located at:
|
||||
)
|
||||
parser.add_argument(
|
||||
'--output-type',
|
||||
choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'],
|
||||
default='pdfa',
|
||||
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
|
||||
"long term archiving (default, recommended) but may not suitable "
|
||||
"for users who want their file altered as little as possible. 'pdfa' "
|
||||
"also has problems with full Unicode text. 'pdf' minimizes changes "
|
||||
"to the input file. 'pdf-a1' creates a "
|
||||
"PDF/A-1b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
|
||||
"PDF/A-3b file. 'none' will produce no output, which may be helpful if "
|
||||
"only the --sidecar is desired.",
|
||||
choices=['auto', 'pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3', 'none'],
|
||||
default='auto',
|
||||
help="Choose output type. 'auto' (default) produces best-effort PDF/A "
|
||||
"without requiring Ghostscript - uses verapdf validation when available, "
|
||||
"otherwise passes through as PDF/A if safe (input already PDF/A or "
|
||||
"force-ocr was used), or falls back to regular PDF. 'pdfa' creates a "
|
||||
"PDF/A-2b compliant file for long term archiving (requires Ghostscript "
|
||||
"as fallback). 'pdf' minimizes changes to the input file. 'pdfa-1' "
|
||||
"creates a PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' "
|
||||
"creates a PDF/A-3b file. 'none' will produce no output, which may be "
|
||||
"helpful if only the --sidecar is desired.",
|
||||
)
|
||||
|
||||
# Use null string '\0' as sentinel to indicate the user supplied no argument,
|
||||
|
||||
@@ -41,6 +41,8 @@ def test_render_continue_on_soft_error(resources, outpdf):
|
||||
exitcode = run_ocrmypdf_api(
|
||||
resources / 'francais.pdf',
|
||||
outpdf,
|
||||
'--output-type',
|
||||
'pdfa', # Required to trigger Ghostscript PDF/A generation
|
||||
'--continue-on-soft-render-error',
|
||||
'--plugin',
|
||||
'tests/plugins/tesseract_noop.py',
|
||||
@@ -55,6 +57,8 @@ def test_render_stop_on_soft_error(resources, outpdf):
|
||||
exitcode = run_ocrmypdf_api(
|
||||
resources / 'francais.pdf',
|
||||
outpdf,
|
||||
'--output-type',
|
||||
'pdfa', # Required to trigger Ghostscript PDF/A generation
|
||||
'--plugin',
|
||||
'tests/plugins/tesseract_noop.py',
|
||||
'--plugin',
|
||||
|
||||
Reference in New Issue
Block a user