# SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MPL-2.0 """Python API for OCRmyPDF. This module provides the main Python API for OCRmyPDF, allowing you to perform OCR operations programmatically without using the command line interface. Main Functions: ocr(): The primary function for OCR processing. Takes an input PDF or image file and produces an OCR'd PDF with searchable text. configure_logging(): Set up logging to match the command line interface behavior, with support for progress bars and colored output. Experimental Functions: _pdf_to_hocr(): Extract text from PDF pages and save as hOCR files for manual editing before final PDF generation. _hocr_to_ocr_pdf(): Convert hOCR files back to a searchable PDF after manual text corrections. The API maintains thread safety through internal locking since OCRmyPDF uses global state for plugins. Only one OCR operation can run per Python process at a time. For parallel processing, use multiple Python processes. Example: import ocrmypdf # Configure logging (optional) ocrmypdf.configure_logging(ocrmypdf.Verbosity.default) # Perform OCR ocrmypdf.ocr('input.pdf', 'output.pdf', language='eng') For detailed parameter documentation, see the ocr() function docstring and the equivalent command line parameters in the OCRmyPDF documentation. 
""" from __future__ import annotations import logging import os import sys import threading from collections.abc import Iterable, Sequence from enum import IntEnum from io import IOBase from pathlib import Path from typing import BinaryIO, overload from warnings import warn from ocrmypdf._logging import PageNumberFilter from ocrmypdf._options import OcrOptions from ocrmypdf._pipelines.hocr_to_ocr_pdf import run_hocr_to_ocr_pdf_pipeline from ocrmypdf._pipelines.ocr import run_pipeline, run_pipeline_cli from ocrmypdf._pipelines.pdf_to_hocr import run_hocr_pipeline from ocrmypdf._plugin_manager import OcrmypdfPluginManager, get_plugin_manager from ocrmypdf._validation import check_options from ocrmypdf.cli import ArgumentParser, get_parser from ocrmypdf.exceptions import ExitCode StrPath = Path | str | bytes PathOrIO = BinaryIO | StrPath # Installing plugins affects the global state of the Python interpreter, # so we need to use a lock to prevent multiple threads from installing # plugins at the same time. _api_lock = threading.Lock() def setup_plugin_infrastructure( plugins: Sequence[Path | str] | None = None, plugin_manager: OcrmypdfPluginManager | None = None, ) -> OcrmypdfPluginManager: """Set up plugin infrastructure with proper initialization. This function handles: 1. Creating or validating the plugin manager 2. Calling plugin initialization hooks 3. 
Setting up plugin option registry Args: plugins: List of plugin paths/names to load plugin_manager: Existing plugin manager (if any) Returns: Properly initialized plugin manager Raises: ValueError: If both plugins and plugin_manager are provided """ if plugins and plugin_manager: raise ValueError("plugins= and plugin_manager are mutually exclusive") if not plugins: plugins = [] elif isinstance(plugins, str | Path): plugins = [plugins] else: plugins = list(plugins) # Create plugin manager if not provided if not plugin_manager: plugin_manager = get_plugin_manager(plugins) # Initialize plugins (pass the underlying pluggy manager) plugin_manager.initialize(plugin_manager=plugin_manager.pluggy) # Initialize plugin option registry from ocrmypdf._plugin_registry import PluginOptionRegistry registry = PluginOptionRegistry() # Let plugins register their option models option_models = plugin_manager.register_options() all_plugin_models: dict[str, type] = {} for plugin_options in option_models: if plugin_options: # Skip None returns for namespace, model_class in plugin_options.items(): registry.register_option_model(namespace, model_class) all_plugin_models[namespace] = model_class # Register plugin models with OcrOptions for dynamic nested access OcrOptions.register_plugin_models(all_plugin_models) # Store registry in plugin manager for later access plugin_manager._option_registry = registry return plugin_manager class Verbosity(IntEnum): """Verbosity level for configure_logging.""" # pylint: disable=invalid-name quiet = -1 #: Suppress most messages default = 0 #: Default level of logging debug = 1 #: Output ocrmypdf debug messages debug_all = 2 #: More detailed debugging from ocrmypdf and dependent modules def configure_logging( verbosity: Verbosity, *, progress_bar_friendly: bool = True, manage_root_logger: bool = False, plugin_manager: OcrmypdfPluginManager | None = None, ): """Set up logging. 
Before calling :func:`ocrmypdf.ocr()`, you can use this function to configure logging if you want ocrmypdf's output to look like the ocrmypdf command line interface. It will register log handlers, log filters, and formatters, configure color logging to standard error, and adjust the log levels of third party libraries. Details of this are fine-tuned and subject to change. The ``verbosity`` argument is equivalent to the argument ``--verbose`` and applies those settings. If you have a wrapper script for ocrmypdf and you want it to be very similar to ocrmypdf, use this function; if you are using ocrmypdf as part of an application that manages its own logging, you probably do not want this function. If this function is not called, ocrmypdf will not configure logging, and it is up to the caller of ``ocrmypdf.ocr()`` to set up logging as it wishes using the Python standard library's logging module. If this function is called, the caller may of course make further adjustments to logging. Regardless of whether this function is called, ocrmypdf will perform all of its logging under the ``"ocrmypdf"`` logging namespace. In addition, ocrmypdf imports pdfminer, which logs under ``"pdfminer"``. A library user may wish to configure both; note that pdfminer is extremely chatty at the log level ``logging.INFO``. This function does not set up the ``debug.log`` log file that the command line interface does at certain verbosity levels. Applications should configure their own debug logging. Args: verbosity: Verbosity level. progress_bar_friendly: If True (the default), install a custom log handler that is compatible with progress bars and colored output. manage_root_logger: Configure the process's root logger. plugin_manager: The plugin manager, used for obtaining the custom log handler. Returns: The toplevel logger for ocrmypdf (or the root logger, if we are managing it). 
""" prefix = '' if manage_root_logger else 'ocrmypdf' log = logging.getLogger(prefix) log.setLevel(logging.DEBUG) console = None if plugin_manager and progress_bar_friendly: console = plugin_manager.get_logging_console() if not console: console = logging.StreamHandler(stream=sys.stderr) if verbosity < 0: console.setLevel(logging.ERROR) elif verbosity >= 1: console.setLevel(logging.DEBUG) else: console.setLevel(logging.INFO) console.addFilter(PageNumberFilter()) if verbosity >= 2: fmt = '%(levelname)7s %(name)s -%(pageno)s %(message)s' else: fmt = '%(pageno)s%(message)s' formatter = None if not formatter: formatter = logging.Formatter(fmt=fmt) console.setFormatter(formatter) log.addHandler(console) if verbosity <= 1: pdfminer_log = logging.getLogger('pdfminer') pdfminer_log.setLevel(logging.ERROR) pil_log = logging.getLogger('PIL') pil_log.setLevel(logging.INFO) fonttools_log = logging.getLogger('fontTools') fonttools_log.setLevel(logging.ERROR) if manage_root_logger: logging.captureWarnings(True) return log def _check_no_conflicting_ocr_params( locals_dict: dict, kwargs: dict, excluded: set[str] | None = None, ) -> None: """Check that no individual OCR parameters conflict with OcrOptions. When a user passes an OcrOptions object, they should not also pass individual OCR parameters (except plugins/plugin_manager which are handled separately). Args: locals_dict: The locals() dict from the calling function. kwargs: The **kwargs dict from the calling function. excluded: Parameter names to exclude from conflict checking. Raises: ValueError: If conflicting parameters are found. 
""" if excluded is None: excluded = set() # Parameters that are allowed alongside OcrOptions allowed_with_options = { 'input_file_or_options', 'options', # The OcrOptions object itself after assignment 'plugins', 'plugin_manager', 'kwargs', } | excluded # Check all locals that are OCR parameters (not None and not allowed) conflicts = [ name for name, value in locals_dict.items() if value is not None and name not in allowed_with_options ] # Check kwargs conflicts.extend(kwargs.keys()) if conflicts: raise ValueError( f"When passing OcrOptions as the first argument, do not pass " f"additional OCR parameters. Conflicting parameters: " f"{', '.join(sorted(conflicts))}. " f"Set these values in OcrOptions instead." ) def create_options( *, input_file: PathOrIO, output_file: PathOrIO, parser: ArgumentParser, **kwargs ) -> OcrOptions: """Construct an options object from the input/output files and keyword arguments. Args: input_file: Input file path or file object. output_file: Output file path or file object. parser: ArgumentParser object (kept for compatibility, may be used for plugin validation). **kwargs: Keyword arguments. Returns: OcrOptions: An options object containing the parsed arguments. Raises: TypeError: If the type of a keyword argument is not supported. 
""" # Prepare kwargs for direct OcrOptions construction options_kwargs = kwargs.copy() # Set input and output files options_kwargs['input_file'] = input_file options_kwargs['output_file'] = output_file # Handle special stream cases for sidecar if 'sidecar' in options_kwargs and isinstance( options_kwargs['sidecar'], BinaryIO | IOBase ): # Keep the stream object as-is - OcrOptions can handle it pass # Remove None values to let OcrOptions use its defaults options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None} # Remove any kwargs that aren't OcrOptions fields and store in extra_attrs extra_attrs = {} ocr_fields = set(OcrOptions.model_fields.keys()) # Legacy mode flags are handled by OcrOptions model validator legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'} # Known extra attributes that should be preserved known_extra = {'progress_bar', 'plugins'} for key in list(options_kwargs.keys()): if key in ocr_fields or key in legacy_mode_flags or key in known_extra: continue extra_attrs[key] = options_kwargs.pop(key) # Create OcrOptions directly try: options = OcrOptions(**options_kwargs) # Add any extra attributes if extra_attrs: options.extra_attrs.update(extra_attrs) return options except Exception as e: # If direct construction fails, provide a helpful error message raise TypeError(f"Failed to create OcrOptions: {e}") from e @overload def ocr( options: OcrOptions, /, *, plugins: Iterable[Path | str] | None = None, plugin_manager: OcrmypdfPluginManager | None = None, ) -> ExitCode: ... 
@overload
def ocr(
    input_file_or_options: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] | None = None,
    image_dpi: int | None = None,
    output_type: str | None = None,
    sidecar: PathOrIO | None = None,
    jobs: int | None = None,
    use_threads: bool | None = None,
    title: str | None = None,
    author: str | None = None,
    subject: str | None = None,
    keywords: str | None = None,
    rotate_pages: bool | None = None,
    remove_background: bool | None = None,
    deskew: bool | None = None,
    clean: bool | None = None,
    clean_final: bool | None = None,
    unpaper_args: str | None = None,
    oversample: int | None = None,
    remove_vectors: bool | None = None,
    mode: str | None = None,
    force_ocr: bool | None = None,
    skip_text: bool | None = None,
    redo_ocr: bool | None = None,
    skip_big: float | None = None,
    optimize: int | None = None,
    jpg_quality: int | None = None,
    png_quality: int | None = None,
    jbig2_lossy: bool | None = None,
    jbig2_page_group_size: int | None = None,
    jbig2_threshold: float | None = None,
    pages: str | None = None,
    max_image_mpixels: float | None = None,
    tesseract_config: Iterable[str] | None = None,
    tesseract_pagesegmode: int | None = None,
    tesseract_oem: int | None = None,
    tesseract_thresholding: int | None = None,
    pdf_renderer: str | None = None,
    rasterizer: str | None = None,
    tesseract_timeout: float | None = None,
    tesseract_non_ocr_timeout: float | None = None,
    tesseract_downsample_above: int | None = None,
    tesseract_downsample_large_images: bool | None = None,
    rotate_pages_threshold: float | None = None,
    pdfa_image_compression: str | None = None,
    color_conversion_strategy: str | None = None,
    user_words: os.PathLike | None = None,
    user_patterns: os.PathLike | None = None,
    fast_web_view: float | None = None,
    continue_on_soft_render_error: bool | None = None,
    invalidate_digital_signatures: bool | None = None,
    plugins: Iterable[Path | str] | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
    keep_temporary_files: bool | None = None,
    progress_bar: bool | None = None,
    **kwargs,
) -> ExitCode: ...


def ocr(  # noqa: D417
    input_file_or_options: PathOrIO | OcrOptions,
    output_file: PathOrIO | None = None,
    *,
    language: Iterable[str] | None = None,
    image_dpi: int | None = None,
    output_type: str | None = None,
    sidecar: PathOrIO | None = None,
    jobs: int | None = None,
    use_threads: bool | None = None,
    title: str | None = None,
    author: str | None = None,
    subject: str | None = None,
    keywords: str | None = None,
    rotate_pages: bool | None = None,
    remove_background: bool | None = None,
    deskew: bool | None = None,
    clean: bool | None = None,
    clean_final: bool | None = None,
    unpaper_args: str | None = None,
    oversample: int | None = None,
    remove_vectors: bool | None = None,
    mode: str | None = None,
    force_ocr: bool | None = None,  # Legacy, use mode='force' instead
    skip_text: bool | None = None,  # Legacy, use mode='skip' instead
    redo_ocr: bool | None = None,  # Legacy, use mode='redo' instead
    skip_big: float | None = None,
    optimize: int | None = None,
    jpg_quality: int | None = None,
    png_quality: int | None = None,
    jbig2_lossy: bool | None = None,  # Deprecated, ignored
    jbig2_page_group_size: int | None = None,  # Deprecated, ignored
    jbig2_threshold: float | None = None,
    pages: str | None = None,
    max_image_mpixels: float | None = None,
    tesseract_config: Iterable[str] | None = None,
    tesseract_pagesegmode: int | None = None,
    tesseract_oem: int | None = None,
    tesseract_thresholding: int | None = None,
    pdf_renderer: str | None = None,
    rasterizer: str | None = None,
    tesseract_timeout: float | None = None,
    tesseract_non_ocr_timeout: float | None = None,
    tesseract_downsample_above: int | None = None,
    tesseract_downsample_large_images: bool | None = None,
    rotate_pages_threshold: float | None = None,
    pdfa_image_compression: str | None = None,
    color_conversion_strategy: str | None = None,
    user_words: os.PathLike | None = None,
    user_patterns: os.PathLike | None = None,
    fast_web_view: float | None = None,
    continue_on_soft_render_error: bool | None = None,
    invalidate_digital_signatures: bool | None = None,
    plugins: Iterable[Path | str] | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
    keep_temporary_files: bool | None = None,
    progress_bar: bool | None = None,
    **kwargs,
) -> ExitCode:
    """Run OCRmyPDF on one PDF or image.

    This function supports two calling conventions:

    **New style (recommended):**

    >>> from ocrmypdf import ocr
    >>> from ocrmypdf._options import OcrOptions
    >>> options = OcrOptions(
    ...     input_file="input.pdf",
    ...     output_file="output.pdf",
    ...     languages=["eng"],
    ... )
    >>> ocr(options)

    **Old style:**

    >>> ocr("input.pdf", "output.pdf", language=["eng"])

    For most arguments, see documentation for the equivalent command line
    parameter.

    This API takes a threading lock, because OCRmyPDF uses global state in
    particular for the plugin system. The jobs parameter will be used to create
    a pool of worker threads or processes at different times, subject to
    change. A Python process can only run one OCRmyPDF task at a time.

    To run parallel instances of OCRmyPDF, use separate Python processes to
    scale horizontally. Generally speaking you should set
    jobs=sqrt(cpu_count) and run sqrt(cpu_count) processes as a starting point.
    If you have files with a high page count, run fewer processes and more
    jobs per process. If you have a lot of short files, run more processes and
    fewer jobs per process.

    A few specific arguments are discussed here:

    Args:
        input_file_or_options: Either an OcrOptions object containing all
            settings, or a path/stream for the input file (old-style API).
        output_file: Output file path or stream. Required when using old-style
            API with input_file as first argument. Must be None when passing
            OcrOptions.
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        plugins: List of plugin paths to load. Can be passed alongside
            OcrOptions.
        plugin_manager: Pre-configured plugin manager. Can be passed alongside
            OcrOptions.

    For input_file (old-style API): If a :class:`pathlib.Path`, ``str`` or
    ``bytes``, this is interpreted as file system path to the input file. If
    the object appears to be a readable stream (with methods such as
    ``.read()`` and ``.seek()``), the object will be read in its entirety and
    saved to a temporary file. If ``input_file`` is ``"-"``, standard input
    will be read.

    For output_file (old-style API): If a :class:`pathlib.Path`, ``str`` or
    ``bytes``, this is interpreted as file system path to the output file. If
    the object appears to be a writable stream (with methods such as
    ``.write()`` and ``.seek()``), the output will be written to this stream.
    If ``output_file`` is ``"-"``, the output will be written to
    ``sys.stdout`` (provided that standard output does not seem to be a
    terminal device). When a stream is used as output, whether via a writable
    object or ``"-"``, some final validation steps are not performed (we do not
    read back the stream after it is written).

    Raises:
        ocrmypdf.MissingDependencyError: If a required dependency program is
            missing or was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an
            image that could not be read, or some other file type that is not a
            PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of
            the image is not credible (allowing it to proceed would cause poor
            OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended
            output file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or
            digital text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a
            subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password
            protected). OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration
            was not valid.
        ValueError: If OcrOptions is passed along with other OCR parameters,
            or if both plugins and plugin_manager are provided.
        TypeError: If output_file is missing when using the old-style API.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # Detect calling convention: OcrOptions object vs individual parameters
    if isinstance(input_file_or_options, OcrOptions):
        # New-style API: OcrOptions passed directly
        options = input_file_or_options

        # Check for conflicting parameters (all should be None except
        # plugins/plugin_manager). This inspects locals(), so no other local
        # variable may be assigned before this call.
        _check_no_conflicting_ocr_params(locals(), kwargs)

        # plugins and plugin_manager can still be passed alongside OcrOptions
        if plugins and plugin_manager:
            raise ValueError("plugins= and plugin_manager are mutually exclusive")

        # Use plugins from OcrOptions if not explicitly passed
        if plugins is None:
            plugins = options.plugins or []
        if isinstance(plugins, str | Path):
            plugins = [plugins]
        else:
            plugins = list(plugins) if plugins else []

        # Run the pipeline with the OcrOptions
        with _api_lock:
            plugin_manager = setup_plugin_infrastructure(
                plugins=plugins, plugin_manager=plugin_manager
            )
            parser = get_parser()
            plugin_manager.add_options(parser=parser)
            check_options(options, plugin_manager)
            return run_pipeline(options=options, plugin_manager=plugin_manager)
    else:
        # Old-style API: positional arguments
        input_file = input_file_or_options
        if output_file is None:
            raise TypeError(
                "ocr() missing required argument: 'output_file'. "
                "Either pass output_file as the second argument, or pass "
                "an OcrOptions object as the first argument."
            )

        if plugins and plugin_manager:
            raise ValueError("plugins= and plugin_manager are mutually exclusive")

        if not plugins:
            plugins = []
        elif isinstance(plugins, str | Path):
            plugins = [plugins]
        else:
            plugins = list(plugins)

        # No new variable names should be assigned until these two steps are
        # run: the snapshot of locals() is what becomes the option set.
        create_options_kwargs = {
            k: v
            for k, v in locals().items()
            if k
            not in {
                'input_file_or_options',
                'input_file',
                'output_file',
                'kwargs',
                'plugin_manager',
            }
        }
        create_options_kwargs.update(kwargs)

        with _api_lock:
            # Set up plugin infrastructure with proper initialization
            plugin_manager = setup_plugin_infrastructure(
                plugins=plugins, plugin_manager=plugin_manager
            )

            # Get parser and let plugins add their options
            parser = get_parser()
            plugin_manager.add_options(parser=parser)

            if 'verbose' in kwargs:
                warn(
                    "ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging()."
                )

            # Warn about deprecated jbig2 options and remove from kwargs
            if jbig2_lossy:
                warn(
                    "jbig2_lossy is deprecated and will be ignored. "
                    "Lossy JBIG2 has been removed due to character substitution risks."
                )
            create_options_kwargs.pop('jbig2_lossy', None)
            if jbig2_page_group_size:
                warn("jbig2_page_group_size is deprecated and will be ignored.")
            create_options_kwargs.pop('jbig2_page_group_size', None)

            options = create_options(
                input_file=input_file,
                output_file=output_file,
                parser=parser,
                **create_options_kwargs,
            )
            check_options(options, plugin_manager)
            return run_pipeline(options=options, plugin_manager=plugin_manager)


def _pdf_to_hocr(  # noqa: D417
    input_pdf: Path,
    output_folder: Path,
    *,
    language: Iterable[str] | None = None,
    image_dpi: int | None = None,
    jobs: int | None = None,
    use_threads: bool | None = None,
    title: str | None = None,
    author: str | None = None,
    subject: str | None = None,
    keywords: str | None = None,
    rotate_pages: bool | None = None,
    remove_background: bool | None = None,
    deskew: bool | None = None,
    clean: bool | None = None,
    clean_final: bool | None = None,
    unpaper_args: str | None = None,
    oversample: int | None = None,
    remove_vectors: bool | None = None,
    mode: str | None = None,
    force_ocr: bool | None = None,  # Legacy, use mode='force' instead
    skip_text: bool | None = None,  # Legacy, use mode='skip' instead
    redo_ocr: bool | None = None,  # Legacy, use mode='redo' instead
    skip_big: float | None = None,
    pages: str | None = None,
    max_image_mpixels: float | None = None,
    tesseract_config: Iterable[str] | None = None,
    tesseract_pagesegmode: int | None = None,
    tesseract_oem: int | None = None,
    tesseract_thresholding: int | None = None,
    tesseract_timeout: float | None = None,
    tesseract_non_ocr_timeout: float | None = None,
    tesseract_downsample_above: int | None = None,
    tesseract_downsample_large_images: bool | None = None,
    rotate_pages_threshold: float | None = None,
    rasterizer: str | None = None,
    user_words: os.PathLike | None = None,
    user_patterns: os.PathLike | None = None,
    continue_on_soft_render_error: bool | None = None,
    invalidate_digital_signatures: bool | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
    plugins: Sequence[Path | str] | None = None,
    keep_temporary_files: bool | None = None,
    **kwargs,
):
    """Partially run OCRmyPDF and produce an output folder containing hOCR files.

    Given a PDF file, this function will run OCRmyPDF up to the point where
    the PDF is rasterized to images, OCRed, and the hOCR files are produced,
    all of which are saved to the output folder. This is useful for
    applications that want to provide an interface for users to edit the text
    before rendering the final PDF.

    Use :func:`_hocr_to_ocr_pdf` to produce the final PDF.

    For arguments not explicitly documented here, see documentation for the
    equivalent command line parameter.

    This API is **experimental** and subject to change.

    Args:
        input_pdf: Input PDF file path.
        output_folder: Output folder path.
        **kwargs: Keyword arguments. Those that are not OcrOptions fields are
            stored in ``options.extra_attrs``.

    Returns:
        Whatever ``run_hocr_pipeline`` returns.

    Raises:
        ValueError: If both plugins and plugin_manager are provided.
        TypeError: If constructing OcrOptions fails; the original exception is
            chained as the cause.
    """
    # Snapshot the call arguments first, while locals() holds nothing but the
    # parameters. Iterating locals() later would also pick up working
    # variables (the dict under construction, loop variables) and leak them
    # into the options; dict() also makes the snapshot safe to iterate.
    explicit_params = {
        name: value
        for name, value in dict(locals()).items()
        if name
        not in {'input_pdf', 'output_folder', 'kwargs', 'plugin_manager', 'plugins'}
        and value is not None
    }

    if plugins and plugin_manager:
        raise ValueError("plugins= and plugin_manager are mutually exclusive")

    if not plugins:
        plugins = []
    elif isinstance(plugins, str | Path):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    # Prepare kwargs for direct OcrOptions construction
    options_kwargs = kwargs.copy()

    # Set input file and handle special output_folder case
    options_kwargs['input_file'] = input_pdf
    options_kwargs['output_file'] = '/dev/null'  # Placeholder for hOCR pipeline

    # Add all the function parameters that were explicitly set
    options_kwargs.update(explicit_params)

    # Handle plugins
    if plugins:
        options_kwargs['plugins'] = plugins

    # Remove None values to let OcrOptions use its defaults
    options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None}

    # Add output_folder to options_kwargs since it's now a proper field
    options_kwargs['output_folder'] = output_folder

    # Remove any kwargs that aren't OcrOptions fields and store in extra_attrs
    extra_attrs = {}
    ocr_fields = set(OcrOptions.model_fields.keys())
    # Legacy mode flags are handled by OcrOptions model validator
    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}
    known_extra = {'progress_bar', 'plugins'}

    for key in list(options_kwargs.keys()):
        if key in ocr_fields or key in legacy_mode_flags or key in known_extra:
            continue
        extra_attrs[key] = options_kwargs.pop(key)

    with _api_lock:
        # Set up plugin infrastructure with proper initialization
        plugin_manager = setup_plugin_infrastructure(
            plugins=plugins, plugin_manager=plugin_manager
        )
        plugin_manager.add_options(parser=get_parser())

        # Create OcrOptions directly
        try:
            options = OcrOptions(**options_kwargs)
            # Add any extra attributes
            if extra_attrs:
                options.extra_attrs.update(extra_attrs)
        except Exception as e:
            raise TypeError(
                f"Failed to create OcrOptions for hOCR pipeline: {e}"
            ) from e

        return run_hocr_pipeline(options=options, plugin_manager=plugin_manager)


def _hocr_to_ocr_pdf(  # noqa: D417
    work_folder: Path,
    output_file: Path,
    *,
    jobs: int | None = None,
    use_threads: bool | None = None,
    optimize: int | None = None,
    jpg_quality: int | None = None,
    png_quality: int | None = None,
    jbig2_lossy: bool | None = None,  # Deprecated, ignored
    jbig2_page_group_size: int | None = None,  # Deprecated, ignored
    jbig2_threshold: float | None = None,
    pdfa_image_compression: str | None = None,
    color_conversion_strategy: str | None = None,
    fast_web_view: float | None = None,
    plugin_manager: OcrmypdfPluginManager | None = None,
    plugins: Sequence[Path | str] | None = None,
    **kwargs,
):
    """Run OCRmyPDF on a work folder and produce an output PDF.

    After running :func:`_pdf_to_hocr`, this function will run OCRmyPDF on the
    work folder to produce an output PDF. This function consolidates any
    changes made to the hOCR files in the work folder and produces a final PDF.

    For arguments not explicitly documented here, see documentation for the
    equivalent command line parameter.

    This API is **experimental** and subject to change.

    Args:
        work_folder: Work folder path, as generated by :func:`_pdf_to_hocr`.
        output_file: Output PDF file path.
        **kwargs: Keyword arguments. Those that are not OcrOptions fields are
            stored in ``options.extra_attrs``.

    Returns:
        Whatever ``run_hocr_to_ocr_pdf_pipeline`` returns.

    Raises:
        ValueError: If both plugins and plugin_manager are provided.
        TypeError: If constructing OcrOptions fails; the original exception is
            chained as the cause.
    """
    # Snapshot the call arguments first, while locals() holds nothing but the
    # parameters. Iterating locals() later would also pick up working
    # variables (the dict under construction, loop variables) and leak them
    # into the options; dict() also makes the snapshot safe to iterate.
    explicit_params = {
        name: value
        for name, value in dict(locals()).items()
        if name
        not in {'work_folder', 'output_file', 'kwargs', 'plugin_manager', 'plugins'}
        and value is not None
    }

    if plugins and plugin_manager:
        raise ValueError("plugins= and plugin_manager are mutually exclusive")

    if not plugins:
        plugins = []
    elif isinstance(plugins, str | Path):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    # Prepare kwargs for direct OcrOptions construction
    options_kwargs = kwargs.copy()

    # Set output file and handle special work_folder case
    options_kwargs['input_file'] = '/dev/null'  # Placeholder for hOCR to PDF pipeline
    options_kwargs['output_file'] = output_file

    # Add all the function parameters that were explicitly set
    options_kwargs.update(explicit_params)

    # Handle plugins
    if plugins:
        options_kwargs['plugins'] = plugins

    # Remove None values to let OcrOptions use its defaults
    options_kwargs = {k: v for k, v in options_kwargs.items() if v is not None}

    # Warn about deprecated jbig2 options and remove from kwargs
    if jbig2_lossy:
        warn(
            "jbig2_lossy is deprecated and will be ignored. "
            "Lossy JBIG2 has been removed due to character substitution risks."
        )
    options_kwargs.pop('jbig2_lossy', None)
    if jbig2_page_group_size:
        warn("jbig2_page_group_size is deprecated and will be ignored.")
    options_kwargs.pop('jbig2_page_group_size', None)

    # Add work_folder to options_kwargs since it's now a proper field
    options_kwargs['work_folder'] = work_folder

    # Remove any kwargs that aren't OcrOptions fields and store in extra_attrs
    extra_attrs = {}
    ocr_fields = set(OcrOptions.model_fields.keys())
    # Legacy mode flags are handled by OcrOptions model validator
    legacy_mode_flags = {'force_ocr', 'skip_text', 'redo_ocr'}
    known_extra = {'progress_bar', 'plugins'}

    for key in list(options_kwargs.keys()):
        if key in ocr_fields or key in legacy_mode_flags or key in known_extra:
            continue
        extra_attrs[key] = options_kwargs.pop(key)

    with _api_lock:
        # Set up plugin infrastructure with proper initialization
        plugin_manager = setup_plugin_infrastructure(
            plugins=plugins, plugin_manager=plugin_manager
        )
        plugin_manager.add_options(parser=get_parser())

        # Create OcrOptions directly
        try:
            options = OcrOptions(**options_kwargs)
            # Add any extra attributes
            if extra_attrs:
                options.extra_attrs.update(extra_attrs)
        except Exception as e:
            raise TypeError(
                f"Failed to create OcrOptions for hOCR to PDF pipeline: {e}"
            ) from e

        return run_hocr_to_ocr_pdf_pipeline(
            options=options, plugin_manager=plugin_manager
        )


__all__ = [
    'PageNumberFilter',
    'Verbosity',
    'check_options',
    'configure_logging',
    'create_options',
    'get_parser',
    'get_plugin_manager',
    'ocr',
    'run_pipeline',
    'run_pipeline_cli',
    'setup_plugin_infrastructure',
]