From e02f6c1e97c4353834f7c982ec2d79c15b60aef7 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Sat, 2 May 2020 03:34:31 -0700
Subject: [PATCH] Support plugin invocation with API

---
 src/ocrmypdf/__main__.py        |  11 +-
 src/ocrmypdf/_jobcontext.py     |  12 +-
 src/ocrmypdf/_pipeline.py       |   4 +-
 src/ocrmypdf/_plugin_manager.py |   6 +-
 src/ocrmypdf/_sync.py           |   9 +-
 src/ocrmypdf/api.py             |  32 +-
 src/ocrmypdf/cli.py             | 791 ++++++++++++++++----------------
 src/ocrmypdf/optimize.py        |   2 +-
 tests/conftest.py               |   8 +-
 tests/test_metadata.py          |  17 +-
 tests/test_unpaper.py           |   4 +-
 tests/test_validation.py        |   5 +-
 12 files changed, 472 insertions(+), 429 deletions(-)

diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py
index c62c9409..b6dc0466 100755
--- a/src/ocrmypdf/__main__.py
+++ b/src/ocrmypdf/__main__.py
@@ -26,7 +26,7 @@ from ocrmypdf._plugin_manager import get_plugin_manager
 from ocrmypdf._sync import run_pipeline
 from ocrmypdf._validation import check_closed_streams, check_options
 from ocrmypdf.api import Verbosity, configure_logging
-from ocrmypdf.cli import parser, plugins_only_parser
+from ocrmypdf.cli import get_parser, plugins_only_parser
 from ocrmypdf.exceptions import BadArgsError, ExitCode, MissingDependencyError
 
 log = logging.getLogger('ocrmypdf')
@@ -34,9 +34,10 @@ log = logging.getLogger('ocrmypdf')
 
 def run(args=None):
     pre_options, _unused = plugins_only_parser.parse_known_args(args=args)
-    if pre_options.plugins:
-        pm = get_plugin_manager(pre_options)
-        pm.hook.install_cli(parser=parser)
+    plugin_manager = get_plugin_manager(pre_options.plugins)
+
+    parser = get_parser()
+    plugin_manager.hook.install_cli(parser=parser)
 
     options = parser.parse_args(args=args)
 
@@ -68,7 +69,7 @@ def run(args=None):
         log.error(e)
         return ExitCode.missing_dependency
 
-    result = run_pipeline(options=options)
+    result = run_pipeline(options=options, plugin_manager=plugin_manager)
     return result
 
 
diff --git a/src/ocrmypdf/_jobcontext.py b/src/ocrmypdf/_jobcontext.py
index ea48380b..7158abe8 100644
--- a/src/ocrmypdf/_jobcontext.py
+++ b/src/ocrmypdf/_jobcontext.py
@@ -69,13 +69,19 @@ class PageContext:
 
     def __getstate__(self):
         state = self.__dict__.copy()
-        del state['plugin_manager']
-        state['construct_plugin_manager'] = partial(get_plugin_manager, self.options)
+        if state['plugin_manager'] is not None:
+            del state['plugin_manager']
+            state['construct_plugin_manager'] = partial(
+                get_plugin_manager, self.options.plugins
+            )
         return state
 
     def __setstate__(self, state):
         self.__dict__.update(state)
-        self.plugin_manager = self.__dict__['construct_plugin_manager']()
+        # No constructor is pickled when plugin_manager was None; default to a
+        # no-op factory so the unconditional del below cannot raise KeyError.
+        make_pm = self.__dict__.setdefault('construct_plugin_manager', lambda: None)
+        self.plugin_manager = make_pm()
         del self.__dict__['construct_plugin_manager']
 
 
diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
index a8d6365f..cd82fea4 100644
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -522,9 +522,11 @@ def create_ocr_image(image, page_context):
 
         del draw
 
-        im = page_context.plugin_manager.hook.filter_ocr_image(
+        filter_im = page_context.plugin_manager.hook.filter_ocr_image(
             page=page_context, image=im
         )
+        if filter_im is not None:
+            im = filter_im
 
         # Pillow requires integer DPI
         dpi = tuple(round(coord) for coord in im.info['dpi'])
diff --git a/src/ocrmypdf/_plugin_manager.py b/src/ocrmypdf/_plugin_manager.py
index 2c3df5a1..295e955c 100644
--- a/src/ocrmypdf/_plugin_manager.py
+++ b/src/ocrmypdf/_plugin_manager.py
@@ -16,17 +16,17 @@
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
 
 import importlib
+from typing import List
 
 import pluggy
 
 from ocrmypdf import pluginspec
 
 
-def get_plugin_manager(options):
+def get_plugin_manager(plugins: List[str]):
     pm = pluggy.PluginManager('ocrmypdf')
     pm.add_hookspecs(pluginspec)
-
-    for name in options.plugins:
+    for name in plugins:
         module = importlib.import_module(name)
         pm.register(module)
     return pm
diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py
index 1691eb00..0834993d 100644
--- a/src/ocrmypdf/_sync.py
+++ b/src/ocrmypdf/_sync.py
@@ -293,12 +293,14 @@ def configure_debug_logging(log_filename, prefix=''):
     return log_file_handler
 
 
-def run_pipeline(options, api=False):
+def run_pipeline(options, *, plugin_manager, api=False):
     # Any changes to options will not take effect for options that are already
     # bound to function parameters in the pipeline. (For example
     # options.input_file, options.pdf_renderer are already bound.)
     if not options.jobs:
         options.jobs = available_cpu_count()
+    if not plugin_manager:
+        plugin_manager = get_plugin_manager(options.plugins)
 
     work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
     debug_log_handler = None
@@ -307,7 +309,6 @@ def run_pipeline(options, api=False):
     ):
         debug_log_handler = configure_debug_logging(Path(work_folder) / "debug.log")
 
-    pm = get_plugin_manager(options)
     try:
         check_requested_output_file(options)
         start_input_file, original_filename = create_input_file(options, work_folder)
@@ -320,7 +321,7 @@ def run_pipeline(options, api=False):
             options,
         )
 
-        pm.hook.prepare(options=options)
+        plugin_manager.hook.prepare(options=options)
 
         # Gather pdfinfo and create context
         pdfinfo = get_pdfinfo(
@@ -329,7 +330,7 @@ def run_pipeline(options, api=False):
             max_workers=options.jobs if not options.use_threads else 1,  # To help debug
         )
 
-        context = PDFContext(options, work_folder, origin_pdf, pdfinfo, pm)
+        context = PDFContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)
 
         # Validate options are okay for this pdf
         validate_pdfinfo_options(context)
diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py
index efa24c7f..cc7e103f 100644
--- a/src/ocrmypdf/api.py
+++ b/src/ocrmypdf/api.py
@@ -18,15 +18,17 @@
 import logging
 import os
 import sys
+from argparse import ArgumentParser
 from contextlib import suppress
 from enum import IntEnum
 from pathlib import Path
 from typing import Dict, Iterable
 
-from ._logging import PageNumberFilter, TqdmConsole
-from ._sync import run_pipeline
-from ._validation import check_options
-from .cli import parser
+from ocrmypdf._logging import PageNumberFilter, TqdmConsole
+from ocrmypdf._plugin_manager import get_plugin_manager
+from ocrmypdf._sync import run_pipeline
+from ocrmypdf._validation import check_options
+from ocrmypdf.cli import get_parser, plugins_only_parser
 
 try:
     import coloredlogs
@@ -125,7 +127,13 @@ def configure_logging(
     return log
 
 
-def create_options(*, input_file: os.PathLike, output_file: os.PathLike, **kwargs):
+def create_options(
+    *,
+    input_file: os.PathLike,
+    output_file: os.PathLike,
+    parser: ArgumentParser,
+    **kwargs,
+):
     cmdline = []
     deferred = []
 
@@ -223,9 +231,11 @@ def ocr(  # pylint: disable=unused-argument
     user_words: os.PathLike = None,
     user_patterns: os.PathLike = None,
     fast_web_view: float = None,
+    plugins: Iterable[str] = None,
     keep_temporary_files: bool = None,
     progress_bar: bool = None,
     tesseract_env: Dict[str, str] = None,
+    **kwargs,
 ):
     """Run OCRmyPDF on one PDF or image.
 
@@ -260,7 +270,15 @@ def ocr(  # pylint: disable=unused-argument
     Returns:
         :class:`ocrmypdf.ExitCode`
     """
+    if not plugins:
+        plugins = []
 
-    options = create_options(**locals())
+    parser = get_parser()
+    _plugin_manager = get_plugin_manager(plugins)
+    _plugin_manager.hook.install_cli(parser=parser)
+
+    # Expand plugin-specific keyword arguments instead of passing a 'kwargs' key
+    _own = {k: v for k, v in locals().items() if k[0] != '_' and k != 'kwargs'}
+    options = create_options(**_own, **kwargs)
     check_options(options)
-    return run_pipeline(options, api=True)
+    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
diff --git a/src/ocrmypdf/cli.py b/src/ocrmypdf/cli.py
index d243364a..3edf6cd7 100644
--- a/src/ocrmypdf/cli.py
+++ b/src/ocrmypdf/cli.py
@@ -17,10 +17,8 @@
 
 import argparse
 
-from ._version import PROGRAM_NAME as _PROGRAM_NAME
-from ._version import __version__ as _VERSION
-
-__all__ = ['parser']
+from ocrmypdf._version import PROGRAM_NAME as _PROGRAM_NAME
+from ocrmypdf._version import __version__ as _VERSION
 
 
 def numeric(basetype, min_=None, max_=None):
@@ -56,18 +54,19 @@ class ArgumentParser(argparse.ArgumentParser):
         raise ValueError(message)
 
 
-parser = ArgumentParser(
-    prog=_PROGRAM_NAME,
-    fromfile_prefix_chars='@',
-    formatter_class=argparse.RawDescriptionHelpFormatter,
-    description="""\
+def get_parser():
+    parser = ArgumentParser(
+        prog=_PROGRAM_NAME,
+        fromfile_prefix_chars='@',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description="""\
 Generates a searchable PDF or PDF/A from a regular PDF.
 
 OCRmyPDF rasterizes each page of the input PDF, optionally corrects page
 rotation and performs image processing, runs the Tesseract OCR engine on the
 image, and then creates a PDF from the OCR information.
 """,
-    epilog="""\
+        epilog="""\
 OCRmyPDF attempts to keep the output file at about the same size.  If a file
 contains losslessly compressed images, and output file will be losslessly
 compressed as well.
@@ -108,395 +107,409 @@ Online documentation is located at:
     https://ocrmypdf.readthedocs.io/en/latest/introduction.html
 
 """,
-)
+    )
 
-parser.add_argument(
-    'input_file',
-    metavar="input_pdf_or_image",
-    help="PDF file containing the images to be OCRed (or '-' to read from "
-    "standard input)",
-)
-parser.add_argument(
-    'output_file',
-    metavar="output_pdf",
-    help="Output searchable PDF file (or '-' to write to standard output). "
-    "Existing files will be ovewritten. If same as input file, the "
-    "input file will be updated only if processing is successful.",
-)
-parser.add_argument(
-    '-l',
-    '--language',
-    action='append',
-    help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
-    "all language packs installed in your system). Use -l eng+deu for "
-    "multiple languages.",
-)
-parser.add_argument(
-    '--image-dpi',
-    metavar='DPI',
-    type=int,
-    help="For input image instead of PDF, use this DPI instead of file's.",
-)
-parser.add_argument(
-    '--output-type',
-    choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
-    default='pdfa',
-    help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
-    "long term archiving (default, recommended) but may not suitable "
-    "for users who want their file altered as little as possible. 'pdfa' "
-    "also has problems with full Unicode text. 'pdf' attempts to "
-    "preserve file contents as much as possible. 'pdf-a1' creates a "
-    "PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
-    "PDF/A3-b file.",
-)
+    parser.add_argument(
+        'input_file',
+        metavar="input_pdf_or_image",
+        help="PDF file containing the images to be OCRed (or '-' to read from "
+        "standard input)",
+    )
+    parser.add_argument(
+        'output_file',
+        metavar="output_pdf",
+        help="Output searchable PDF file (or '-' to write to standard output). "
+        "Existing files will be ovewritten. If same as input file, the "
+        "input file will be updated only if processing is successful.",
+    )
+    parser.add_argument(
+        '-l',
+        '--language',
+        action='append',
+        help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
+        "all language packs installed in your system). Use -l eng+deu for "
+        "multiple languages.",
+    )
+    parser.add_argument(
+        '--image-dpi',
+        metavar='DPI',
+        type=int,
+        help="For input image instead of PDF, use this DPI instead of file's.",
+    )
+    parser.add_argument(
+        '--output-type',
+        choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
+        default='pdfa',
+        help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
+        "long term archiving (default, recommended) but may not be suitable "
+        "for users who want their file altered as little as possible. 'pdfa' "
+        "also has problems with full Unicode text. 'pdf' attempts to "
+        "preserve file contents as much as possible. 'pdfa-1' creates a "
+        "PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' creates a "
+        "PDF/A-3b file.",
+    )
 
-# Use null string '\0' as sentinel to indicate the user supplied no argument,
-# since that is the only invalid character for filepaths on all platforms
-# bool('\0') is True in Python
-parser.add_argument(
-    '--sidecar',
-    nargs='?',
-    const='\0',
-    default=None,
-    metavar='FILE',
-    help="Generate sidecar text files that contain the same text recognized "
-    "by Tesseract. This may be useful for building a OCR text database. "
-    "If FILE is omitted, the sidecar file be named {output_file}.txt "
-    "If FILE is set to '-', the sidecar is written to stdout (a "
-    "convenient way to preview OCR quality). The output file and sidecar "
-    "may not both use stdout at the same time.",
-)
+    # Use null string '\0' as sentinel to indicate the user supplied no argument,
+    # since that is the only invalid character for filepaths on all platforms
+    # bool('\0') is True in Python
+    parser.add_argument(
+        '--sidecar',
+        nargs='?',
+        const='\0',
+        default=None,
+        metavar='FILE',
+        help="Generate sidecar text files that contain the same text recognized "
+        "by Tesseract. This may be useful for building a OCR text database. "
+        "If FILE is omitted, the sidecar file be named {output_file}.txt "
+        "If FILE is set to '-', the sidecar is written to stdout (a "
+        "convenient way to preview OCR quality). The output file and sidecar "
+        "may not both use stdout at the same time.",
+    )
 
-parser.add_argument(
-    '--version',
-    action='version',
-    version=_VERSION,
-    help="Print program version and exit",
-)
+    parser.add_argument(
+        '--version',
+        action='version',
+        version=_VERSION,
+        help="Print program version and exit",
+    )
 
-jobcontrol = parser.add_argument_group("Job control options")
-jobcontrol.add_argument(
-    '-j',
-    '--jobs',
-    metavar='N',
-    type=numeric(int, 0, 256),
-    help="Use up to N CPU cores simultaneously (default: use all).",
-)
-jobcontrol.add_argument(
-    '-q', '--quiet', action='store_true', help="Suppress INFO messages"
-)
-jobcontrol.add_argument(
-    '-v',
-    '--verbose',
-    type=numeric(int, 0, 2),
-    default=0,
-    const=1,
-    nargs='?',
-    help="Print more verbose messages for each additional verbose level. Use "
-    "`-v 1` typically for much more detailed logging. Higher numbers "
-    "are probably only useful in debugging.",
-)
-jobcontrol.add_argument(
-    '--no-progress-bar',
-    action='store_false',
-    dest='progress_bar',
-    help=argparse.SUPPRESS,
-)
-jobcontrol.add_argument('--use-threads', action='store_true', help=argparse.SUPPRESS)
+    jobcontrol = parser.add_argument_group("Job control options")
+    jobcontrol.add_argument(
+        '-j',
+        '--jobs',
+        metavar='N',
+        type=numeric(int, 0, 256),
+        help="Use up to N CPU cores simultaneously (default: use all).",
+    )
+    jobcontrol.add_argument(
+        '-q', '--quiet', action='store_true', help="Suppress INFO messages"
+    )
+    jobcontrol.add_argument(
+        '-v',
+        '--verbose',
+        type=numeric(int, 0, 2),
+        default=0,
+        const=1,
+        nargs='?',
+        help="Print more verbose messages for each additional verbose level. Use "
+        "`-v 1` typically for much more detailed logging. Higher numbers "
+        "are probably only useful in debugging.",
+    )
+    jobcontrol.add_argument(
+        '--no-progress-bar',
+        action='store_false',
+        dest='progress_bar',
+        help=argparse.SUPPRESS,
+    )
+    jobcontrol.add_argument(
+        '--use-threads', action='store_true', help=argparse.SUPPRESS
+    )
 
-metadata = parser.add_argument_group(
-    "Metadata options",
-    "Set output PDF/A metadata (default: copy input document's metadata)",
-)
-metadata.add_argument(
-    '--title', type=str, help="Set document title (place multiple words in quotes)"
-)
-metadata.add_argument('--author', type=str, help="Set document author")
-metadata.add_argument('--subject', type=str, help="Set document subject description")
-metadata.add_argument('--keywords', type=str, help="Set document keywords")
+    metadata = parser.add_argument_group(
+        "Metadata options",
+        "Set output PDF/A metadata (default: copy input document's metadata)",
+    )
+    metadata.add_argument(
+        '--title', type=str, help="Set document title (place multiple words in quotes)"
+    )
+    metadata.add_argument('--author', type=str, help="Set document author")
+    metadata.add_argument(
+        '--subject', type=str, help="Set document subject description"
+    )
+    metadata.add_argument('--keywords', type=str, help="Set document keywords")
 
-preprocessing = parser.add_argument_group(
-    "Image preprocessing options",
-    "Options to improve the quality of the final PDF and OCR",
-)
-preprocessing.add_argument(
-    '-r',
-    '--rotate-pages',
-    action='store_true',
-    help="Automatically rotate pages based on detected text orientation",
-)
-preprocessing.add_argument(
-    '--remove-background',
-    action='store_true',
-    help="Attempt to remove background from gray or color pages, setting it "
-    "to white ",
-)
-preprocessing.add_argument(
-    '-d', '--deskew', action='store_true', help="Deskew each page before performing OCR"
-)
-preprocessing.add_argument(
-    '-c',
-    '--clean',
-    action='store_true',
-    help="Clean pages from scanning artifacts before performing OCR, and send "
-    "the cleaned page to OCR, but do not include the cleaned page in "
-    "the output",
-)
-preprocessing.add_argument(
-    '-i',
-    '--clean-final',
-    action='store_true',
-    help="Clean page as above, and incorporate the cleaned image in the final "
-    "PDF.  Might remove desired content.",
-)
-preprocessing.add_argument(
-    '--unpaper-args',
-    type=str,
-    default=None,
-    help="A quoted string of arguments to pass to unpaper. Requires --clean. "
-    "Example: --unpaper-args '--layout double'.",
-)
-preprocessing.add_argument(
-    '--oversample',
-    metavar='DPI',
-    type=numeric(int, 0, 5000),
-    default=0,
-    help="Oversample images to at least the specified DPI, to improve OCR "
-    "results slightly",
-)
-preprocessing.add_argument(
-    '--remove-vectors',
-    action='store_true',
-    help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they "
-    "will not be included in OCR. This can eliminate false characters.",
-)
-preprocessing.add_argument(
-    '--threshold',
-    action='store_true',
-    help="EXPERIMENTAL. Threshold image to 1bpp before sending it to Tesseract for OCR. Can "
-    "improve OCR quality compared to Tesseract's thresholder.",
-)
+    preprocessing = parser.add_argument_group(
+        "Image preprocessing options",
+        "Options to improve the quality of the final PDF and OCR",
+    )
+    preprocessing.add_argument(
+        '-r',
+        '--rotate-pages',
+        action='store_true',
+        help="Automatically rotate pages based on detected text orientation",
+    )
+    preprocessing.add_argument(
+        '--remove-background',
+        action='store_true',
+        help="Attempt to remove background from gray or color pages, setting it "
+        "to white ",
+    )
+    preprocessing.add_argument(
+        '-d',
+        '--deskew',
+        action='store_true',
+        help="Deskew each page before performing OCR",
+    )
+    preprocessing.add_argument(
+        '-c',
+        '--clean',
+        action='store_true',
+        help="Clean pages from scanning artifacts before performing OCR, and send "
+        "the cleaned page to OCR, but do not include the cleaned page in "
+        "the output",
+    )
+    preprocessing.add_argument(
+        '-i',
+        '--clean-final',
+        action='store_true',
+        help="Clean page as above, and incorporate the cleaned image in the final "
+        "PDF.  Might remove desired content.",
+    )
+    preprocessing.add_argument(
+        '--unpaper-args',
+        type=str,
+        default=None,
+        help="A quoted string of arguments to pass to unpaper. Requires --clean. "
+        "Example: --unpaper-args '--layout double'.",
+    )
+    preprocessing.add_argument(
+        '--oversample',
+        metavar='DPI',
+        type=numeric(int, 0, 5000),
+        default=0,
+        help="Oversample images to at least the specified DPI, to improve OCR "
+        "results slightly",
+    )
+    preprocessing.add_argument(
+        '--remove-vectors',
+        action='store_true',
+        help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they "
+        "will not be included in OCR. This can eliminate false characters.",
+    )
+    preprocessing.add_argument(
+        '--threshold',
+        action='store_true',
+        help=(
+            "EXPERIMENTAL. Threshold image to 1bpp before sending it to Tesseract "
+            "for OCR. Can improve OCR quality compared to Tesseract's thresholder."
+        ),
+    )
 
-ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied")
-ocrsettings.add_argument(
-    '-f',
-    '--force-ocr',
-    action='store_true',
-    help="Rasterize any text or vector objects on each page, apply OCR, and "
-    "save the rastered output (this rewrites the PDF)",
-)
-ocrsettings.add_argument(
-    '-s',
-    '--skip-text',
-    action='store_true',
-    help="Skip OCR on any pages that already contain text, but include the "
-    "page in final output; useful for PDFs that contain a mix of "
-    "images, text pages, and/or previously OCRed pages",
-)
-ocrsettings.add_argument(
-    '--redo-ocr',
-    action='store_true',
-    help="Attempt to detect and remove the hidden OCR layer from files that "
-    "were previously OCRed with OCRmyPDF or another program. Apply OCR "
-    "to text found in raster images. Existing visible text objects will "
-    "not be changed. If there is no existing OCR, OCR will be added.",
-)
-ocrsettings.add_argument(
-    '--skip-big',
-    type=numeric(float, 0, 5000),
-    metavar='MPixels',
-    help="Skip OCR on pages larger than the specified amount of megapixels, "
-    "but include skipped pages in final output",
-)
+    ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied")
+    ocrsettings.add_argument(
+        '-f',
+        '--force-ocr',
+        action='store_true',
+        help="Rasterize any text or vector objects on each page, apply OCR, and "
+        "save the rastered output (this rewrites the PDF)",
+    )
+    ocrsettings.add_argument(
+        '-s',
+        '--skip-text',
+        action='store_true',
+        help="Skip OCR on any pages that already contain text, but include the "
+        "page in final output; useful for PDFs that contain a mix of "
+        "images, text pages, and/or previously OCRed pages",
+    )
+    ocrsettings.add_argument(
+        '--redo-ocr',
+        action='store_true',
+        help="Attempt to detect and remove the hidden OCR layer from files that "
+        "were previously OCRed with OCRmyPDF or another program. Apply OCR "
+        "to text found in raster images. Existing visible text objects will "
+        "not be changed. If there is no existing OCR, OCR will be added.",
+    )
+    ocrsettings.add_argument(
+        '--skip-big',
+        type=numeric(float, 0, 5000),
+        metavar='MPixels',
+        help="Skip OCR on pages larger than the specified amount of megapixels, "
+        "but include skipped pages in final output",
+    )
 
-optimizing = parser.add_argument_group(
-    "Optimization options", "Control how the PDF is optimized after OCR"
-)
-optimizing.add_argument(
-    '-O',
-    '--optimize',
-    type=int,
-    choices=range(0, 4),
-    default=1,
-    help=(
-        "Control how PDF is optimized after processing:"
-        "0 - do not optimize; "
-        "1 - do safe, lossless optimizations (default); "
-        "2 - do some lossy optimizations; "
-        "3 - do aggressive lossy optimizations (including lossy JBIG2)"
-    ),
-)
-optimizing.add_argument(
-    '--jpeg-quality',
-    type=numeric(int, 0, 100),
-    default=0,
-    metavar='Q',
-    help=(
-        "Adjust JPEG quality level for JPEG optimization. "
-        "100 is best quality and largest output size; "
-        "1 is lowest quality and smallest output; "
-        "0 uses the default."
-    ),
-)
-optimizing.add_argument(
-    '--jpg-quality',
-    type=numeric(int, 0, 100),
-    default=0,
-    metavar='Q',
-    dest='jpeg_quality',
-    help=argparse.SUPPRESS,  # Alias for --jpeg-quality
-)
-optimizing.add_argument(
-    '--png-quality',
-    type=numeric(int, 0, 100),
-    default=0,
-    metavar='Q',
-    help=(
-        "Adjust PNG quality level to use when quantizing PNGs. "
-        "Values have same meaning as with --jpeg-quality"
-    ),
-)
-optimizing.add_argument(
-    '--jbig2-lossy',
-    action='store_true',
-    help=(
-        "Enable JBIG2 lossy mode (better compression, not suitable for some "
-        "use cases - see documentation)."
-    ),
-)
-optimizing.add_argument(
-    '--jbig2-page-group-size',
-    type=numeric(int, 1, 10000),
-    default=0,
-    metavar='N',
-    # Adjust number of pages to consider at once for JBIG2 compression
-    help=argparse.SUPPRESS,
-)
+    optimizing = parser.add_argument_group(
+        "Optimization options", "Control how the PDF is optimized after OCR"
+    )
+    optimizing.add_argument(
+        '-O',
+        '--optimize',
+        type=int,
+        choices=range(0, 4),
+        default=1,
+        help=(
+            "Control how PDF is optimized after processing:"
+            "0 - do not optimize; "
+            "1 - do safe, lossless optimizations (default); "
+            "2 - do some lossy optimizations; "
+            "3 - do aggressive lossy optimizations (including lossy JBIG2)"
+        ),
+    )
+    optimizing.add_argument(
+        '--jpeg-quality',
+        type=numeric(int, 0, 100),
+        default=0,
+        metavar='Q',
+        help=(
+            "Adjust JPEG quality level for JPEG optimization. "
+            "100 is best quality and largest output size; "
+            "1 is lowest quality and smallest output; "
+            "0 uses the default."
+        ),
+    )
+    optimizing.add_argument(
+        '--jpg-quality',
+        type=numeric(int, 0, 100),
+        default=0,
+        metavar='Q',
+        dest='jpeg_quality',
+        help=argparse.SUPPRESS,  # Alias for --jpeg-quality
+    )
+    optimizing.add_argument(
+        '--png-quality',
+        type=numeric(int, 0, 100),
+        default=0,
+        metavar='Q',
+        help=(
+            "Adjust PNG quality level to use when quantizing PNGs. "
+            "Values have same meaning as with --jpeg-quality"
+        ),
+    )
+    optimizing.add_argument(
+        '--jbig2-lossy',
+        action='store_true',
+        help=(
+            "Enable JBIG2 lossy mode (better compression, not suitable for some "
+            "use cases - see documentation)."
+        ),
+    )
+    optimizing.add_argument(
+        '--jbig2-page-group-size',
+        type=numeric(int, 1, 10000),
+        default=0,
+        metavar='N',
+        # Adjust number of pages to consider at once for JBIG2 compression
+        help=argparse.SUPPRESS,
+    )
 
-advanced = parser.add_argument_group(
-    "Advanced", "Advanced options to control Tesseract's OCR behavior"
-)
-advanced.add_argument(
-    '--pages',
-    type=str,
-    help="Limit OCR to the specified pages (ranges or comma separated), skipping others",
-)
-advanced.add_argument(
-    '--max-image-mpixels',
-    action='store',
-    type=numeric(float, 0),
-    metavar='MPixels',
-    help="Set maximum number of pixels to unpack before treating an image as a "
-    "decompression bomb",
-    default=128.0,
-)
-advanced.add_argument(
-    '--tesseract-config',
-    action='append',
-    metavar='CFG',
-    default=[],
-    help="Additional Tesseract configuration files -- see documentation",
-)
-advanced.add_argument(
-    '--tesseract-pagesegmode',
-    action='store',
-    type=int,
-    metavar='PSM',
-    choices=range(0, 14),
-    help="Set Tesseract page segmentation mode (see tesseract --help)",
-)
-advanced.add_argument(
-    '--tesseract-oem',
-    action='store',
-    type=int,
-    metavar='MODE',
-    choices=range(0, 4),
-    help=(
-        "Set Tesseract 4.0 OCR engine mode: "
-        "0 - original Tesseract only; "
-        "1 - neural nets LSTM only; "
-        "2 - Tesseract + LSTM; "
-        "3 - default."
-    ),
-)
-advanced.add_argument(
-    '--pdf-renderer',
-    choices=['auto', 'hocr', 'sandwich'],
-    default='auto',
-    help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
-    "choose.  See documentation for discussion.",
-)
-advanced.add_argument(
-    '--tesseract-timeout',
-    default=180.0,
-    type=numeric(float, 0),
-    metavar='SECONDS',
-    help='Give up on OCR after the timeout, but copy the preprocessed page '
-    'into the final output',
-)
-advanced.add_argument(
-    '--rotate-pages-threshold',
-    default=14.0,
-    type=numeric(float, 0, 1000),
-    metavar='CONFIDENCE',
-    help="Only rotate pages when confidence is above this value (arbitrary "
-    "units reported by tesseract)",
-)
-advanced.add_argument(
-    '--pdfa-image-compression',
-    choices=['auto', 'jpeg', 'lossless'],
-    default='auto',
-    help="Specify how to compress images in the output PDF/A. 'auto' lets "
-    "OCRmyPDF decide.  'jpeg' changes all grayscale and color images to "
-    "JPEG compression.  'lossless' uses PNG-style lossless compression "
-    "for all images.  Monochrome images are always compressed using a "
-    "lossless codec.  Compression settings "
-    "are applied to all pages, including those for which OCR was "
-    "skipped.  Not supported for --output-type=pdf ; that setting "
-    "preserves the original compression of all images.",
-)
-advanced.add_argument(
-    '--user-words',
-    metavar='FILE',
-    help="Specify the location of the Tesseract user words file. This is a "
-    "list of words Tesseract should consider while performing OCR in "
-    "addition to its standard language dictionaries. This can improve "
-    "OCR quality especially for specialized and technical documents.",
-)
-advanced.add_argument(
-    '--user-patterns',
-    metavar='FILE',
-    help="Specify the location of the Tesseract user patterns file.",
-)
-advanced.add_argument(
-    '--fast-web-view',
-    type=numeric(float, 0),
-    default=1.0,
-    metavar="MEGABYTES",
-    help="If the size of file is more than this threshold (in MB), then "
-    "linearize the PDF for fast web viewing. This allows the PDF to be "
-    "displayed before it is fully downloaded in web browsers, but increases "
-    "the space required slightly. By default we skip this for small files "
-    "which do not benefit. If the threshold is 0 it will be apply to all files. "
-    "Set the threshold very high to disable.",
-)
-advanced.add_argument(
-    '--plugins',
-    action='append',
-    default=[],
-    help="Path to a folder than contains plugins.",
-)
+    advanced = parser.add_argument_group(
+        "Advanced", "Advanced options to control Tesseract's OCR behavior"
+    )
+    advanced.add_argument(
+        '--pages',
+        type=str,
+        help=(
+            "Limit OCR to the specified pages (ranges or comma separated), "
+            "skipping others"  # no trailing comma: help must be a str, not a tuple
+        ),
+    )
+    advanced.add_argument(
+        '--max-image-mpixels',
+        action='store',
+        type=numeric(float, 0),
+        metavar='MPixels',
+        help="Set maximum number of pixels to unpack before treating an image as a "
+        "decompression bomb",
+        default=128.0,
+    )
+    advanced.add_argument(
+        '--tesseract-config',
+        action='append',
+        metavar='CFG',
+        default=[],
+        help="Additional Tesseract configuration files -- see documentation",
+    )
+    advanced.add_argument(
+        '--tesseract-pagesegmode',
+        action='store',
+        type=int,
+        metavar='PSM',
+        choices=range(0, 14),
+        help="Set Tesseract page segmentation mode (see tesseract --help)",
+    )
+    advanced.add_argument(
+        '--tesseract-oem',
+        action='store',
+        type=int,
+        metavar='MODE',
+        choices=range(0, 4),
+        help=(
+            "Set Tesseract 4.0 OCR engine mode: "
+            "0 - original Tesseract only; "
+            "1 - neural nets LSTM only; "
+            "2 - Tesseract + LSTM; "
+            "3 - default."
+        ),
+    )
+    advanced.add_argument(
+        '--pdf-renderer',
+        choices=['auto', 'hocr', 'sandwich'],
+        default='auto',
+        help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
+        "choose.  See documentation for discussion.",
+    )
+    advanced.add_argument(
+        '--tesseract-timeout',
+        default=180.0,
+        type=numeric(float, 0),
+        metavar='SECONDS',
+        help='Give up on OCR after the timeout, but copy the preprocessed page '
+        'into the final output',
+    )
+    advanced.add_argument(
+        '--rotate-pages-threshold',
+        default=14.0,
+        type=numeric(float, 0, 1000),
+        metavar='CONFIDENCE',
+        help="Only rotate pages when confidence is above this value (arbitrary "
+        "units reported by tesseract)",
+    )
+    advanced.add_argument(
+        '--pdfa-image-compression',
+        choices=['auto', 'jpeg', 'lossless'],
+        default='auto',
+        help="Specify how to compress images in the output PDF/A. 'auto' lets "
+        "OCRmyPDF decide.  'jpeg' changes all grayscale and color images to "
+        "JPEG compression.  'lossless' uses PNG-style lossless compression "
+        "for all images.  Monochrome images are always compressed using a "
+        "lossless codec.  Compression settings "
+        "are applied to all pages, including those for which OCR was "
+        "skipped.  Not supported for --output-type=pdf ; that setting "
+        "preserves the original compression of all images.",
+    )
+    advanced.add_argument(
+        '--user-words',
+        metavar='FILE',
+        help="Specify the location of the Tesseract user words file. This is a "
+        "list of words Tesseract should consider while performing OCR in "
+        "addition to its standard language dictionaries. This can improve "
+        "OCR quality especially for specialized and technical documents.",
+    )
+    advanced.add_argument(
+        '--user-patterns',
+        metavar='FILE',
+        help="Specify the location of the Tesseract user patterns file.",
+    )
+    advanced.add_argument(
+        '--fast-web-view',
+        type=numeric(float, 0),
+        default=1.0,
+        metavar="MEGABYTES",
+        help="If the size of file is more than this threshold (in MB), then "
+        "linearize the PDF for fast web viewing. This allows the PDF to be "
+        "displayed before it is fully downloaded in web browsers, but increases "
+        "the space required slightly. By default we skip this for small files "
+        "which do not benefit. If the threshold is 0 it will be applied to all files. "
+        "Set the threshold very high to disable.",
+    )
+    advanced.add_argument(
+        '--plugins',
+        action='append',
+        default=[],
+        help="Path to a folder that contains plugins.",
+    )
+
+    debugging = parser.add_argument_group(
+        "Debugging", "Arguments to help with troubleshooting and debugging"
+    )
+    debugging.add_argument(
+        '-k',
+        '--keep-temporary-files',
+        action='store_true',
+        help="Keep temporary files (helpful for debugging)",
+    )
+    debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
+    return parser
 
-debugging = parser.add_argument_group(
-    "Debugging", "Arguments to help with troubleshooting and debugging"
-)
-debugging.add_argument(
-    '-k',
-    '--keep-temporary-files',
-    action='store_true',
-    help="Keep temporary files (helpful for debugging)",
-)
-debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
 
 plugins_only_parser = ArgumentParser(
     prog=_PROGRAM_NAME, fromfile_prefix_chars='@', add_help=False, allow_abbrev=False
diff --git a/src/ocrmypdf/optimize.py b/src/ocrmypdf/optimize.py
index e6f0bd3f..9459b70e 100644
--- a/src/ocrmypdf/optimize.py
+++ b/src/ocrmypdf/optimize.py
@@ -582,7 +582,7 @@ def main(infile, outfile, level, jobs=1):
     )
 
     with TemporaryDirectory() as td:
-        context = PDFContext(options, td, infile, None)
+        context = PDFContext(options, td, infile, None, None)
         tmpout = Path(td) / 'out.pdf'
         optimize(
             infile,
diff --git a/tests/conftest.py b/tests/conftest.py
index 8724ead5..54960cdc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -214,7 +214,7 @@ def no_outpdf(tmp_path):
 def check_ocrmypdf(input_file, output_file, *args, env=None):
     """Run ocrmypdf and confirmed that a valid file was created"""
 
-    options = cli.parser.parse_args(
+    options = cli.get_parser().parse_args(
         [str(input_file), str(output_file)]
         + [str(arg) for arg in args if arg is not None]
     )
@@ -222,7 +222,7 @@ def check_ocrmypdf(input_file, output_file, *args, env=None):
     if env:
         options.tesseract_env = env
         options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
-    result = api.run_pipeline(options, api=True)
+    result = api.run_pipeline(options, plugin_manager=None, api=True)
 
     assert result == 0
     assert os.path.exists(str(output_file)), "Output file not created"
@@ -238,7 +238,7 @@ def run_ocrmypdf_api(input_file, output_file, *args, env=None):
     Does not currently have a way to manipulate the PATH except for Tesseract.
     """
 
-    options = cli.parser.parse_args(
+    options = cli.get_parser().parse_args(
         [str(input_file), str(output_file)]
         + [str(arg) for arg in args if arg is not None]
     )
@@ -253,7 +253,7 @@ def run_ocrmypdf_api(input_file, output_file, *args, env=None):
     if options.tesseract_env:
         assert all(isinstance(v, (str, bytes)) for v in options.tesseract_env.values())
 
-    return api.run_pipeline(options, api=False)
+    return api.run_pipeline(options, plugin_manager=None, api=False)
 
 
 @pytest.helpers.register
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index 270a8b62..79e63604 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -32,7 +32,7 @@ from pikepdf.models.metadata import decode_pdf_date
 
 from ocrmypdf._jobcontext import PDFContext
 from ocrmypdf._pipeline import convert_to_pdfa
-from ocrmypdf.cli import parser
+from ocrmypdf.cli import get_parser
 from ocrmypdf.exceptions import ExitCode
 from ocrmypdf.pdfa import SRGB_ICC_PROFILE, file_claims_pdfa, generate_pdfa_ps
 from ocrmypdf.pdfinfo import PdfInfo
@@ -290,16 +290,15 @@ def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):
 
 
 def test_metadata_fixup_warning(resources, outdir, caplog):
-    from ocrmypdf.__main__ import parser
     from ocrmypdf._pipeline import metadata_fixup
 
-    options = parser.parse_args(
+    options = get_parser().parse_args(
         args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
     )
 
     copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')
 
-    context = PDFContext(options, outdir, outdir / 'graph.pdf', None)
+    context = PDFContext(options, outdir, outdir / 'graph.pdf', None, None)
     metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
     for record in caplog.records:
         assert record.levelname != 'WARNING'
@@ -310,7 +309,7 @@ def test_metadata_fixup_warning(resources, outdir, caplog):
         meta['prism2:publicationName'] = 'OCRmyPDF Test'
     graph.save(outdir / 'graph_mod.pdf')
 
-    context = PDFContext(options, outdir, outdir / 'graph_mod.pdf', None)
+    context = PDFContext(options, outdir, outdir / 'graph_mod.pdf', None, None)
     metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
     assert any(record.levelname == 'WARNING' for record in caplog.records)
 
@@ -326,11 +325,11 @@ def test_prevent_gs_invalid_xml(resources, outdir):
             Title=b'String with trailing nul\x00'
         )
 
-    options = parser.parse_args(
+    options = get_parser().parse_args(
         args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
     )
     pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
-    context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo)
+    context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, None)
 
     convert_to_pdfa(
         str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
@@ -357,11 +356,11 @@ def test_malformed_docinfo(caplog, resources, outdir):
         pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
         pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)
 
-    options = parser.parse_args(
+    options = get_parser().parse_args(
         args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
     )
     pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
-    context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo)
+    context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, None)
 
     convert_to_pdfa(
         str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
diff --git a/tests/test_unpaper.py b/tests/test_unpaper.py
index f0e90b80..7753d340 100644
--- a/tests/test_unpaper.py
+++ b/tests/test_unpaper.py
@@ -21,7 +21,7 @@ from unittest.mock import patch
 import pytest
 
 from ocrmypdf._validation import check_options
-from ocrmypdf.cli import parser
+from ocrmypdf.cli import get_parser
 from ocrmypdf.exceptions import ExitCode, MissingDependencyError
 from ocrmypdf.exec import unpaper
 
@@ -51,7 +51,7 @@ def spoof_unpaper_oldversion(tmp_path_factory):
 def test_no_unpaper(resources, no_outpdf):
     input_ = fspath(resources / "c02-22.pdf")
     output = fspath(no_outpdf)
-    options = parser.parse_args(args=["--clean", input_, output])
+    options = get_parser().parse_args(args=["--clean", input_, output])
 
     with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
         mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
diff --git a/tests/test_validation.py b/tests/test_validation.py
index f183aa46..1a453e78 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -23,6 +23,7 @@ import pytest
 
 import ocrmypdf._validation as vd
 from ocrmypdf.api import create_options
+from ocrmypdf.cli import get_parser
 from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
 from ocrmypdf.pdfinfo import PdfInfo
 
@@ -30,7 +31,9 @@ from ocrmypdf.pdfinfo import PdfInfo
 def make_opts(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs):
     if language is not None:
         kwargs['language'] = language
-    return create_options(input_file=input_file, output_file=output_file, **kwargs)
+    return create_options(
+        input_file=input_file, output_file=output_file, parser=get_parser(), **kwargs
+    )
 
 
 def test_hocr_notlatin_warning(caplog):