diff --git a/src/ocrmypdf/_plugin_manager.py b/src/ocrmypdf/_plugin_manager.py
index 6216a44b..1fbc1b76 100644
--- a/src/ocrmypdf/_plugin_manager.py
+++ b/src/ocrmypdf/_plugin_manager.py
@@ -15,6 +15,7 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see .
+import argparse
import importlib
import importlib.util
import sys
@@ -50,7 +51,9 @@ def get_plugin_manager(plugins: List[str], builtins=True):
return pm
-def get_parser_options_plugins(args):
+def get_parser_options_plugins(
+ args,
+) -> (argparse.ArgumentParser, argparse.Namespace, pluggy.PluginManager):
pre_options, _unused = plugins_only_parser.parse_known_args(args=args)
plugin_manager = get_plugin_manager(pre_options.plugins)
diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py
index 87302438..60eed5de 100644
--- a/src/ocrmypdf/_sync.py
+++ b/src/ocrmypdf/_sync.py
@@ -216,25 +216,6 @@ def exec_concurrent(context):
if max_workers > 1:
log.info("Start processing %d pages concurrently", max_workers)
- # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
- # to manage how many threads it uses to avoid creating total threads than cores.
- # Performance testing shows we're better off
- # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we
- # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the
- # input file is small, then we allow Tesseract to use threads, subject to the
- # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.
- # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.
- tess_threads = min(3, context.options.jobs // max_workers)
- if context.options.tesseract_env is None:
- context.options.tesseract_env = os.environ.copy()
- context.options.tesseract_env.setdefault('OMP_THREAD_LIMIT', str(tess_threads))
- try:
- tess_threads = int(context.options.tesseract_env['OMP_THREAD_LIMIT'])
- except ValueError: # OMP_THREAD_LIMIT initialized to non-numeric
- context.log.error("Environment variable OMP_THREAD_LIMIT is not numeric")
- if tess_threads > 1:
- log.info("Using Tesseract OpenMP thread limit %d", tess_threads)
-
sidecars = [None] * len(context.pdfinfo)
ocrgraft = OcrGrafter(context)
diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py
index 7edce482..68ede13a 100644
--- a/src/ocrmypdf/api.py
+++ b/src/ocrmypdf/api.py
@@ -15,6 +15,7 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see .
+import inspect
import logging
import os
import sys
@@ -178,11 +179,6 @@ def create_options(
options = parser.parse_args(cmdline)
for keyword, val in deferred:
setattr(options, keyword, val)
-
- # If we are running a Tesseract spoof, ensure it knows what the input file is
- if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env:
- options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
-
return options
@@ -233,7 +229,6 @@ def ocr( # pylint: disable=unused-argument
plugins: Iterable[str] = None,
keep_temporary_files: bool = None,
progress_bar: bool = None,
- tesseract_env: Dict[str, str] = None,
**kwargs,
):
"""Run OCRmyPDF on one PDF or image.
@@ -245,7 +240,6 @@ def ocr( # pylint: disable=unused-argument
use_threads (bool): Use worker threads instead of processes. This reduces
performance but may make debugging easier since it is easier to set
breakpoints.
- tesseract_env (dict): Override environment variables for Tesseract
Raises:
ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
with the OCR layer.
@@ -274,10 +268,13 @@ def ocr( # pylint: disable=unused-argument
parser = get_parser()
_plugin_manager = get_plugin_manager(plugins)
- _plugin_manager.hook.add_options(parser=parser)
+ _plugin_manager.hook.add_options(parser=parser) # pylint: disable=no-member
- options = create_options(
- **{k: v for k, v in locals().items() if not k.startswith('_')}
- )
+ create_options_kwargs = {
+ k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
+ }
+ create_options_kwargs.update(kwargs)
+
+ options = create_options(**create_options_kwargs)
check_options(options, _plugin_manager)
return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
diff --git a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
index e2fcfb6d..4991baff 100644
--- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
+++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
@@ -15,7 +15,9 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see .
+import argparse
import logging
+import os
from ocrmypdf import hookimpl
from ocrmypdf.cli import numeric
@@ -79,6 +81,7 @@ def add_options(parser):
metavar='FILE',
help="Specify the location of the Tesseract user patterns file.",
)
+ tess.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
@hookimpl
@@ -115,6 +118,28 @@ def check_options(options):
)
+@hookimpl
+def validate(pdfinfo, options):
+ # If we are running a Tesseract spoof, ensure it knows what the input file is
+ if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env:
+ options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(options.input_file)
+
+ # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
+ # to manage how many threads it uses to avoid creating total threads than cores.
+ # Performance testing shows we're better off
+ # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we
+ # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the
+ # input file is small, then we allow Tesseract to use threads, subject to the
+ # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.
+ # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.
+ if not options.tesseract_env.get('OMP_THREAD_LIMIT', '').isnumeric():
+ tess_threads = min(3, options.jobs // len(pdfinfo), len(pdfinfo))
+ options.tesseract_env['OMP_THREAD_LIMIT'] = str(tess_threads)
+
+ if tess_threads > 1:
+ log.info("Using Tesseract OpenMP thread limit %d", tess_threads)
+
+
class TesseractOcrEngine(OcrEngine):
@staticmethod
def version():
diff --git a/src/ocrmypdf/cli.py b/src/ocrmypdf/cli.py
index ccf66d99..a34a2108 100644
--- a/src/ocrmypdf/cli.py
+++ b/src/ocrmypdf/cli.py
@@ -474,7 +474,6 @@ Online documentation is located at:
action='store_true',
help="Keep temporary files (helpful for debugging)",
)
- debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
return parser
diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py
index 432b1946..dd659458 100644
--- a/src/ocrmypdf/pluginspec.py
+++ b/src/ocrmypdf/pluginspec.py
@@ -40,7 +40,7 @@ def add_options(parser: ArgumentParser) -> None:
@hookspec
def check_options(options: Namespace) -> None:
- """Called to notify a plugin that a file will be processed.
+ """Called to ask the plugin to check all of its options.
The plugin may modify the *options*. All objects that are in options must
be picklable so they can be marshalled to child worker processes.
diff --git a/tests/test_unpaper.py b/tests/test_unpaper.py
index e87800c9..bd04da2c 100644
--- a/tests/test_unpaper.py
+++ b/tests/test_unpaper.py
@@ -20,7 +20,7 @@ from unittest.mock import patch
import pytest
-from ocrmypdf._plugin_manager import get_plugin_manager
+from ocrmypdf._plugin_manager import get_parser_options_plugins
from ocrmypdf._validation import check_options
from ocrmypdf.cli import get_parser
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
@@ -43,13 +43,13 @@ def spoof_unpaper_oldversion(tmp_path_factory):
def test_no_unpaper(resources, no_outpdf):
input_ = fspath(resources / "c02-22.pdf")
output = fspath(no_outpdf)
- options = get_parser().parse_args(args=["--clean", input_, output])
- plugin_manager = get_plugin_manager(options.plugins)
+
+ _parser, options, pm = get_parser_options_plugins(["--clean", input_, output])
with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
with pytest.raises(MissingDependencyError):
- check_options(options, plugin_manager)
+ check_options(options, pm)
def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):