From aa060db5bc7f2c4017902360cc2b039bfca4d8bc Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 26 May 2020 02:13:17 -0700 Subject: [PATCH] Refactor tesseract_env variable into the plugin Removed all cases except one in api.py, which isn't worth solving because it should be removed anyway. This also fixes a logic error in the OMP_THREAD_LIMIT decision, api.py did not use pass kwargs correctly so they never worked before. --- src/ocrmypdf/_plugin_manager.py | 5 +++- src/ocrmypdf/_sync.py | 19 -------------- src/ocrmypdf/api.py | 19 ++++++-------- src/ocrmypdf/builtin_plugins/tesseract_ocr.py | 25 +++++++++++++++++++ src/ocrmypdf/cli.py | 1 - src/ocrmypdf/pluginspec.py | 2 +- tests/test_unpaper.py | 8 +++--- 7 files changed, 42 insertions(+), 37 deletions(-) diff --git a/src/ocrmypdf/_plugin_manager.py b/src/ocrmypdf/_plugin_manager.py index 6216a44b..1fbc1b76 100644 --- a/src/ocrmypdf/_plugin_manager.py +++ b/src/ocrmypdf/_plugin_manager.py @@ -15,6 +15,7 @@ # You should have received a copy of the GNU General Public License # along with OCRmyPDF. If not, see . +import argparse import importlib import importlib.util import sys @@ -50,7 +51,9 @@ def get_plugin_manager(plugins: List[str], builtins=True): return pm -def get_parser_options_plugins(args): +def get_parser_options_plugins( + args, +) -> (argparse.ArgumentParser, argparse.Namespace, pluggy.PluginManager): pre_options, _unused = plugins_only_parser.parse_known_args(args=args) plugin_manager = get_plugin_manager(pre_options.plugins) diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index 87302438..60eed5de 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -216,25 +216,6 @@ def exec_concurrent(context): if max_workers > 1: log.info("Start processing %d pages concurrently", max_workers) - # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want - # to manage how many threads it uses to avoid creating total threads than cores. - # Performance testing shows we're better off - # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we - # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the - # input file is small, then we allow Tesseract to use threads, subject to the - # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers. - # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system. - tess_threads = min(3, context.options.jobs // max_workers) - if context.options.tesseract_env is None: - context.options.tesseract_env = os.environ.copy() - context.options.tesseract_env.setdefault('OMP_THREAD_LIMIT', str(tess_threads)) - try: - tess_threads = int(context.options.tesseract_env['OMP_THREAD_LIMIT']) - except ValueError: # OMP_THREAD_LIMIT initialized to non-numeric - context.log.error("Environment variable OMP_THREAD_LIMIT is not numeric") - if tess_threads > 1: - log.info("Using Tesseract OpenMP thread limit %d", tess_threads) - sidecars = [None] * len(context.pdfinfo) ocrgraft = OcrGrafter(context) diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py index 7edce482..68ede13a 100644 --- a/src/ocrmypdf/api.py +++ b/src/ocrmypdf/api.py @@ -15,6 +15,7 @@ # You should have received a copy of the GNU General Public License # along with OCRmyPDF. If not, see . +import inspect import logging import os import sys @@ -178,11 +179,6 @@ def create_options( options = parser.parse_args(cmdline) for keyword, val in deferred: setattr(options, keyword, val) - - # If we are running a Tesseract spoof, ensure it knows what the input file is - if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env: - options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file) - return options @@ -233,7 +229,6 @@ def ocr( # pylint: disable=unused-argument plugins: Iterable[str] = None, keep_temporary_files: bool = None, progress_bar: bool = None, - tesseract_env: Dict[str, str] = None, **kwargs, ): """Run OCRmyPDF on one PDF or image. @@ -245,7 +240,6 @@ def ocr( # pylint: disable=unused-argument use_threads (bool): Use worker threads instead of processes. This reduces performance but may make debugging easier since it is easier to set breakpoints. - tesseract_env (dict): Override environment variables for Tesseract Raises: ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging with the OCR layer. @@ -274,10 +268,13 @@ def ocr( # pylint: disable=unused-argument parser = get_parser() _plugin_manager = get_plugin_manager(plugins) - _plugin_manager.hook.add_options(parser=parser) + _plugin_manager.hook.add_options(parser=parser) # pylint: disable=no-member - options = create_options( - **{k: v for k, v in locals().items() if not k.startswith('_')} - ) + create_options_kwargs = { + k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs' + } + create_options_kwargs.update(kwargs) + + options = create_options(**create_options_kwargs) check_options(options, _plugin_manager) return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True) diff --git a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py index e2fcfb6d..4991baff 100644 --- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py +++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py @@ -15,7 +15,9 @@ # You should have received a copy of the GNU General Public License # along with OCRmyPDF. If not, see . +import argparse import logging +import os from ocrmypdf import hookimpl from ocrmypdf.cli import numeric @@ -79,6 +81,7 @@ def add_options(parser): metavar='FILE', help="Specify the location of the Tesseract user patterns file.", ) + tess.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS) @hookimpl @@ -115,6 +118,28 @@ def check_options(options): ) +@hookimpl +def validate(pdfinfo, options): + # If we are running a Tesseract spoof, ensure it knows what the input file is + if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env: + options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(options.input_file) + + # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want + # to manage how many threads it uses to avoid creating total threads than cores. + # Performance testing shows we're better off + # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we + # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the + # input file is small, then we allow Tesseract to use threads, subject to the + # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers. + # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system. + if not options.tesseract_env.get('OMP_THREAD_LIMIT', '').isnumeric(): + tess_threads = min(3, options.jobs // len(pdfinfo), len(pdfinfo)) + options.tesseract_env['OMP_THREAD_LIMIT'] = str(tess_threads) + + if tess_threads > 1: + log.info("Using Tesseract OpenMP thread limit %d", tess_threads) + + class TesseractOcrEngine(OcrEngine): @staticmethod def version(): diff --git a/src/ocrmypdf/cli.py b/src/ocrmypdf/cli.py index ccf66d99..a34a2108 100644 --- a/src/ocrmypdf/cli.py +++ b/src/ocrmypdf/cli.py @@ -474,7 +474,6 @@ Online documentation is located at: action='store_true', help="Keep temporary files (helpful for debugging)", ) - debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS) return parser diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py index 432b1946..dd659458 100644 --- a/src/ocrmypdf/pluginspec.py +++ b/src/ocrmypdf/pluginspec.py @@ -40,7 +40,7 @@ def add_options(parser: ArgumentParser) -> None: @hookspec def check_options(options: Namespace) -> None: - """Called to notify a plugin that a file will be processed. + """Called to ask the plugin to check all of its options. The plugin may modify the *options*. All objects that are in options must be picklable so they can be marshalled to child worker processes. diff --git a/tests/test_unpaper.py b/tests/test_unpaper.py index e87800c9..bd04da2c 100644 --- a/tests/test_unpaper.py +++ b/tests/test_unpaper.py @@ -20,7 +20,7 @@ from unittest.mock import patch import pytest -from ocrmypdf._plugin_manager import get_plugin_manager +from ocrmypdf._plugin_manager import get_parser_options_plugins from ocrmypdf._validation import check_options from ocrmypdf.cli import get_parser from ocrmypdf.exceptions import ExitCode, MissingDependencyError @@ -43,13 +43,13 @@ def spoof_unpaper_oldversion(tmp_path_factory): def test_no_unpaper(resources, no_outpdf): input_ = fspath(resources / "c02-22.pdf") output = fspath(no_outpdf) - options = get_parser().parse_args(args=["--clean", input_, output]) - plugin_manager = get_plugin_manager(options.plugins) + + _parser, options, pm = get_parser_options_plugins(["--clean", input_, output]) with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version: mock_unpaper_version.side_effect = FileNotFoundError("unpaper") with pytest.raises(MissingDependencyError): - check_options(options, plugin_manager) + check_options(options, pm) def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):