From aa060db5bc7f2c4017902360cc2b039bfca4d8bc Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <james@purplerock.ca>
Date: Tue, 26 May 2020 02:13:17 -0700
Subject: [PATCH] Refactor tesseract_env variable into the plugin

Removed all cases except one in api.py, which isn't worth solving because
it should be removed anyway.

This also fixes a logic error in the OMP_THREAD_LIMIT decision, api.py
did not use pass kwargs correctly so they never worked before.
---
 src/ocrmypdf/_plugin_manager.py               |  5 +++-
 src/ocrmypdf/_sync.py                         | 19 --------------
 src/ocrmypdf/api.py                           | 19 ++++++--------
 src/ocrmypdf/builtin_plugins/tesseract_ocr.py | 25 +++++++++++++++++++
 src/ocrmypdf/cli.py                           |  1 -
 src/ocrmypdf/pluginspec.py                    |  2 +-
 tests/test_unpaper.py                         |  8 +++---
 7 files changed, 42 insertions(+), 37 deletions(-)

diff --git a/src/ocrmypdf/_plugin_manager.py b/src/ocrmypdf/_plugin_manager.py
index 6216a44b..1fbc1b76 100644
--- a/src/ocrmypdf/_plugin_manager.py
+++ b/src/ocrmypdf/_plugin_manager.py
@@ -15,6 +15,7 @@
 # You should have received a copy of the GNU General Public License
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
 
+import argparse
 import importlib
 import importlib.util
 import sys
@@ -50,7 +51,9 @@ def get_plugin_manager(plugins: List[str], builtins=True):
     return pm
 
 
-def get_parser_options_plugins(args):
+def get_parser_options_plugins(
+    args,
+) -> (argparse.ArgumentParser, argparse.Namespace, pluggy.PluginManager):
     pre_options, _unused = plugins_only_parser.parse_known_args(args=args)
     plugin_manager = get_plugin_manager(pre_options.plugins)
 
diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py
index 87302438..60eed5de 100644
--- a/src/ocrmypdf/_sync.py
+++ b/src/ocrmypdf/_sync.py
@@ -216,25 +216,6 @@ def exec_concurrent(context):
     if max_workers > 1:
         log.info("Start processing %d pages concurrently", max_workers)
 
-    # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
-    # to manage how many threads it uses to avoid creating total threads than cores.
-    # Performance testing shows we're better off
-    # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we
-    # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the
-    # input file is small, then we allow Tesseract to use threads, subject to the
-    # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.
-    # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.
-    tess_threads = min(3, context.options.jobs // max_workers)
-    if context.options.tesseract_env is None:
-        context.options.tesseract_env = os.environ.copy()
-    context.options.tesseract_env.setdefault('OMP_THREAD_LIMIT', str(tess_threads))
-    try:
-        tess_threads = int(context.options.tesseract_env['OMP_THREAD_LIMIT'])
-    except ValueError:  # OMP_THREAD_LIMIT initialized to non-numeric
-        context.log.error("Environment variable OMP_THREAD_LIMIT is not numeric")
-    if tess_threads > 1:
-        log.info("Using Tesseract OpenMP thread limit %d", tess_threads)
-
     sidecars = [None] * len(context.pdfinfo)
     ocrgraft = OcrGrafter(context)
 
diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py
index 7edce482..68ede13a 100644
--- a/src/ocrmypdf/api.py
+++ b/src/ocrmypdf/api.py
@@ -15,6 +15,7 @@
 # You should have received a copy of the GNU General Public License
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
 
+import inspect
 import logging
 import os
 import sys
@@ -178,11 +179,6 @@ def create_options(
     options = parser.parse_args(cmdline)
     for keyword, val in deferred:
         setattr(options, keyword, val)
-
-    # If we are running a Tesseract spoof, ensure it knows what the input file is
-    if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env:
-        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
-
     return options
 
 
@@ -233,7 +229,6 @@ def ocr(  # pylint: disable=unused-argument
     plugins: Iterable[str] = None,
     keep_temporary_files: bool = None,
     progress_bar: bool = None,
-    tesseract_env: Dict[str, str] = None,
     **kwargs,
 ):
     """Run OCRmyPDF on one PDF or image.
@@ -245,7 +240,6 @@ def ocr(  # pylint: disable=unused-argument
         use_threads (bool): Use worker threads instead of processes. This reduces
             performance but may make debugging easier since it is easier to set
             breakpoints.
-        tesseract_env (dict): Override environment variables for Tesseract
     Raises:
         ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
             with the OCR layer.
@@ -274,10 +268,13 @@ def ocr(  # pylint: disable=unused-argument
 
     parser = get_parser()
     _plugin_manager = get_plugin_manager(plugins)
-    _plugin_manager.hook.add_options(parser=parser)
+    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member
 
-    options = create_options(
-        **{k: v for k, v in locals().items() if not k.startswith('_')}
-    )
+    create_options_kwargs = {
+        k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
+    }
+    create_options_kwargs.update(kwargs)
+
+    options = create_options(**create_options_kwargs)
     check_options(options, _plugin_manager)
     return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
diff --git a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
index e2fcfb6d..4991baff 100644
--- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
+++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
@@ -15,7 +15,9 @@
 # You should have received a copy of the GNU General Public License
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
 
+import argparse
 import logging
+import os
 
 from ocrmypdf import hookimpl
 from ocrmypdf.cli import numeric
@@ -79,6 +81,7 @@ def add_options(parser):
         metavar='FILE',
         help="Specify the location of the Tesseract user patterns file.",
     )
+    tess.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
 
 
 @hookimpl
@@ -115,6 +118,28 @@ def check_options(options):
         )
 
 
+@hookimpl
+def validate(pdfinfo, options):
+    # If we are running a Tesseract spoof, ensure it knows what the input file is
+    if os.environ.get('PYTEST_CURRENT_TEST') and options.tesseract_env:
+        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(options.input_file)
+
+    # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want
+    # to manage how many threads it uses to avoid creating total threads than cores.
+    # Performance testing shows we're better off
+    # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we
+    # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the
+    # input file is small, then we allow Tesseract to use threads, subject to the
+    # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers.
+    # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system.
+    if not options.tesseract_env.get('OMP_THREAD_LIMIT', '').isnumeric():
+        tess_threads = min(3, options.jobs // len(pdfinfo), len(pdfinfo))
+        options.tesseract_env['OMP_THREAD_LIMIT'] = str(tess_threads)
+
+    if tess_threads > 1:
+        log.info("Using Tesseract OpenMP thread limit %d", tess_threads)
+
+
 class TesseractOcrEngine(OcrEngine):
     @staticmethod
     def version():
diff --git a/src/ocrmypdf/cli.py b/src/ocrmypdf/cli.py
index ccf66d99..a34a2108 100644
--- a/src/ocrmypdf/cli.py
+++ b/src/ocrmypdf/cli.py
@@ -474,7 +474,6 @@ Online documentation is located at:
         action='store_true',
         help="Keep temporary files (helpful for debugging)",
     )
-    debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
     return parser
 
 
diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py
index 432b1946..dd659458 100644
--- a/src/ocrmypdf/pluginspec.py
+++ b/src/ocrmypdf/pluginspec.py
@@ -40,7 +40,7 @@ def add_options(parser: ArgumentParser) -> None:
 
 @hookspec
 def check_options(options: Namespace) -> None:
-    """Called to notify a plugin that a file will be processed.
+    """Called to ask the plugin to check all of its options.
 
 	The plugin may modify the *options*. All objects that are in options must
 	be picklable so they can be marshalled to child worker processes.
diff --git a/tests/test_unpaper.py b/tests/test_unpaper.py
index e87800c9..bd04da2c 100644
--- a/tests/test_unpaper.py
+++ b/tests/test_unpaper.py
@@ -20,7 +20,7 @@ from unittest.mock import patch
 
 import pytest
 
-from ocrmypdf._plugin_manager import get_plugin_manager
+from ocrmypdf._plugin_manager import get_parser_options_plugins
 from ocrmypdf._validation import check_options
 from ocrmypdf.cli import get_parser
 from ocrmypdf.exceptions import ExitCode, MissingDependencyError
@@ -43,13 +43,13 @@ def spoof_unpaper_oldversion(tmp_path_factory):
 def test_no_unpaper(resources, no_outpdf):
     input_ = fspath(resources / "c02-22.pdf")
     output = fspath(no_outpdf)
-    options = get_parser().parse_args(args=["--clean", input_, output])
-    plugin_manager = get_plugin_manager(options.plugins)
+
+    _parser, options, pm = get_parser_options_plugins(["--clean", input_, output])
     with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
         mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
 
         with pytest.raises(MissingDependencyError):
-            check_options(options, plugin_manager)
+            check_options(options, pm)
 
 
 def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):