From 78e8bf9cbf4a69f71ff5b2d627e4ea291fd3ee52 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Fri, 20 Sep 2019 17:11:29 -0700 Subject: [PATCH] Use at most 3 Tesseract threads Based on a user suggestion and tesseract-ocr/tesseract#2611, I reviewed thread limits and found that thread limit of 3 is still beneficial, but not 4. > time env OMP_THREAD_LIMIT=2 tesseract omp4.png stdout >/dev/null Warning: Invalid resolution 0 dpi. Using 70 instead. Estimating resolution as 143 116.67user 1.67system 1:26.26elapsed 137%CPU (0avgtext+0avgdata 356752maxresident)k 2213inputs+0outputs (18major+131059minor)pagefaults 0swaps > time env OMP_THREAD_LIMIT=3 tesseract omp4.png stdout >/dev/null Warning: Invalid resolution 0 dpi. Using 70 instead. Estimating resolution as 143 136.89user 1.63system 1:19.56elapsed 174%CPU (0avgtext+0avgdata 356784maxresident)k 821inputs+0outputs (0major+131080minor)pagefaults 0swaps > time env OMP_THREAD_LIMIT=4 tesseract omp4.png stdout >/dev/null Warning: Invalid resolution 0 dpi. Using 70 instead. Estimating resolution as 143 161.31user 1.51system 1:18.80elapsed 206%CPU (0avgtext+0avgdata 356632maxresident)k 8477inputs+0outputs (12major+131074minor)pagefaults 0swaps > time env OMP_THREAD_LIMIT=8 tesseract omp4.png stdout >/dev/null Warning: Invalid resolution 0 dpi. Using 70 instead. Estimating resolution as 143 160.30user 1.62system 1:18.01elapsed 207%CPU (0avgtext+0avgdata 356640maxresident)k 821inputs+0outputs (0major+131078minor)pagefaults 0swaps --- src/ocrmypdf/_sync.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index ccedbdc6..b51238a1 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -225,15 +225,15 @@ def exec_concurrent(context): if max_workers > 1: context.log.info("Start processing %d pages concurrent", max_workers) - # Tesseract 4.0 is multithreaded, and we also run multiple workers. 
We want to - # avoid the situation where we end up trying to run NxN jobs on N CPU cores, - # as that gives poor performance. Performance testing shows we're better off + # Tesseract 4.x can be multithreaded, and we also run multiple workers. We want + # to manage how many threads it uses to avoid creating more total threads than cores. + # Performance testing shows we're better off # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which we # get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count of the # input file is small, then we allow Tesseract to use threads, subject to the - # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers and limiting - # Tesseract to 4 threads. - tess_threads = min(4, context.options.jobs // max_workers) + # constraint: (ocrmypdf workers) * (tesseract threads) <= max_workers. + # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread system. + tess_threads = min(3, context.options.jobs // max_workers) if context.options.tesseract_env is None: context.options.tesseract_env = os.environ.copy() context.options.tesseract_env.setdefault('OMP_THREAD_LIMIT', str(tess_threads))