From 62c4f65fc36c434ceeb5804dfa3835e0d6dfbb15 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 14 Oct 2023 20:39:29 -0700 Subject: [PATCH] Remove duplicate thread local storage of page numbers --- src/ocrmypdf/_pipelines/_common.py | 20 +++++++++++++++----- src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py | 15 +++------------ src/ocrmypdf/_pipelines/ocr.py | 14 ++++---------- src/ocrmypdf/_pipelines/pdf_to_hocr.py | 8 ++------ 4 files changed, 24 insertions(+), 33 deletions(-) diff --git a/src/ocrmypdf/_pipelines/_common.py b/src/ocrmypdf/_pipelines/_common.py index 4b207b79..b27c6fa4 100644 --- a/src/ocrmypdf/_pipelines/_common.py +++ b/src/ocrmypdf/_pipelines/_common.py @@ -10,13 +10,14 @@ import logging.handlers import os import shutil import sys +import threading from collections.abc import Sequence from concurrent.futures.process import BrokenProcessPool from concurrent.futures.thread import BrokenThreadPool from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, NamedTuple, cast +from typing import Callable, NamedTuple, cast import PIL @@ -39,7 +40,7 @@ from ocrmypdf._pipeline import ( rasterize_preview, should_visible_page_image_use_jpg, ) -from ocrmypdf._plugin_manager import OcrmypdfPluginManager, get_plugin_manager +from ocrmypdf._plugin_manager import OcrmypdfPluginManager from ocrmypdf._validation import ( report_output_file_size, ) @@ -53,11 +54,12 @@ from ocrmypdf.helpers import ( from ocrmypdf.pdfa import file_claims_pdfa log = logging.getLogger(__name__) +tls = threading.local() +tls.pageno = None - -def set_logging_tls(tls): +def _set_logging_tls(tls): + """Inject current page number (when available) into log records.""" old_factory = logging.getLogRecordFactory() - def wrapper(*args, **kwargs): record = old_factory(*args, **kwargs) if hasattr(tls, 'pageno'): @@ -67,6 +69,14 @@ def set_logging_tls(tls): logging.setLogRecordFactory(wrapper) +_set_logging_tls(tls) + + 
+def set_thread_pageno(pageno: int | None) -> None: + """Set the 1-based page number this thread is processing (None clears it).""" + tls.pageno = pageno + + class PageResult(NamedTuple): """Result when a page is finished processing.""" diff --git a/src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py b/src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py index 4fb545ba..f3b5d47a 100644 --- a/src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py +++ b/src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py @@ -10,8 +10,6 @@ from __future__ import annotations import argparse import logging import logging.handlers -import shutil -import threading from collections.abc import Sequence from functools import partial @@ -24,14 +22,13 @@ from ocrmypdf._pipeline import ( copy_final, get_pdfinfo, render_hocr_page, - validate_pdfinfo_options, ) from ocrmypdf._pipelines._common import ( HOCRResult, manage_work_folder, postprocess, report_output_pdf, - set_logging_tls, + set_thread_pageno, setup_pipeline, worker_init, ) @@ -41,12 +38,6 @@ from ocrmypdf.exceptions import ExitCode log = logging.getLogger(__name__) -tls = threading.local() -tls.pageno = None - -set_logging_tls(tls) - - def exec_hocrtransform_sync(page_context: PageContext) -> HOCRResult: hocr_result = HOCRResult.from_json(page_context.get_path('hocr.json').read_text()) hocr_result.textpdf = render_hocr_page( @@ -68,7 +59,7 @@ def exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[st def graft_page(result: HOCRResult, pbar): """After OCR is complete for a page, update the PDF.""" try: - tls.pageno = result.pageno + 1 + set_thread_pageno(result.pageno + 1) pbar.update() ocrgraft.graft_page( pageno=result.pageno, @@ -78,7 +69,7 @@ def exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[st ) pbar.update() finally: - tls.pageno = None + set_thread_pageno(None) executor( use_threads=options.use_threads, diff --git a/src/ocrmypdf/_pipelines/ocr.py b/src/ocrmypdf/_pipelines/ocr.py index bdf1b7e4..c5587acf 100644 --- 
a/src/ocrmypdf/_pipelines/ocr.py +++ b/src/ocrmypdf/_pipelines/ocr.py @@ -40,7 +40,7 @@ from ocrmypdf._pipelines._common import ( postprocess, process_page, report_output_pdf, - set_logging_tls, + set_thread_pageno, setup_pipeline, worker_init, ) @@ -54,12 +54,6 @@ from ocrmypdf.exceptions import ExitCode log = logging.getLogger(__name__) -tls = threading.local() -tls.pageno = None - -set_logging_tls(tls) - - def _image_to_ocr_text( page_context: PageContext, ocr_image_out: Path ) -> tuple[Path, Path]: @@ -77,7 +71,7 @@ def _image_to_ocr_text( def exec_page_sync(page_context: PageContext) -> PageResult: """Execute a pipeline for a single page synchronously.""" - tls.pageno = page_context.pageno + 1 + set_thread_pageno(page_context.pageno + 1) if not is_ocr_required(page_context): return PageResult(pageno=page_context.pageno) @@ -110,7 +104,7 @@ def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]: def update_page(result: PageResult, pbar): """After OCR is complete for a page, update the PDF.""" try: - tls.pageno = result.pageno + 1 + set_thread_pageno(result.pageno + 1) sidecars[result.pageno] = result.text pbar.update() ocrgraft.graft_page( @@ -121,7 +115,7 @@ def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]: ) pbar.update() finally: - tls.pageno = None + set_thread_pageno(None) executor( use_threads=options.use_threads, diff --git a/src/ocrmypdf/_pipelines/pdf_to_hocr.py b/src/ocrmypdf/_pipelines/pdf_to_hocr.py index 45f9af3a..71da68d4 100644 --- a/src/ocrmypdf/_pipelines/pdf_to_hocr.py +++ b/src/ocrmypdf/_pipelines/pdf_to_hocr.py @@ -28,7 +28,7 @@ from ocrmypdf._pipelines._common import ( HOCRResult, manage_work_folder, process_page, - set_logging_tls, + set_thread_pageno, setup_pipeline, worker_init, ) @@ -39,15 +39,11 @@ from ocrmypdf._validation import ( log = logging.getLogger(__name__) -tls = threading.local() -tls.pageno = None - -set_logging_tls(tls) def exec_page_hocr_sync(page_context: PageContext) 
-> HOCRResult: """Execute a pipeline for a single page hOCR.""" - tls.pageno = page_context.pageno + 1 + set_thread_pageno(page_context.pageno + 1) if not is_ocr_required(page_context): return HOCRResult(pageno=page_context.pageno)