Remove duplicate thread local storage of page numbers

This commit is contained in:
James R. Barlow
2023-10-14 20:39:29 -07:00
parent e400112f32
commit 62c4f65fc3
4 changed files with 24 additions and 33 deletions

View File

@@ -10,13 +10,14 @@ import logging.handlers
import os
import shutil
import sys
import threading
from collections.abc import Sequence
from concurrent.futures.process import BrokenProcessPool
from concurrent.futures.thread import BrokenThreadPool
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, NamedTuple, cast
from typing import Callable, NamedTuple, cast
import PIL
@@ -39,7 +40,7 @@ from ocrmypdf._pipeline import (
rasterize_preview,
should_visible_page_image_use_jpg,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager, get_plugin_manager
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf._validation import (
report_output_file_size,
)
@@ -53,11 +54,12 @@ from ocrmypdf.helpers import (
from ocrmypdf.pdfa import file_claims_pdfa
log = logging.getLogger(__name__)
tls = threading.local()
tls.pageno = None
def set_logging_tls(tls):
def _set_logging_tls(tls):
"""Inject current page number (when available) into log records."""
old_factory = logging.getLogRecordFactory()
def wrapper(*args, **kwargs):
record = old_factory(*args, **kwargs)
if hasattr(tls, 'pageno'):
@@ -67,6 +69,14 @@ def set_logging_tls(tls):
logging.setLogRecordFactory(wrapper)
_set_logging_tls(tls)
def set_thread_pageno(pageno: int | None) -> None:
    """Set page number (1-based) that the current thread is processing.

    The value is stored on the module-level thread-local ``tls`` object,
    which the installed log record factory checks when creating records.

    Args:
        pageno: 1-based page number the calling thread is working on, or
            ``None`` to clear it once the thread has finished with a page.
    """
    # tls is a threading.local(), so this assignment is visible only to
    # the calling thread.
    tls.pageno = pageno
class PageResult(NamedTuple):
"""Result when a page is finished processing."""

View File

@@ -10,8 +10,6 @@ from __future__ import annotations
import argparse
import logging
import logging.handlers
import shutil
import threading
from collections.abc import Sequence
from functools import partial
@@ -24,14 +22,13 @@ from ocrmypdf._pipeline import (
copy_final,
get_pdfinfo,
render_hocr_page,
validate_pdfinfo_options,
)
from ocrmypdf._pipelines._common import (
HOCRResult,
manage_work_folder,
postprocess,
report_output_pdf,
set_logging_tls,
set_thread_pageno,
setup_pipeline,
worker_init,
)
@@ -41,12 +38,6 @@ from ocrmypdf.exceptions import ExitCode
log = logging.getLogger(__name__)
tls = threading.local()
tls.pageno = None
set_logging_tls(tls)
def exec_hocrtransform_sync(page_context: PageContext) -> HOCRResult:
hocr_result = HOCRResult.from_json(page_context.get_path('hocr.json').read_text())
hocr_result.textpdf = render_hocr_page(
@@ -68,7 +59,7 @@ def exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[st
def graft_page(result: HOCRResult, pbar):
"""After OCR is complete for a page, update the PDF."""
try:
tls.pageno = result.pageno + 1
set_thread_pageno(result.pageno + 1)
pbar.update()
ocrgraft.graft_page(
pageno=result.pageno,
@@ -78,7 +69,7 @@ def exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[st
)
pbar.update()
finally:
tls.pageno = None
set_thread_pageno(None)
executor(
use_threads=options.use_threads,

View File

@@ -40,7 +40,7 @@ from ocrmypdf._pipelines._common import (
postprocess,
process_page,
report_output_pdf,
set_logging_tls,
set_thread_pageno,
setup_pipeline,
worker_init,
)
@@ -54,12 +54,6 @@ from ocrmypdf.exceptions import ExitCode
log = logging.getLogger(__name__)
tls = threading.local()
tls.pageno = None
set_logging_tls(tls)
def _image_to_ocr_text(
page_context: PageContext, ocr_image_out: Path
) -> tuple[Path, Path]:
@@ -77,7 +71,7 @@ def _image_to_ocr_text(
def exec_page_sync(page_context: PageContext) -> PageResult:
"""Execute a pipeline for a single page synchronously."""
tls.pageno = page_context.pageno + 1
set_thread_pageno(page_context.pageno + 1)
if not is_ocr_required(page_context):
return PageResult(pageno=page_context.pageno)
@@ -110,7 +104,7 @@ def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]:
def update_page(result: PageResult, pbar):
"""After OCR is complete for a page, update the PDF."""
try:
tls.pageno = result.pageno + 1
set_thread_pageno(result.pageno + 1)
sidecars[result.pageno] = result.text
pbar.update()
ocrgraft.graft_page(
@@ -121,7 +115,7 @@ def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]:
)
pbar.update()
finally:
tls.pageno = None
set_thread_pageno(None)
executor(
use_threads=options.use_threads,

View File

@@ -28,7 +28,7 @@ from ocrmypdf._pipelines._common import (
HOCRResult,
manage_work_folder,
process_page,
set_logging_tls,
set_thread_pageno,
setup_pipeline,
worker_init,
)
@@ -39,15 +39,11 @@ from ocrmypdf._validation import (
log = logging.getLogger(__name__)
tls = threading.local()
tls.pageno = None
set_logging_tls(tls)
def exec_page_hocr_sync(page_context: PageContext) -> HOCRResult:
"""Execute a pipeline for a single page hOCR."""
tls.pageno = page_context.pageno + 1
set_thread_pageno(page_context.pageno + 1)
if not is_ocr_required(page_context):
return HOCRResult(pageno=page_context.pageno)