mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-18 19:47:48 -04:00
Remove duplicate thread local storage of page numbers
This commit is contained in:
@@ -10,13 +10,14 @@ import logging.handlers
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
import threading
|
||||
from collections.abc import Sequence
|
||||
from concurrent.futures.process import BrokenProcessPool
|
||||
from concurrent.futures.thread import BrokenThreadPool
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, NamedTuple, cast
|
||||
from typing import Callable, NamedTuple, cast
|
||||
|
||||
import PIL
|
||||
|
||||
@@ -39,7 +40,7 @@ from ocrmypdf._pipeline import (
|
||||
rasterize_preview,
|
||||
should_visible_page_image_use_jpg,
|
||||
)
|
||||
from ocrmypdf._plugin_manager import OcrmypdfPluginManager, get_plugin_manager
|
||||
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
|
||||
from ocrmypdf._validation import (
|
||||
report_output_file_size,
|
||||
)
|
||||
@@ -53,11 +54,12 @@ from ocrmypdf.helpers import (
|
||||
from ocrmypdf.pdfa import file_claims_pdfa
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
tls = threading.local()
|
||||
tls.pageno = None
|
||||
|
||||
|
||||
def set_logging_tls(tls):
|
||||
def _set_logging_tls(tls):
|
||||
"""Inject current page number (when available) into log records."""
|
||||
old_factory = logging.getLogRecordFactory()
|
||||
|
||||
def wrapper(*args, **kwargs):
|
||||
record = old_factory(*args, **kwargs)
|
||||
if hasattr(tls, 'pageno'):
|
||||
@@ -67,6 +69,14 @@ def set_logging_tls(tls):
|
||||
logging.setLogRecordFactory(wrapper)
|
||||
|
||||
|
||||
_set_logging_tls(tls)
|
||||
|
||||
|
||||
def set_thread_pageno(pageno: int | None):
    """Set page number (1-based) that the current thread is processing.

    The value is stored on the module's thread-local ``tls`` object, the
    same object handed to ``_set_logging_tls`` at import time.

    Args:
        pageno: The 1-based page number, or ``None`` to clear it. Callers
            pass ``None`` once they have finished working on a page.
    """
    # NOTE(review): ``int | None`` is evaluated at definition time unless the
    # module has ``from __future__ import annotations`` (or runs on Python
    # 3.10+) -- confirm against the top of this file.
    tls.pageno = pageno
|
||||
|
||||
|
||||
class PageResult(NamedTuple):
|
||||
"""Result when a page is finished processing."""
|
||||
|
||||
|
||||
@@ -10,8 +10,6 @@ from __future__ import annotations
|
||||
import argparse
|
||||
import logging
|
||||
import logging.handlers
|
||||
import shutil
|
||||
import threading
|
||||
from collections.abc import Sequence
|
||||
from functools import partial
|
||||
|
||||
@@ -24,14 +22,13 @@ from ocrmypdf._pipeline import (
|
||||
copy_final,
|
||||
get_pdfinfo,
|
||||
render_hocr_page,
|
||||
validate_pdfinfo_options,
|
||||
)
|
||||
from ocrmypdf._pipelines._common import (
|
||||
HOCRResult,
|
||||
manage_work_folder,
|
||||
postprocess,
|
||||
report_output_pdf,
|
||||
set_logging_tls,
|
||||
set_thread_pageno,
|
||||
setup_pipeline,
|
||||
worker_init,
|
||||
)
|
||||
@@ -41,12 +38,6 @@ from ocrmypdf.exceptions import ExitCode
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
tls = threading.local()
|
||||
tls.pageno = None
|
||||
|
||||
set_logging_tls(tls)
|
||||
|
||||
|
||||
def exec_hocrtransform_sync(page_context: PageContext) -> HOCRResult:
|
||||
hocr_result = HOCRResult.from_json(page_context.get_path('hocr.json').read_text())
|
||||
hocr_result.textpdf = render_hocr_page(
|
||||
@@ -68,7 +59,7 @@ def exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[st
|
||||
def graft_page(result: HOCRResult, pbar):
|
||||
"""After OCR is complete for a page, update the PDF."""
|
||||
try:
|
||||
tls.pageno = result.pageno + 1
|
||||
set_thread_pageno(result.pageno + 1)
|
||||
pbar.update()
|
||||
ocrgraft.graft_page(
|
||||
pageno=result.pageno,
|
||||
@@ -78,7 +69,7 @@ def exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[st
|
||||
)
|
||||
pbar.update()
|
||||
finally:
|
||||
tls.pageno = None
|
||||
set_thread_pageno(None)
|
||||
|
||||
executor(
|
||||
use_threads=options.use_threads,
|
||||
|
||||
@@ -40,7 +40,7 @@ from ocrmypdf._pipelines._common import (
|
||||
postprocess,
|
||||
process_page,
|
||||
report_output_pdf,
|
||||
set_logging_tls,
|
||||
set_thread_pageno,
|
||||
setup_pipeline,
|
||||
worker_init,
|
||||
)
|
||||
@@ -54,12 +54,6 @@ from ocrmypdf.exceptions import ExitCode
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
tls = threading.local()
|
||||
tls.pageno = None
|
||||
|
||||
set_logging_tls(tls)
|
||||
|
||||
|
||||
def _image_to_ocr_text(
|
||||
page_context: PageContext, ocr_image_out: Path
|
||||
) -> tuple[Path, Path]:
|
||||
@@ -77,7 +71,7 @@ def _image_to_ocr_text(
|
||||
|
||||
def exec_page_sync(page_context: PageContext) -> PageResult:
|
||||
"""Execute a pipeline for a single page synchronously."""
|
||||
tls.pageno = page_context.pageno + 1
|
||||
set_thread_pageno(page_context.pageno + 1)
|
||||
|
||||
if not is_ocr_required(page_context):
|
||||
return PageResult(pageno=page_context.pageno)
|
||||
@@ -110,7 +104,7 @@ def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]:
|
||||
def update_page(result: PageResult, pbar):
|
||||
"""After OCR is complete for a page, update the PDF."""
|
||||
try:
|
||||
tls.pageno = result.pageno + 1
|
||||
set_thread_pageno(result.pageno + 1)
|
||||
sidecars[result.pageno] = result.text
|
||||
pbar.update()
|
||||
ocrgraft.graft_page(
|
||||
@@ -121,7 +115,7 @@ def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]:
|
||||
)
|
||||
pbar.update()
|
||||
finally:
|
||||
tls.pageno = None
|
||||
set_thread_pageno(None)
|
||||
|
||||
executor(
|
||||
use_threads=options.use_threads,
|
||||
|
||||
@@ -28,7 +28,7 @@ from ocrmypdf._pipelines._common import (
|
||||
HOCRResult,
|
||||
manage_work_folder,
|
||||
process_page,
|
||||
set_logging_tls,
|
||||
set_thread_pageno,
|
||||
setup_pipeline,
|
||||
worker_init,
|
||||
)
|
||||
@@ -39,15 +39,11 @@ from ocrmypdf._validation import (
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
tls = threading.local()
|
||||
tls.pageno = None
|
||||
|
||||
set_logging_tls(tls)
|
||||
|
||||
|
||||
def exec_page_hocr_sync(page_context: PageContext) -> HOCRResult:
|
||||
"""Execute a pipeline for a single page hOCR."""
|
||||
tls.pageno = page_context.pageno + 1
|
||||
set_thread_pageno(page_context.pageno + 1)
|
||||
|
||||
if not is_ocr_required(page_context):
|
||||
return HOCRResult(pageno=page_context.pageno)
|
||||
|
||||
Reference in New Issue
Block a user