From d2dbea6cf832646428f8c5faf671bc79c446fa0b Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 19 Oct 2023 00:42:10 -0700 Subject: [PATCH] Reorganize progress bars so they can be typed properly --- docs/plugins.rst | 3 + src/ocrmypdf/_concurrent.py | 26 +--- src/ocrmypdf/_logging.py | 62 --------- src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py | 3 +- src/ocrmypdf/_pipelines/ocr.py | 3 +- src/ocrmypdf/_progressbar.py | 135 ++++++++++++++++++++ src/ocrmypdf/builtin_plugins/concurrency.py | 3 +- src/ocrmypdf/optimize.py | 5 +- src/ocrmypdf/pdfinfo/info.py | 11 +- src/ocrmypdf/pluginspec.py | 57 +-------- 10 files changed, 162 insertions(+), 146 deletions(-) create mode 100644 src/ocrmypdf/_progressbar.py diff --git a/docs/plugins.rst b/docs/plugins.rst index da8df5ac..e3bb5ffc 100644 --- a/docs/plugins.rst +++ b/docs/plugins.rst @@ -177,6 +177,9 @@ Custom command line arguments Execution and progress reporting -------------------------------- +.. autoclass:: ocrmypdf.pluginspec.ProgressBar + :members: + .. 
autoclass:: ocrmypdf.pluginspec.Executor :members: diff --git a/src/ocrmypdf/_concurrent.py b/src/ocrmypdf/_concurrent.py index 9e10a16e..5f27b9cf 100644 --- a/src/ocrmypdf/_concurrent.py +++ b/src/ocrmypdf/_concurrent.py @@ -8,29 +8,17 @@ from __future__ import annotations import threading from abc import ABC, abstractmethod from collections.abc import Iterable -from typing import Callable +from typing import Callable, TypeVar + +from ocrmypdf._progressbar import NullProgressBar, ProgressBar + +T = TypeVar('T') def _task_noop(*_args, **_kwargs): return -class NullProgressBar: - """Progress bar API that takes no actions.""" - - def __init__(self, **kwargs): - pass - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - return False - - def update(self, _arg=None): - return - - class Executor(ABC): """Abstract concurrent executor.""" @@ -48,9 +36,9 @@ class Executor(ABC): max_workers: int, progress_kwargs: dict, worker_initializer: Callable | None = None, - task: Callable | None = None, + task: Callable[..., T] | None = None, task_arguments: Iterable | None = None, - task_finished: Callable | None = None, + task_finished: Callable[[T, ProgressBar], None] | None = None, ) -> None: """Set up parallel execution and progress reporting. 
diff --git a/src/ocrmypdf/_logging.py b/src/ocrmypdf/_logging.py index acab479d..45b599b7 100644 --- a/src/ocrmypdf/_logging.py +++ b/src/ocrmypdf/_logging.py @@ -9,15 +9,6 @@ import logging from rich.console import Console from rich.logging import RichHandler -from rich.progress import ( - BarColumn, - MofNCompleteColumn, - Progress, - TaskProgressColumn, - TextColumn, - TimeRemainingColumn, -) -from rich.table import Column class PageNumberFilter(logging.Filter): @@ -37,56 +28,3 @@ class RichLoggingHandler(RichHandler): super().__init__( console=console, show_level=False, show_time=False, markup=True, **kwargs ) - - -class RichProgressBar: - """Display progress bar using rich.""" - - def __init__( - self, - *, - console: Console, - desc: str, - total: float | None = None, - unit: str | None = None, - unit_scale: float | None = 1.0, - disable: bool = False, - **kwargs, - ): - self.progress = Progress( - TextColumn( - "[progress.description]{task.description}", - table_column=Column(min_width=20), - ), - BarColumn(), - TaskProgressColumn(), - MofNCompleteColumn(), - TimeRemainingColumn(), - console=console, - auto_refresh=True, - redirect_stderr=True, - redirect_stdout=False, - disable=disable, - **kwargs, - ) - self.unit_scale = unit_scale - self.progress_bar = self.progress.add_task( - desc, - total=total * self.unit_scale - if total is not None and self.unit_scale is not None - else None, - unit=unit, - ) - - def __enter__(self): - self.progress.start() - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.progress.refresh() - self.progress.stop() - return False - - def update(self, value=None): - advance = self.unit_scale if value is None else value - self.progress.update(self.progress_bar, advance=advance) diff --git a/src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py b/src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py index 200cc366..7781b5a2 100644 --- a/src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py +++ b/src/ocrmypdf/_pipelines/hocr_to_ocr_pdf.py @@ 
-33,6 +33,7 @@ from ocrmypdf._pipelines._common import ( worker_init, ) from ocrmypdf._plugin_manager import OcrmypdfPluginManager +from ocrmypdf._progressbar import ProgressBar from ocrmypdf.exceptions import ExitCode log = logging.getLogger(__name__) @@ -60,7 +61,7 @@ def exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[st ocrgraft = OcrGrafter(context) - def graft_page(result: HOCRResult, pbar): + def graft_page(result: HOCRResult, pbar: ProgressBar): """After OCR is complete for a page, update the PDF.""" try: set_thread_pageno(result.pageno + 1) diff --git a/src/ocrmypdf/_pipelines/ocr.py b/src/ocrmypdf/_pipelines/ocr.py index dacce069..1b6fbc97 100644 --- a/src/ocrmypdf/_pipelines/ocr.py +++ b/src/ocrmypdf/_pipelines/ocr.py @@ -44,6 +44,7 @@ from ocrmypdf._pipelines._common import ( worker_init, ) from ocrmypdf._plugin_manager import OcrmypdfPluginManager +from ocrmypdf._progressbar import ProgressBar from ocrmypdf._validation import ( check_requested_output_file, create_input_file, @@ -100,7 +101,7 @@ def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]: sidecars: list[Path | None] = [None] * len(context.pdfinfo) ocrgraft = OcrGrafter(context) - def update_page(result: PageResult, pbar): + def update_page(result: PageResult, pbar: ProgressBar): """After OCR is complete for a page, update the PDF.""" try: set_thread_pageno(result.pageno + 1) diff --git a/src/ocrmypdf/_progressbar.py b/src/ocrmypdf/_progressbar.py new file mode 100644 index 00000000..18f4c691 --- /dev/null +++ b/src/ocrmypdf/_progressbar.py @@ -0,0 +1,135 @@ +from typing import Protocol + +from rich.console import Console +from rich.progress import ( + BarColumn, + MofNCompleteColumn, + Progress, + TaskProgressColumn, + TextColumn, + TimeRemainingColumn, +) +from rich.table import Column + + +class ProgressBar(Protocol): + """The protocol that OCRmyPDF expects progress bar classes to be compatible with. 
+ + In practice this could be used for any kind of monitoring, not just a progress bar. + + Calling the class should return a new progress bar object, which is activated + with ``__enter__`` and terminated with ``__exit__``. An update method is called + whenever the progress bar is updated. Progress bar objects will not be reused; + a new one will be created for each group of tasks. + + The progress bar is held in the main process/thread and not updated by child + process/threads. When a child notifies the parent of completed work, the + parent updates the progress bar. + + Progress bars should never write to ``sys.stdout``, or they will corrupt the + output if OCRmyPDF writes a PDF to standard output. + + The type of events that OCRmyPDF reports to a progress bar may change in + minor releases. + """ + + def __init__( + self, + *, + total: int | float | None, + desc: str | None, + unit: str | None, + disable: bool = False, + **kwargs, + ): + """Initialize a progress bar. + + *total* indicates the total number of work units. If None, the total + number of work units is unknown. If *disable* is True, the progress bar + should be disabled. *unit* is a description of the work unit. + *desc* is a description of the overall task to be performed. + + Unrecognized keyword arguments must be ignored, as the list of keyword + arguments may grow with time. + """ + + def __enter__(self): + """Enter a progress bar context.""" + + def __exit__(self, *args): + """Exit a progress bar context.""" + + def update(self, n=1): + """Update the progress bar by an increment. + + For use within a progress bar context. 
+ """ + + +class NullProgressBar: + """Progress bar API that takes no actions.""" + + def __init__(self, **kwargs): + pass + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + return False + + def update(self, _arg=None): + return + + +class RichProgressBar: + """Display progress bar using rich.""" + + def __init__( + self, + *, + console: Console, + desc: str, + total: float | None = None, + unit: str | None = None, + unit_scale: float | None = 1.0, + disable: bool = False, + **kwargs, + ): + self.progress = Progress( + TextColumn( + "[progress.description]{task.description}", + table_column=Column(min_width=20), + ), + BarColumn(), + TaskProgressColumn(), + MofNCompleteColumn(), + TimeRemainingColumn(), + console=console, + auto_refresh=True, + redirect_stderr=True, + redirect_stdout=False, + disable=disable, + **kwargs, + ) + self.unit_scale = unit_scale + self.progress_bar = self.progress.add_task( + desc, + total=total * self.unit_scale + if total is not None and self.unit_scale is not None + else None, + unit=unit, + ) + + def __enter__(self): + self.progress.start() + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.progress.refresh() + self.progress.stop() + return False + + def update(self, value=None): + advance = self.unit_scale if value is None else value + self.progress.update(self.progress_bar, advance=advance) diff --git a/src/ocrmypdf/builtin_plugins/concurrency.py b/src/ocrmypdf/builtin_plugins/concurrency.py index e2b80a59..d9a072c9 100644 --- a/src/ocrmypdf/builtin_plugins/concurrency.py +++ b/src/ocrmypdf/builtin_plugins/concurrency.py @@ -20,7 +20,8 @@ from typing import Callable, Union from rich.console import Console as RichConsole from ocrmypdf import Executor, hookimpl -from ocrmypdf._logging import RichLoggingHandler, RichProgressBar +from ocrmypdf._logging import RichLoggingHandler +from ocrmypdf._progressbar import RichProgressBar from ocrmypdf.exceptions import 
InputFileError from ocrmypdf.helpers import remove_all_log_handlers diff --git a/src/ocrmypdf/optimize.py b/src/ocrmypdf/optimize.py index 9bdd54b1..388e1505 100644 --- a/src/ocrmypdf/optimize.py +++ b/src/ocrmypdf/optimize.py @@ -35,6 +35,7 @@ from PIL import Image from ocrmypdf._concurrent import Executor, SerialExecutor from ocrmypdf._exec import jbig2enc, pngquant from ocrmypdf._jobcontext import PdfContext +from ocrmypdf._progressbar import ProgressBar from ocrmypdf.exceptions import OutputFileAccessError from ocrmypdf.helpers import IMG2PDF_KWARGS, safe_symlink @@ -478,7 +479,7 @@ def transcode_jpegs( opt_jpg = in_jpg.with_suffix('.opt.jpg') yield xref, in_jpg, opt_jpg, options.jpeg_quality - def finish_jpeg(result: tuple[Xref, Path | None], pbar): + def finish_jpeg(result: tuple[Xref, Path | None], pbar: ProgressBar): xref, opt_jpg = result if opt_jpg: compdata = opt_jpg.read_bytes() # JPEG can inserted into PDF as is @@ -552,7 +553,7 @@ def deflate_jpegs(pdf: Pdf, root: Path, options, executor: Executor) -> None: for xref in jpegs: yield pdf, lock, xref, complevel - def finish(result, pbar): + def finish(result: tuple[Xref, bytes], pbar: ProgressBar): xref, compdata = result if len(compdata) > 0: with lock: diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py index 7815ad21..94f01c9e 100644 --- a/src/ocrmypdf/pdfinfo/info.py +++ b/src/ocrmypdf/pdfinfo/info.py @@ -38,6 +38,7 @@ from pikepdf import ( ) from ocrmypdf._concurrent import Executor, SerialExecutor +from ocrmypdf._progressbar import ProgressBar from ocrmypdf.exceptions import EncryptedPdfError, InputFileError from ocrmypdf.helpers import Resolution, available_cpu_count, pikepdf_enable_mmap from ocrmypdf.pdfinfo.layout import LTStateAwareChar, get_page_analysis, get_text_boxes @@ -694,13 +695,14 @@ def _pdf_pageinfo_sync_init(pdf: Pdf, infile: Path, pdfminer_loglevel): atexit.register(on_process_close) -def _pdf_pageinfo_sync(pageno, thread_pdf, infile, check_pages, 
detailed_analysis): +def _pdf_pageinfo_sync( + pageno, thread_pdf, infile, check_pages, detailed_analysis +) -> PageInfo: pdf = thread_pdf if thread_pdf is not None else worker_pdf with ExitStack() as stack: if not pdf: # When called with SerialExecutor pdf = stack.enter_context(Pdf.open(infile)) - page = PageInfo(pdf, pageno, infile, check_pages, detailed_analysis) - return page + return PageInfo(pdf, pageno, infile, check_pages, detailed_analysis) def _pdf_pageinfo_concurrent( @@ -715,8 +717,7 @@ def _pdf_pageinfo_concurrent( ) -> Sequence[PageInfo | None]: pages: Sequence[PageInfo | None] = [None] * len(pdf.pages) - def update_pageinfo(result, pbar): - page = result + def update_pageinfo(page: PageInfo, pbar: ProgressBar): if not page: raise InputFileError("Could read a page in the PDF") pages[page.pageno] = page diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py index dbe25f48..13ac2311 100644 --- a/src/ocrmypdf/pluginspec.py +++ b/src/ocrmypdf/pluginspec.py @@ -10,11 +10,12 @@ from argparse import ArgumentParser, Namespace from collections.abc import Sequence, Set from logging import Handler from pathlib import Path -from typing import TYPE_CHECKING, NamedTuple, Protocol +from typing import TYPE_CHECKING, NamedTuple import pluggy from ocrmypdf import Executor, PdfContext +from ocrmypdf._progressbar import ProgressBar from ocrmypdf.helpers import Resolution if TYPE_CHECKING: @@ -108,60 +109,6 @@ def check_options(options: Namespace) -> None: """ -class ProgressBar(Protocol): - """The protocol that OCRmyPDF expects progress bar classes to be compatible with. - - In practice this could be used for any time of monitoring, not just a progress bar. - - Calling the class should return a new progress bar object, which is activated - with ``__enter__`` and terminated with ``__exit__``. An update method is called - whenever the progress bar is updated. Progress bar objects will not be reused; - a new one will be created for each group of tasks. 
- - The progress bar is held in the main process/thread and not updated by child - process/threads. When a child notifies the parent of completed work, the - parent updates the progress bar. - - Progress bars should never write to ``sys.stdout``, or they will corrupt the - output if OCRmyPDF writes a PDF to standard output. - - The type of events that OCRmyPDF reports to a progress bar may change in - minor releases. - """ - - def __init__( - self, - *, - total: int | float | None, - desc: str | None, - unit: str | None, - disable: bool = False, - **kwargs, - ): - """Initialize a progress bar. - - *total* indicates the total number of work units. If None, the total - number of work units is unknown. If *disable* is True, the progress bar - should be disabled. *unit* is a description of the work unit. - *desc* is a description of the overall task to be performed. - - Unrecognized keyword arguments must be ignored, as the list of keyword - arguments may grow with time. - """ - - def __enter__(self): - """Enter a progress bar context.""" - - def __exit__(self, *args): - """Exit a progress bar context.""" - - def update(self, n=1): - """Update the progress bar by an increment. - - For use within a progress bar context. - """ - - @hookspec(firstresult=True) def get_executor(progressbar_class: type[ProgressBar]) -> Executor: """Called to obtain an object that manages parallel execution.