Reorganize progress bars so they can be typed properly

This commit is contained in:
James R. Barlow
2023-10-19 00:42:10 -07:00
parent 46a279a49a
commit d2dbea6cf8
10 changed files with 162 additions and 146 deletions

View File

@@ -177,6 +177,9 @@ Custom command line arguments
Execution and progress reporting
--------------------------------
.. autoclass:: ocrmypdf.pluginspec.ProgressBar
:members:
.. autoclass:: ocrmypdf.pluginspec.Executor
:members:

View File

@@ -8,29 +8,17 @@ from __future__ import annotations
import threading
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import Callable
from typing import Callable, TypeVar
from ocrmypdf._progressbar import NullProgressBar, ProgressBar
T = TypeVar('T')
def _task_noop(*_args, **_kwargs):
return
class NullProgressBar:
    """Stand-in progress bar whose every operation is a no-op."""

    def __init__(self, **kwargs):
        """Accept and discard any keyword arguments."""

    def __enter__(self):
        """Return this object as the context value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Return False so an exception raised inside the block propagates."""
        return False

    def update(self, _arg=None):
        """Ignore the optional argument and return None."""
        return None
class Executor(ABC):
"""Abstract concurrent executor."""
@@ -48,9 +36,9 @@ class Executor(ABC):
max_workers: int,
progress_kwargs: dict,
worker_initializer: Callable | None = None,
task: Callable | None = None,
task: Callable[..., T] | None = None,
task_arguments: Iterable | None = None,
task_finished: Callable | None = None,
task_finished: Callable[[T, ProgressBar], None] | None = None,
) -> None:
"""Set up parallel execution and progress reporting.

View File

@@ -9,15 +9,6 @@ import logging
from rich.console import Console
from rich.logging import RichHandler
from rich.progress import (
BarColumn,
MofNCompleteColumn,
Progress,
TaskProgressColumn,
TextColumn,
TimeRemainingColumn,
)
from rich.table import Column
class PageNumberFilter(logging.Filter):
@@ -37,56 +28,3 @@ class RichLoggingHandler(RichHandler):
super().__init__(
console=console, show_level=False, show_time=False, markup=True, **kwargs
)
class RichProgressBar:
    """Progress bar rendered through a ``rich`` ``Progress`` display."""

    def __init__(
        self,
        *,
        console: Console,
        desc: str,
        total: float | None = None,
        unit: str | None = None,
        unit_scale: float | None = 1.0,
        disable: bool = False,
        **kwargs,
    ):
        """Build the ``Progress`` display and register its single task.

        *console* and *disable* are handed to ``Progress``, as are any extra
        keyword arguments. *desc* becomes the task description and *unit* is
        forwarded to ``add_task``. *total* is multiplied by *unit_scale*
        before being registered; when either of the two is None the task is
        registered with ``total=None``.
        """
        description_column = TextColumn(
            "[progress.description]{task.description}",
            table_column=Column(min_width=20),
        )
        self.progress = Progress(
            description_column,
            BarColumn(),
            TaskProgressColumn(),
            MofNCompleteColumn(),
            TimeRemainingColumn(),
            console=console,
            auto_refresh=True,
            redirect_stderr=True,
            redirect_stdout=False,
            disable=disable,
            **kwargs,
        )
        self.unit_scale = unit_scale

        # The registered total is expressed in scaled units.
        if total is None or self.unit_scale is None:
            scaled_total = None
        else:
            scaled_total = total * self.unit_scale
        self.progress_bar = self.progress.add_task(
            desc, total=scaled_total, unit=unit
        )

    def __enter__(self):
        """Start the display and return this object."""
        self.progress.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Refresh once more, stop the display, and let exceptions propagate."""
        self.progress.refresh()
        self.progress.stop()
        return False

    def update(self, value=None):
        """Advance the task by *value*, or by ``unit_scale`` when *value* is None."""
        step = value if value is not None else self.unit_scale
        self.progress.update(self.progress_bar, advance=step)

View File

@@ -33,6 +33,7 @@ from ocrmypdf._pipelines._common import (
worker_init,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.exceptions import ExitCode
log = logging.getLogger(__name__)
@@ -60,7 +61,7 @@ def exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[st
ocrgraft = OcrGrafter(context)
def graft_page(result: HOCRResult, pbar):
def graft_page(result: HOCRResult, pbar: ProgressBar):
"""After OCR is complete for a page, update the PDF."""
try:
set_thread_pageno(result.pageno + 1)

View File

@@ -44,6 +44,7 @@ from ocrmypdf._pipelines._common import (
worker_init,
)
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf._validation import (
check_requested_output_file,
create_input_file,
@@ -100,7 +101,7 @@ def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]:
sidecars: list[Path | None] = [None] * len(context.pdfinfo)
ocrgraft = OcrGrafter(context)
def update_page(result: PageResult, pbar):
def update_page(result: PageResult, pbar: ProgressBar):
"""After OCR is complete for a page, update the PDF."""
try:
set_thread_pageno(result.pageno + 1)

View File

@@ -0,0 +1,135 @@
from typing import Protocol
from rich.console import Console
from rich.progress import (
BarColumn,
MofNCompleteColumn,
Progress,
TaskProgressColumn,
TextColumn,
TimeRemainingColumn,
)
from rich.table import Column
class ProgressBar(Protocol):
    """The protocol that OCRmyPDF expects progress bar classes to be compatible with.

    In practice this could be used for any type of monitoring, not just a
    progress bar.

    Calling the class should return a new progress bar object, which is activated
    with ``__enter__`` and terminated with ``__exit__``. An update method is called
    whenever the progress bar is updated. Progress bar objects will not be reused;
    a new one will be created for each group of tasks.

    The progress bar is held in the main process/thread and not updated by child
    process/threads. When a child notifies the parent of completed work, the
    parent updates the progress bar.

    Progress bars should never write to ``sys.stdout``, or they will corrupt the
    output if OCRmyPDF writes a PDF to standard output.

    The type of events that OCRmyPDF reports to a progress bar may change in
    minor releases.
    """

    def __init__(
        self,
        *,
        total: int | float | None,
        desc: str | None,
        unit: str | None,
        disable: bool = False,
        **kwargs,
    ):
        """Initialize a progress bar.

        All arguments are keyword-only.

        Args:
            total: The total number of work units. If None, the total number
                of work units is unknown.
            desc: A description of the overall task to be performed.
            unit: A description of the work unit.
            disable: If True, the progress bar should be disabled.
            **kwargs: Unrecognized keyword arguments must be ignored, as the
                list of keyword arguments may grow with time.
        """

    def __enter__(self):
        """Enter a progress bar context."""

    def __exit__(self, *args):
        """Exit a progress bar context."""

    def update(self, n=1):
        """Update the progress bar by an increment.

        For use within a progress bar context.

        Args:
            n: The size of the increment; defaults to 1.
        """
class NullProgressBar:
    """Stand-in progress bar whose every operation is a no-op."""

    def __init__(self, **kwargs):
        """Accept and discard any keyword arguments."""

    def __enter__(self):
        """Return this object as the context value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Return False so an exception raised inside the block propagates."""
        return False

    def update(self, _arg=None):
        """Ignore the optional argument and return None."""
        return None
class RichProgressBar:
    """Progress bar rendered through a ``rich`` ``Progress`` display."""

    def __init__(
        self,
        *,
        console: Console,
        desc: str,
        total: float | None = None,
        unit: str | None = None,
        unit_scale: float | None = 1.0,
        disable: bool = False,
        **kwargs,
    ):
        """Build the ``Progress`` display and register its single task.

        *console* and *disable* are handed to ``Progress``, as are any extra
        keyword arguments. *desc* becomes the task description and *unit* is
        forwarded to ``add_task``. *total* is multiplied by *unit_scale*
        before being registered; when either of the two is None the task is
        registered with ``total=None``.
        """
        description_column = TextColumn(
            "[progress.description]{task.description}",
            table_column=Column(min_width=20),
        )
        self.progress = Progress(
            description_column,
            BarColumn(),
            TaskProgressColumn(),
            MofNCompleteColumn(),
            TimeRemainingColumn(),
            console=console,
            auto_refresh=True,
            redirect_stderr=True,
            redirect_stdout=False,
            disable=disable,
            **kwargs,
        )
        self.unit_scale = unit_scale

        # The registered total is expressed in scaled units.
        if total is None or self.unit_scale is None:
            scaled_total = None
        else:
            scaled_total = total * self.unit_scale
        self.progress_bar = self.progress.add_task(
            desc, total=scaled_total, unit=unit
        )

    def __enter__(self):
        """Start the display and return this object."""
        self.progress.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Refresh once more, stop the display, and let exceptions propagate."""
        self.progress.refresh()
        self.progress.stop()
        return False

    def update(self, value=None):
        """Advance the task by *value*, or by ``unit_scale`` when *value* is None."""
        step = value if value is not None else self.unit_scale
        self.progress.update(self.progress_bar, advance=step)

View File

@@ -20,7 +20,8 @@ from typing import Callable, Union
from rich.console import Console as RichConsole
from ocrmypdf import Executor, hookimpl
from ocrmypdf._logging import RichLoggingHandler, RichProgressBar
from ocrmypdf._logging import RichLoggingHandler
from ocrmypdf._progressbar import RichProgressBar
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import remove_all_log_handlers

View File

@@ -35,6 +35,7 @@ from PIL import Image
from ocrmypdf._concurrent import Executor, SerialExecutor
from ocrmypdf._exec import jbig2enc, pngquant
from ocrmypdf._jobcontext import PdfContext
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.exceptions import OutputFileAccessError
from ocrmypdf.helpers import IMG2PDF_KWARGS, safe_symlink
@@ -478,7 +479,7 @@ def transcode_jpegs(
opt_jpg = in_jpg.with_suffix('.opt.jpg')
yield xref, in_jpg, opt_jpg, options.jpeg_quality
def finish_jpeg(result: tuple[Xref, Path | None], pbar):
def finish_jpeg(result: tuple[Xref, Path | None], pbar: ProgressBar):
xref, opt_jpg = result
if opt_jpg:
compdata = opt_jpg.read_bytes() # JPEG can inserted into PDF as is
@@ -552,7 +553,7 @@ def deflate_jpegs(pdf: Pdf, root: Path, options, executor: Executor) -> None:
for xref in jpegs:
yield pdf, lock, xref, complevel
def finish(result, pbar):
def finish(result: tuple[Xref, bytes], pbar: ProgressBar):
xref, compdata = result
if len(compdata) > 0:
with lock:

View File

@@ -38,6 +38,7 @@ from pikepdf import (
)
from ocrmypdf._concurrent import Executor, SerialExecutor
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.exceptions import EncryptedPdfError, InputFileError
from ocrmypdf.helpers import Resolution, available_cpu_count, pikepdf_enable_mmap
from ocrmypdf.pdfinfo.layout import LTStateAwareChar, get_page_analysis, get_text_boxes
@@ -694,13 +695,14 @@ def _pdf_pageinfo_sync_init(pdf: Pdf, infile: Path, pdfminer_loglevel):
atexit.register(on_process_close)
def _pdf_pageinfo_sync(pageno, thread_pdf, infile, check_pages, detailed_analysis):
def _pdf_pageinfo_sync(
pageno, thread_pdf, infile, check_pages, detailed_analysis
) -> PageInfo:
pdf = thread_pdf if thread_pdf is not None else worker_pdf
with ExitStack() as stack:
if not pdf: # When called with SerialExecutor
pdf = stack.enter_context(Pdf.open(infile))
page = PageInfo(pdf, pageno, infile, check_pages, detailed_analysis)
return page
return PageInfo(pdf, pageno, infile, check_pages, detailed_analysis)
def _pdf_pageinfo_concurrent(
@@ -715,8 +717,7 @@ def _pdf_pageinfo_concurrent(
) -> Sequence[PageInfo | None]:
pages: Sequence[PageInfo | None] = [None] * len(pdf.pages)
def update_pageinfo(result, pbar):
page = result
def update_pageinfo(page: PageInfo, pbar: ProgressBar):
if not page:
raise InputFileError("Could read a page in the PDF")
pages[page.pageno] = page

View File

@@ -10,11 +10,12 @@ from argparse import ArgumentParser, Namespace
from collections.abc import Sequence, Set
from logging import Handler
from pathlib import Path
from typing import TYPE_CHECKING, NamedTuple, Protocol
from typing import TYPE_CHECKING, NamedTuple
import pluggy
from ocrmypdf import Executor, PdfContext
from ocrmypdf._progressbar import ProgressBar
from ocrmypdf.helpers import Resolution
if TYPE_CHECKING:
@@ -108,60 +109,6 @@ def check_options(options: Namespace) -> None:
"""
class ProgressBar(Protocol):
    """The protocol that OCRmyPDF expects progress bar classes to be compatible with.

    In practice this could be used for any type of monitoring, not just a
    progress bar.

    Calling the class should return a new progress bar object, which is activated
    with ``__enter__`` and terminated with ``__exit__``. An update method is called
    whenever the progress bar is updated. Progress bar objects will not be reused;
    a new one will be created for each group of tasks.

    The progress bar is held in the main process/thread and not updated by child
    process/threads. When a child notifies the parent of completed work, the
    parent updates the progress bar.

    Progress bars should never write to ``sys.stdout``, or they will corrupt the
    output if OCRmyPDF writes a PDF to standard output.

    The type of events that OCRmyPDF reports to a progress bar may change in
    minor releases.
    """

    def __init__(
        self,
        *,
        total: int | float | None,
        desc: str | None,
        unit: str | None,
        disable: bool = False,
        **kwargs,
    ):
        """Initialize a progress bar.

        All arguments are keyword-only.

        Args:
            total: The total number of work units. If None, the total number
                of work units is unknown.
            desc: A description of the overall task to be performed.
            unit: A description of the work unit.
            disable: If True, the progress bar should be disabled.
            **kwargs: Unrecognized keyword arguments must be ignored, as the
                list of keyword arguments may grow with time.
        """

    def __enter__(self):
        """Enter a progress bar context."""

    def __exit__(self, *args):
        """Exit a progress bar context."""

    def update(self, n=1):
        """Update the progress bar by an increment.

        For use within a progress bar context.

        Args:
            n: The size of the increment; defaults to 1.
        """
@hookspec(firstresult=True)
def get_executor(progressbar_class: type[ProgressBar]) -> Executor:
"""Called to obtain an object that manages parallel execution.