mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-18 19:47:48 -04:00
Reorganize progress bars so they can be typed properly
This commit is contained in:
@@ -177,6 +177,9 @@ Custom command line arguments
|
||||
Execution and progress reporting
|
||||
--------------------------------
|
||||
|
||||
.. autoclass:: ocrmypdf.pluginspec.ProgressBar
|
||||
:members:
|
||||
|
||||
.. autoclass:: ocrmypdf.pluginspec.Executor
|
||||
:members:
|
||||
|
||||
|
||||
@@ -8,29 +8,17 @@ from __future__ import annotations
|
||||
import threading
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Iterable
|
||||
from typing import Callable
|
||||
from typing import Callable, TypeVar
|
||||
|
||||
from ocrmypdf._progressbar import NullProgressBar, ProgressBar
|
||||
|
||||
T = TypeVar('T')
|
||||
|
||||
|
||||
def _task_noop(*_args, **_kwargs):
|
||||
return
|
||||
|
||||
|
||||
class NullProgressBar:
|
||||
"""Progress bar API that takes no actions."""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
pass
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
return False
|
||||
|
||||
def update(self, _arg=None):
|
||||
return
|
||||
|
||||
|
||||
class Executor(ABC):
|
||||
"""Abstract concurrent executor."""
|
||||
|
||||
@@ -48,9 +36,9 @@ class Executor(ABC):
|
||||
max_workers: int,
|
||||
progress_kwargs: dict,
|
||||
worker_initializer: Callable | None = None,
|
||||
task: Callable | None = None,
|
||||
task: Callable[..., T] | None = None,
|
||||
task_arguments: Iterable | None = None,
|
||||
task_finished: Callable | None = None,
|
||||
task_finished: Callable[[T, ProgressBar], None] | None = None,
|
||||
) -> None:
|
||||
"""Set up parallel execution and progress reporting.
|
||||
|
||||
|
||||
@@ -9,15 +9,6 @@ import logging
|
||||
|
||||
from rich.console import Console
|
||||
from rich.logging import RichHandler
|
||||
from rich.progress import (
|
||||
BarColumn,
|
||||
MofNCompleteColumn,
|
||||
Progress,
|
||||
TaskProgressColumn,
|
||||
TextColumn,
|
||||
TimeRemainingColumn,
|
||||
)
|
||||
from rich.table import Column
|
||||
|
||||
|
||||
class PageNumberFilter(logging.Filter):
|
||||
@@ -37,56 +28,3 @@ class RichLoggingHandler(RichHandler):
|
||||
super().__init__(
|
||||
console=console, show_level=False, show_time=False, markup=True, **kwargs
|
||||
)
|
||||
|
||||
|
||||
class RichProgressBar:
|
||||
"""Display progress bar using rich."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
console: Console,
|
||||
desc: str,
|
||||
total: float | None = None,
|
||||
unit: str | None = None,
|
||||
unit_scale: float | None = 1.0,
|
||||
disable: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
self.progress = Progress(
|
||||
TextColumn(
|
||||
"[progress.description]{task.description}",
|
||||
table_column=Column(min_width=20),
|
||||
),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
MofNCompleteColumn(),
|
||||
TimeRemainingColumn(),
|
||||
console=console,
|
||||
auto_refresh=True,
|
||||
redirect_stderr=True,
|
||||
redirect_stdout=False,
|
||||
disable=disable,
|
||||
**kwargs,
|
||||
)
|
||||
self.unit_scale = unit_scale
|
||||
self.progress_bar = self.progress.add_task(
|
||||
desc,
|
||||
total=total * self.unit_scale
|
||||
if total is not None and self.unit_scale is not None
|
||||
else None,
|
||||
unit=unit,
|
||||
)
|
||||
|
||||
def __enter__(self):
|
||||
self.progress.start()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self.progress.refresh()
|
||||
self.progress.stop()
|
||||
return False
|
||||
|
||||
def update(self, value=None):
|
||||
advance = self.unit_scale if value is None else value
|
||||
self.progress.update(self.progress_bar, advance=advance)
|
||||
|
||||
@@ -33,6 +33,7 @@ from ocrmypdf._pipelines._common import (
|
||||
worker_init,
|
||||
)
|
||||
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
|
||||
from ocrmypdf._progressbar import ProgressBar
|
||||
from ocrmypdf.exceptions import ExitCode
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@@ -60,7 +61,7 @@ def exec_hocr_to_ocr_pdf(context: PdfContext, executor: Executor) -> Sequence[st
|
||||
|
||||
ocrgraft = OcrGrafter(context)
|
||||
|
||||
def graft_page(result: HOCRResult, pbar):
|
||||
def graft_page(result: HOCRResult, pbar: ProgressBar):
|
||||
"""After OCR is complete for a page, update the PDF."""
|
||||
try:
|
||||
set_thread_pageno(result.pageno + 1)
|
||||
|
||||
@@ -44,6 +44,7 @@ from ocrmypdf._pipelines._common import (
|
||||
worker_init,
|
||||
)
|
||||
from ocrmypdf._plugin_manager import OcrmypdfPluginManager
|
||||
from ocrmypdf._progressbar import ProgressBar
|
||||
from ocrmypdf._validation import (
|
||||
check_requested_output_file,
|
||||
create_input_file,
|
||||
@@ -100,7 +101,7 @@ def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]:
|
||||
sidecars: list[Path | None] = [None] * len(context.pdfinfo)
|
||||
ocrgraft = OcrGrafter(context)
|
||||
|
||||
def update_page(result: PageResult, pbar):
|
||||
def update_page(result: PageResult, pbar: ProgressBar):
|
||||
"""After OCR is complete for a page, update the PDF."""
|
||||
try:
|
||||
set_thread_pageno(result.pageno + 1)
|
||||
|
||||
135
src/ocrmypdf/_progressbar.py
Normal file
135
src/ocrmypdf/_progressbar.py
Normal file
@@ -0,0 +1,135 @@
|
||||
from typing import Protocol
|
||||
|
||||
from rich.console import Console
|
||||
from rich.progress import (
|
||||
BarColumn,
|
||||
MofNCompleteColumn,
|
||||
Progress,
|
||||
TaskProgressColumn,
|
||||
TextColumn,
|
||||
TimeRemainingColumn,
|
||||
)
|
||||
from rich.table import Column
|
||||
|
||||
|
||||
class ProgressBar(Protocol):
|
||||
"""The protocol that OCRmyPDF expects progress bar classes to be compatible with.
|
||||
|
||||
In practice this could be used for any type of monitoring, not just a progress bar.
|
||||
|
||||
Calling the class should return a new progress bar object, which is activated
|
||||
with ``__enter__`` and terminated with ``__exit__``. An update method is called
|
||||
whenever the progress bar is updated. Progress bar objects will not be reused;
|
||||
a new one will be created for each group of tasks.
|
||||
|
||||
The progress bar is held in the main process/thread and not updated by child
|
||||
process/threads. When a child notifies the parent of completed work, the
|
||||
parent updates the progress bar.
|
||||
|
||||
Progress bars should never write to ``sys.stdout``, or they will corrupt the
|
||||
output if OCRmyPDF writes a PDF to standard output.
|
||||
|
||||
The type of events that OCRmyPDF reports to a progress bar may change in
|
||||
minor releases.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
total: int | float | None,
|
||||
desc: str | None,
|
||||
unit: str | None,
|
||||
disable: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize a progress bar.
|
||||
|
||||
*total* indicates the total number of work units. If None, the total
|
||||
number of work units is unknown. If *disable* is True, the progress bar
|
||||
should be disabled. *unit* is a description of the work unit.
|
||||
*desc* is a description of the overall task to be performed.
|
||||
|
||||
Unrecognized keyword arguments must be ignored, as the list of keyword
|
||||
arguments may grow with time.
|
||||
"""
|
||||
|
||||
def __enter__(self):
|
||||
"""Enter a progress bar context."""
|
||||
|
||||
def __exit__(self, *args):
|
||||
"""Exit a progress bar context."""
|
||||
|
||||
def update(self, n=1):
|
||||
"""Update the progress bar by an increment.
|
||||
|
||||
For use within a progress bar context.
|
||||
"""
|
||||
|
||||
|
||||
class NullProgressBar:
|
||||
"""Progress bar API that takes no actions."""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
pass
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
return False
|
||||
|
||||
def update(self, _arg=None):
|
||||
return
|
||||
|
||||
|
||||
class RichProgressBar:
|
||||
"""Display progress bar using rich."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
console: Console,
|
||||
desc: str,
|
||||
total: float | None = None,
|
||||
unit: str | None = None,
|
||||
unit_scale: float | None = 1.0,
|
||||
disable: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
self.progress = Progress(
|
||||
TextColumn(
|
||||
"[progress.description]{task.description}",
|
||||
table_column=Column(min_width=20),
|
||||
),
|
||||
BarColumn(),
|
||||
TaskProgressColumn(),
|
||||
MofNCompleteColumn(),
|
||||
TimeRemainingColumn(),
|
||||
console=console,
|
||||
auto_refresh=True,
|
||||
redirect_stderr=True,
|
||||
redirect_stdout=False,
|
||||
disable=disable,
|
||||
**kwargs,
|
||||
)
|
||||
self.unit_scale = unit_scale
|
||||
self.progress_bar = self.progress.add_task(
|
||||
desc,
|
||||
total=total * self.unit_scale
|
||||
if total is not None and self.unit_scale is not None
|
||||
else None,
|
||||
unit=unit,
|
||||
)
|
||||
|
||||
def __enter__(self):
|
||||
self.progress.start()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
self.progress.refresh()
|
||||
self.progress.stop()
|
||||
return False
|
||||
|
||||
def update(self, value=None):
|
||||
advance = self.unit_scale if value is None else value
|
||||
self.progress.update(self.progress_bar, advance=advance)
|
||||
@@ -20,7 +20,8 @@ from typing import Callable, Union
|
||||
from rich.console import Console as RichConsole
|
||||
|
||||
from ocrmypdf import Executor, hookimpl
|
||||
from ocrmypdf._logging import RichLoggingHandler, RichProgressBar
|
||||
from ocrmypdf._logging import RichLoggingHandler
|
||||
from ocrmypdf._progressbar import RichProgressBar
|
||||
from ocrmypdf.exceptions import InputFileError
|
||||
from ocrmypdf.helpers import remove_all_log_handlers
|
||||
|
||||
|
||||
@@ -35,6 +35,7 @@ from PIL import Image
|
||||
from ocrmypdf._concurrent import Executor, SerialExecutor
|
||||
from ocrmypdf._exec import jbig2enc, pngquant
|
||||
from ocrmypdf._jobcontext import PdfContext
|
||||
from ocrmypdf._progressbar import ProgressBar
|
||||
from ocrmypdf.exceptions import OutputFileAccessError
|
||||
from ocrmypdf.helpers import IMG2PDF_KWARGS, safe_symlink
|
||||
|
||||
@@ -478,7 +479,7 @@ def transcode_jpegs(
|
||||
opt_jpg = in_jpg.with_suffix('.opt.jpg')
|
||||
yield xref, in_jpg, opt_jpg, options.jpeg_quality
|
||||
|
||||
def finish_jpeg(result: tuple[Xref, Path | None], pbar):
|
||||
def finish_jpeg(result: tuple[Xref, Path | None], pbar: ProgressBar):
|
||||
xref, opt_jpg = result
|
||||
if opt_jpg:
|
||||
compdata = opt_jpg.read_bytes() # JPEG can be inserted into PDF as is
|
||||
@@ -552,7 +553,7 @@ def deflate_jpegs(pdf: Pdf, root: Path, options, executor: Executor) -> None:
|
||||
for xref in jpegs:
|
||||
yield pdf, lock, xref, complevel
|
||||
|
||||
def finish(result, pbar):
|
||||
def finish(result: tuple[Xref, bytes], pbar: ProgressBar):
|
||||
xref, compdata = result
|
||||
if len(compdata) > 0:
|
||||
with lock:
|
||||
|
||||
@@ -38,6 +38,7 @@ from pikepdf import (
|
||||
)
|
||||
|
||||
from ocrmypdf._concurrent import Executor, SerialExecutor
|
||||
from ocrmypdf._progressbar import ProgressBar
|
||||
from ocrmypdf.exceptions import EncryptedPdfError, InputFileError
|
||||
from ocrmypdf.helpers import Resolution, available_cpu_count, pikepdf_enable_mmap
|
||||
from ocrmypdf.pdfinfo.layout import LTStateAwareChar, get_page_analysis, get_text_boxes
|
||||
@@ -694,13 +695,14 @@ def _pdf_pageinfo_sync_init(pdf: Pdf, infile: Path, pdfminer_loglevel):
|
||||
atexit.register(on_process_close)
|
||||
|
||||
|
||||
def _pdf_pageinfo_sync(pageno, thread_pdf, infile, check_pages, detailed_analysis):
|
||||
def _pdf_pageinfo_sync(
|
||||
pageno, thread_pdf, infile, check_pages, detailed_analysis
|
||||
) -> PageInfo:
|
||||
pdf = thread_pdf if thread_pdf is not None else worker_pdf
|
||||
with ExitStack() as stack:
|
||||
if not pdf: # When called with SerialExecutor
|
||||
pdf = stack.enter_context(Pdf.open(infile))
|
||||
page = PageInfo(pdf, pageno, infile, check_pages, detailed_analysis)
|
||||
return page
|
||||
return PageInfo(pdf, pageno, infile, check_pages, detailed_analysis)
|
||||
|
||||
|
||||
def _pdf_pageinfo_concurrent(
|
||||
@@ -715,8 +717,7 @@ def _pdf_pageinfo_concurrent(
|
||||
) -> Sequence[PageInfo | None]:
|
||||
pages: Sequence[PageInfo | None] = [None] * len(pdf.pages)
|
||||
|
||||
def update_pageinfo(result, pbar):
|
||||
page = result
|
||||
def update_pageinfo(page: PageInfo, pbar: ProgressBar):
|
||||
if not page:
|
||||
raise InputFileError("Could read a page in the PDF")
|
||||
pages[page.pageno] = page
|
||||
|
||||
@@ -10,11 +10,12 @@ from argparse import ArgumentParser, Namespace
|
||||
from collections.abc import Sequence, Set
|
||||
from logging import Handler
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, NamedTuple, Protocol
|
||||
from typing import TYPE_CHECKING, NamedTuple
|
||||
|
||||
import pluggy
|
||||
|
||||
from ocrmypdf import Executor, PdfContext
|
||||
from ocrmypdf._progressbar import ProgressBar
|
||||
from ocrmypdf.helpers import Resolution
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -108,60 +109,6 @@ def check_options(options: Namespace) -> None:
|
||||
"""
|
||||
|
||||
|
||||
class ProgressBar(Protocol):
|
||||
"""The protocol that OCRmyPDF expects progress bar classes to be compatible with.
|
||||
|
||||
In practice this could be used for any type of monitoring, not just a progress bar.
|
||||
|
||||
Calling the class should return a new progress bar object, which is activated
|
||||
with ``__enter__`` and terminated with ``__exit__``. An update method is called
|
||||
whenever the progress bar is updated. Progress bar objects will not be reused;
|
||||
a new one will be created for each group of tasks.
|
||||
|
||||
The progress bar is held in the main process/thread and not updated by child
|
||||
process/threads. When a child notifies the parent of completed work, the
|
||||
parent updates the progress bar.
|
||||
|
||||
Progress bars should never write to ``sys.stdout``, or they will corrupt the
|
||||
output if OCRmyPDF writes a PDF to standard output.
|
||||
|
||||
The type of events that OCRmyPDF reports to a progress bar may change in
|
||||
minor releases.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
total: int | float | None,
|
||||
desc: str | None,
|
||||
unit: str | None,
|
||||
disable: bool = False,
|
||||
**kwargs,
|
||||
):
|
||||
"""Initialize a progress bar.
|
||||
|
||||
*total* indicates the total number of work units. If None, the total
|
||||
number of work units is unknown. If *disable* is True, the progress bar
|
||||
should be disabled. *unit* is a description of the work unit.
|
||||
*desc* is a description of the overall task to be performed.
|
||||
|
||||
Unrecognized keyword arguments must be ignored, as the list of keyword
|
||||
arguments may grow with time.
|
||||
"""
|
||||
|
||||
def __enter__(self):
|
||||
"""Enter a progress bar context."""
|
||||
|
||||
def __exit__(self, *args):
|
||||
"""Exit a progress bar context."""
|
||||
|
||||
def update(self, n=1):
|
||||
"""Update the progress bar by an increment.
|
||||
|
||||
For use within a progress bar context.
|
||||
"""
|
||||
|
||||
|
||||
@hookspec(firstresult=True)
|
||||
def get_executor(progressbar_class: type[ProgressBar]) -> Executor:
|
||||
"""Called to obtain an object that manages parallel execution.
|
||||
|
||||
Reference in New Issue
Block a user