diff --git a/misc/batch.py b/misc/batch.py index d8a04dd0..3d0b0cb6 100644 --- a/misc/batch.py +++ b/misc/batch.py @@ -6,7 +6,6 @@ from __future__ import annotations # This script must be edited to meet your needs. import logging -import os import sys from pathlib import Path diff --git a/misc/example_plugin.py b/misc/example_plugin.py index b8f44fb8..f370f832 100644 --- a/misc/example_plugin.py +++ b/misc/example_plugin.py @@ -1,8 +1,7 @@ # SPDX-FileCopyrightText: 2022 James R Barlow: https://github.com/jbarlow83 # SPDX-License-Identifier: MIT -""" -An example of an OCRmyPDF plugin. +"""An example of an OCRmyPDF plugin. This plugin adds two new command line arguments --grayscale-ocr: converts the image to grayscale before performing OCR on it diff --git a/misc/synology.py b/misc/synology.py index fe45b797..e7578974 100644 --- a/misc/synology.py +++ b/misc/synology.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: 2017 Enantiomerie # SPDX-License-Identifier: MIT -"""Example OCRmyPDF for Synology NAS""" +"""Example OCRmyPDF for Synology NAS.""" from __future__ import annotations diff --git a/misc/webservice.py b/misc/webservice.py index f21bd15b..4d126a44 100644 --- a/misc/webservice.py +++ b/misc/webservice.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: 2019 James R. Barlow # SPDX-License-Identifier: AGPL-3.0-or-later -"""This is a simple web service/HTTP wrapper for OCRmyPDF +"""This is a simple web service/HTTP wrapper for OCRmyPDF. This may be more convenient than the command line tool for some Docker users. Note that OCRmyPDF uses Ghostscript, which is licensed under AGPLv3+. While @@ -15,7 +15,7 @@ from __future__ import annotations import os import shlex -from subprocess import PIPE, run +from subprocess import run from tempfile import TemporaryDirectory from flask import Flask, Response, request, send_from_directory diff --git a/pyproject.toml b/pyproject.toml index 4ee6f0fe..e1f97ccb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -184,14 +184,19 @@ module = [ ] ignore_missing_imports = true -[tool.pylint.basic] -good-names = ["i", "j", "k", "ex", "Run", "_", "e", "p", "im", "w", "h", "m", "x", "y", "a", "b", "fp", "n", "f", "s", "v", "q", "dx", "dy"] -logging-format-style = "old" -disable = ["raw-checker-failed", "bad-inline-option", "locally-disabled", "file-ignored", "suppressed-message", "useless-suppression", "deprecated-pragma", "use-symbolic-message-instead", "logging-fstring-interpolation", "missing-function-docstring", "too-few-public-methods"] - [tool.ruff] -src = ["src"] -select = ["E"] +select = [ + "D", # pydocstyle + "E", # pycodestyle + "W", # pycodestyle + "F", # pyflakes + "I001", # isort + "UP", # pyupgrade +] +target-version = "py38" + +[tool.ruff.isort] +known-first-party = ["ocrmypdf"] [tool.ruff.pydocstyle] convention = "google" \ No newline at end of file diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py index 24a64ae9..759307bc 100755 --- a/src/ocrmypdf/__main__.py +++ b/src/ocrmypdf/__main__.py @@ -11,7 +11,6 @@ import os import signal import sys from contextlib import suppress -from multiprocessing import set_start_method from ocrmypdf import __version__ from ocrmypdf._plugin_manager import get_parser_options_plugins diff --git a/src/ocrmypdf/_concurrent.py b/src/ocrmypdf/_concurrent.py index 47f089fe..51aa994b 100644 --- a/src/ocrmypdf/_concurrent.py +++ b/src/ocrmypdf/_concurrent.py @@ -51,8 +51,7 @@ class Executor(ABC): task_arguments: Iterable | None = None, task_finished: Callable | None = None, ) -> None: - """ - Set up parallel execution and progress reporting. + """Set up parallel execution and progress reporting. Args: use_threads: If ``False``, the workload is the sort that will benefit from @@ -73,7 +72,6 @@ class Executor(ABC): task. This runs in the parent's context, but the parameters must be marshallable to the worker. """ - if not task_arguments: return # Nothing to do! if not worker_initializer: diff --git a/src/ocrmypdf/_exec/__init__.py b/src/ocrmypdf/_exec/__init__.py index 15819b96..97d5365b 100644 --- a/src/ocrmypdf/_exec/__init__.py +++ b/src/ocrmypdf/_exec/__init__.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MPL-2.0 -"""Manage third party executables""" +"""Manage third party executables.""" from __future__ import annotations diff --git a/src/ocrmypdf/_exec/ghostscript.py b/src/ocrmypdf/_exec/ghostscript.py index ac5f28fa..5a488a3a 100644 --- a/src/ocrmypdf/_exec/ghostscript.py +++ b/src/ocrmypdf/_exec/ghostscript.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MPL-2.0 -"""Interface to Ghostscript executable""" +"""Interface to Ghostscript executable.""" from __future__ import annotations diff --git a/src/ocrmypdf/_exec/jbig2enc.py b/src/ocrmypdf/_exec/jbig2enc.py index 0f8f7392..9efba57f 100644 --- a/src/ocrmypdf/_exec/jbig2enc.py +++ b/src/ocrmypdf/_exec/jbig2enc.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MPL-2.0 -"""Interface to jbig2 executable""" +"""Interface to jbig2 executable.""" from __future__ import annotations diff --git a/src/ocrmypdf/_exec/pngquant.py b/src/ocrmypdf/_exec/pngquant.py index 64e91139..f399b2f1 100644 --- a/src/ocrmypdf/_exec/pngquant.py +++ b/src/ocrmypdf/_exec/pngquant.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MPL-2.0 -"""Interface to pngquant executable""" +"""Interface to pngquant executable.""" from __future__ import annotations diff --git a/src/ocrmypdf/_exec/tesseract.py b/src/ocrmypdf/_exec/tesseract.py index 99a0715a..cafd66a0 100644 --- a/src/ocrmypdf/_exec/tesseract.py +++ b/src/ocrmypdf/_exec/tesseract.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MPL-2.0 -"""Interface to Tesseract executable""" +"""Interface to Tesseract executable.""" from __future__ import annotations @@ -54,7 +54,7 @@ TESSERACT_THRESHOLDING_METHODS: dict[str, int] = { class TesseractLoggerAdapter(logging.LoggerAdapter): - "Prepend [tesseract] to messages emitted from tesseract" + "Prepend [tesseract] to messages emitted from tesseract." def process(self, msg, kwargs): kwargs['extra'] = self.extra @@ -283,7 +283,8 @@ def page_timedout(timeout: float) -> None: def _generate_null_hocr(output_hocr: Path, output_text: Path, image: Path) -> None: """Produce a .hocr file that reports no text detected on a page that is - the same size as the input image.""" + the same size as the input image. + """ with Image.open(image) as im: w, h = im.size diff --git a/src/ocrmypdf/_exec/unpaper.py b/src/ocrmypdf/_exec/unpaper.py index dd3a5116..9734f846 100644 --- a/src/ocrmypdf/_exec/unpaper.py +++ b/src/ocrmypdf/_exec/unpaper.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MPL-2.0 -"""Interface to unpaper executable""" +"""Interface to unpaper executable.""" from __future__ import annotations diff --git a/src/ocrmypdf/_graft.py b/src/ocrmypdf/_graft.py index 73a9cc92..500606d5 100644 --- a/src/ocrmypdf/_graft.py +++ b/src/ocrmypdf/_graft.py @@ -37,7 +37,6 @@ def _update_resources(*, obj, font, font_key, procset): obj can be a page or Form XObject. """ - resources = _ensure_dictionary(obj, Name.Resources) fonts = _ensure_dictionary(resources, Name.Font) if font_key is not None and font_key not in fonts: @@ -167,7 +166,6 @@ class OcrGrafter: the font to page 1 even if page 1 doesn't use it, so we have a way to get it back. """ - page0 = self.pdf_base.pages[0] _update_resources( obj=page0, font=self.font, font_key=self.font_key, procset=self.procset @@ -200,8 +198,7 @@ class OcrGrafter: return self.output_file def _find_font(self, text): - """Copy a font from the filename text into pdf_base""" - + """Copy a font from the filename text into pdf_base.""" font, font_key = None, None possible_font_names = ('/f-0-0', '/F1') try: @@ -234,8 +231,7 @@ class OcrGrafter: text_rotation: int, strip_old_text: bool, ): - """Insert the text layer from text page 0 on to pdf_base at page_num""" - + """Insert the text layer from text page 0 on to pdf_base at page_num.""" # pylint: disable=invalid-name log.debug("Grafting") diff --git a/src/ocrmypdf/_logging.py b/src/ocrmypdf/_logging.py index 533132bf..ef939d71 100644 --- a/src/ocrmypdf/_logging.py +++ b/src/ocrmypdf/_logging.py @@ -24,7 +24,7 @@ class PageNumberFilter(logging.Filter): class TqdmConsole: - """Wrapper to log messages in a way that is compatible with tqdm progress bar + """Wrapper to log messages in a way that is compatible with tqdm progress bar. This routes log messages through tqdm so that it can print them above the progress bar, and then refresh the progress bar, rather than overwriting diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index f284f399..121c4c49 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -121,7 +121,6 @@ def _pdf_guess_version(input_file: Path, search_window=1024) -> str: Returns empty string if not found, indicating file is probably not PDF. """ - with open(input_file, 'rb') as f: signature = f.read(search_window) m = re.search(br'%PDF-(\d\.\d)', signature) @@ -222,7 +221,7 @@ def _vector_page_dpi(pageinfo: PageInfo) -> int: def get_page_dpi(pageinfo: PageInfo, options) -> Resolution: - "Get the DPI when nonsquare DPI is tolerable" + "Get the DPI when nonsquare DPI is tolerable." xres = max( pageinfo.dpi.x or VECTOR_PAGE_DPI, options.oversample or 0.0, @@ -237,7 +236,7 @@ def get_page_dpi(pageinfo: PageInfo, options) -> Resolution: def get_page_square_dpi(pageinfo: PageInfo, options) -> Resolution: - "Get the DPI when we require xres == yres, scaled to physical units" + "Get the DPI when we require xres == yres, scaled to physical units." xres = pageinfo.dpi.x or 0.0 yres = pageinfo.dpi.y or 0.0 userunit = float(pageinfo.userunit) or 1.0 @@ -253,7 +252,7 @@ def get_page_square_dpi(pageinfo: PageInfo, options) -> Resolution: def get_canvas_square_dpi(pageinfo: PageInfo, options) -> Resolution: - """Get the DPI when we require xres == yres, in Postscript units""" + """Get the DPI when we require xres == yres, in Postscript units.""" units = float( max( (pageinfo.dpi.x) or VECTOR_PAGE_DPI, @@ -358,9 +357,7 @@ def rasterize_preview(input_file: Path, page_context: PageContext) -> Path: def describe_rotation(page_context: PageContext, orient_conf, correction: int) -> str: - """ - Describe the page rotation we are going to perform. - """ + """Describe the page rotation we are going to perform.""" direction = {0: '⇧', 90: '⇨', 180: '⇩', 270: '⇦'} turns = {0: ' ', 90: '⬏', 180: '↻', 270: '⬑'} @@ -401,7 +398,6 @@ def get_orientation_correction(preview: Path, page_context: PageContext) -> int: which points it (hopefully) upright. _graft.py takes care of the orienting the image and text layers. """ - orient_conf = page_context.plugin_manager.hook.get_ocr_engine().get_orientation( preview, page_context.options ) @@ -514,10 +510,11 @@ def preprocess_clean(input_file: Path, page_context: PageContext) -> Path: def create_ocr_image(image: Path, page_context: PageContext) -> Path: - """Create the image we send for OCR. May not be the same as the display - image depending on preprocessing. This image will never be shown to the - user.""" + """Create the image we send for OCR. + Might not be the same as the display image depending on preprocessing. + This image will never be shown to the user. + """ output_file = page_context.get_path('ocr.png') options = page_context.options with Image.open(image) as im: diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index 8521d1c6..54cb0e08 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -251,8 +251,7 @@ def worker_init(max_pixels: int) -> None: def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]: - """Execute the pipeline concurrently""" - + """Execute the pipeline concurrently.""" # Run exec_page_sync on every page context options = context.options max_workers = min(len(context.pdfinfo), options.jobs) @@ -316,8 +315,7 @@ def exec_concurrent(context: PdfContext, executor: Executor) -> Sequence[str]: def configure_debug_logging( log_filename: Path, prefix: str = '' ) -> logging.FileHandler: - """ - Create a debug log file at a specified location. + """Create a debug log file at a specified location. Arguments: log_filename: Where to the put the log file. diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py index 52821869..f0bd7175 100644 --- a/src/ocrmypdf/api.py +++ b/src/ocrmypdf/api.py @@ -86,7 +86,6 @@ def configure_logging( Returns: The toplevel logger for ocrmypdf (or the root logger, if we are managing it). """ - prefix = '' if manage_root_logger else 'ocrmypdf' log = logging.getLogger(prefix) @@ -277,6 +276,8 @@ def ocr( # pylint: disable=unused-argument When a stream is used as output, whether via a writable object or ``"-"``, some final validation steps are not performed (we do not read back the stream after it is written). + + Raises: ocrmypdf.MissingDependencyError: If a required dependency program is missing or was not found on PATH. diff --git a/src/ocrmypdf/builtin_plugins/concurrency.py b/src/ocrmypdf/builtin_plugins/concurrency.py index 3c53b3e7..f5e69f0e 100644 --- a/src/ocrmypdf/builtin_plugins/concurrency.py +++ b/src/ocrmypdf/builtin_plugins/concurrency.py @@ -30,7 +30,7 @@ WorkerInit = Callable[[Queue, UserInit, int], None] def log_listener(q: Queue): - """Listen to the worker processes and forward the messages to logging + """Listen to the worker processes and forward the messages to logging. For simplicity this is a thread rather than a process. Only one process should actually write to sys.stderr or whatever we're using, so if this is @@ -39,7 +39,6 @@ def log_listener(q: Queue): See: https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes """ - while True: try: record = q.get() @@ -59,8 +58,7 @@ def process_sigbus(*args): def process_init(q: Queue, user_init: UserInit, loglevel) -> None: - """Initialize a process pool worker""" - + """Initialize a process pool worker.""" # Ignore SIGINT (our parent process will kill us gracefully) signal.signal(signal.SIGINT, signal.SIG_IGN) diff --git a/src/ocrmypdf/builtin_plugins/ghostscript.py b/src/ocrmypdf/builtin_plugins/ghostscript.py index 7ab8d7c9..60a3d52b 100644 --- a/src/ocrmypdf/builtin_plugins/ghostscript.py +++ b/src/ocrmypdf/builtin_plugins/ghostscript.py @@ -8,7 +8,6 @@ import logging from ocrmypdf import hookimpl from ocrmypdf._exec import ghostscript -from ocrmypdf._validation import HOCR_OK_LANGS from ocrmypdf.exceptions import MissingDependencyError from ocrmypdf.subprocess import check_external_program diff --git a/src/ocrmypdf/cli.py b/src/ocrmypdf/cli.py index d8e5181f..360dac51 100644 --- a/src/ocrmypdf/cli.py +++ b/src/ocrmypdf/cli.py @@ -15,7 +15,7 @@ T = TypeVar('T', int, float) def numeric(basetype: Callable[[Any], T], min_: T | None = None, max_: T | None = None): - """Validator for numeric params""" + """Validator for numeric params.""" min_ = basetype(min_) if min_ is not None else None max_ = basetype(max_) if max_ is not None else None @@ -46,7 +46,7 @@ def str_to_int(mapping: Mapping[str, int]): class ArgumentParser(argparse.ArgumentParser): - """Override parser's default behavior of calling sys.exit() + """Override parser's default behavior of calling sys.exit(). https://stackoverflow.com/questions/5943249/python-argparse-and-controlling-overriding-the-exit-status-code diff --git a/src/ocrmypdf/extra_plugins/semfree.py b/src/ocrmypdf/extra_plugins/semfree.py index 43e81654..89ff3a0c 100644 --- a/src/ocrmypdf/extra_plugins/semfree.py +++ b/src/ocrmypdf/extra_plugins/semfree.py @@ -73,8 +73,7 @@ class ConnectionLogHandler(logging.handlers.QueueHandler): def process_loop( conn: Connection, user_init: Callable[[], None], loglevel, task, task_args ): - """Initialize a process pool worker""" - + """Initialize a process pool worker.""" # Install SIGBUS handler (so our parent process can abort somewhat gracefully) with suppress(AttributeError): # Windows and Cygwin do not have SIGBUS # Windows and Cygwin do not have pthread_sigmask or SIGBUS diff --git a/src/ocrmypdf/helpers.py b/src/ocrmypdf/helpers.py index 8b038bac..965cc973 100644 --- a/src/ocrmypdf/helpers.py +++ b/src/ocrmypdf/helpers.py @@ -108,7 +108,7 @@ class Resolution(Generic[T]): class NeverRaise(Exception): - """An exception that is never raised""" + """An exception that is never raised.""" def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike): @@ -170,7 +170,7 @@ def monotonic(seq: Sequence) -> bool: def page_number(input_file: os.PathLike) -> int: - """Get one-based page number implied by filename (000002.pdf -> 2)""" + """Get one-based page number implied by filename (000002.pdf -> 2).""" return int(os.path.basename(os.fspath(input_file))[0:6]) diff --git a/src/ocrmypdf/hocrtransform.py b/src/ocrmypdf/hocrtransform.py index 306dfa24..28e33c01 100755 --- a/src/ocrmypdf/hocrtransform.py +++ b/src/ocrmypdf/hocrtransform.py @@ -99,11 +99,9 @@ class HocrTransformError(Exception): class HocrTransform: - - """ - A class for converting documents from the hOCR format. + """A class for converting documents from the hOCR format. For details of the hOCR format, see: - http://kba.cloud/hocr-spec/ + http://kba.cloud/hocr-spec/. """ box_pattern = re.compile(r'bbox((\s+\d+){4})') @@ -143,9 +141,7 @@ class HocrTransform: raise HocrTransformError("hocr file is missing page dimensions") def __str__(self): # pragma: no cover - """ - Return the textual content of the HTML body - """ + """Return the textual content of the HTML body.""" if self.hocr is None: return '' body = self.hocr.find(self._child_xpath('body')) @@ -155,9 +151,7 @@ class HocrTransform: return '' def _get_element_text(self, element: Element): - """ - Return the textual content of the element and its children - """ + """Return the textual content of the element and its children.""" text = '' if element.text is not None: text += element.text @@ -169,10 +163,7 @@ class HocrTransform: @classmethod def element_coordinates(cls, element: Element) -> Rect: - """ - Returns a tuple containing the coordinates of the bounding box around - an element - """ + """Get coordinates of the bounding box around an element.""" out = Rect._make(0 for _ in range(4)) if 'title' in element.attrib: matches = cls.box_pattern.search(element.attrib['title']) @@ -183,9 +174,7 @@ class HocrTransform: @classmethod def baseline(cls, element: Element) -> tuple[float, float]: - """ - Returns a tuple containing the baseline slope and intercept. - """ + """Get baseline's slope and intercept.""" if 'title' in element.attrib: matches = cls.baseline_pattern.search(element.attrib['title']) if matches: @@ -193,9 +182,7 @@ class HocrTransform: return (0.0, 0.0) def pt_from_pixel(self, pxl) -> Rect: - """ - Returns the quantity in PDF units (pt) given quantity in pixels - """ + """Returns the quantity in PDF units (pt) given quantity in pixels.""" return Rect._make((c / self.dpi * inch) for c in pxl) def _child_xpath(self, html_tag: str, html_class: str | None = None) -> str: @@ -206,11 +193,7 @@ class HocrTransform: @classmethod def replace_unsupported_chars(cls, s: str) -> str: - """ - Given an input string, returns the corresponding string that: - * is available in the Helvetica facetype - * does not contain any ligature (to allow easy search in the PDF file) - """ + """Replaces characters with those available in the Helvetica typeface.""" return s.translate(cls.ligatures) def topdown_position(self, element): @@ -231,8 +214,8 @@ class HocrTransform: invisible_text: bool = False, interword_spaces: bool = False, ) -> None: - """ - Creates a PDF file with an image superimposed on top of the text. + """Creates a PDF file with an image superimposed on top of the text. + Text is positioned according to the bounding box of the lines in the hOCR file. The image need not be identical to the image used to create the hOCR diff --git a/src/ocrmypdf/optimize.py b/src/ocrmypdf/optimize.py index 3f94bd62..264f6277 100644 --- a/src/ocrmypdf/optimize.py +++ b/src/ocrmypdf/optimize.py @@ -230,7 +230,7 @@ def extract_images( options, extract_fn: Callable[..., XrefExt | None], ) -> Iterator[tuple[int, XrefExt]]: - """Extract image using extract_fn + """Extract image using extract_fn. Enumerate images on each page, lookup their xref/ID number in the PDF. Exclude images that are soft masks (i.e. alpha transparency related). @@ -244,7 +244,6 @@ def extract_images( it does a tuple should be returned: (xref, ext) where .ext is the file extension. extract_fn must also extract the file it finds interesting. """ - include_xrefs: MutableSet[Xref] = set() exclude_xrefs: MutableSet[Xref] = set() pageno_for_xref = {} @@ -289,8 +288,7 @@ def extract_images( def extract_images_generic( pike: Pdf, root: Path, options ) -> tuple[list[Xref], list[Xref]]: - """Extract any >=2bpp image we think we can improve""" - + """Extract any >=2bpp image we think we can improve.""" jpegs = [] pngs = [] for _, xref_ext in extract_images(pike, root, options, extract_image_generic): @@ -304,8 +302,7 @@ def extract_images_generic( def extract_images_jbig2(pike: Pdf, root: Path, options) -> dict[int, list[XrefExt]]: - """Extract any bitonal image that we think we can improve as JBIG2""" - + """Extract any bitonal image that we think we can improve as JBIG2.""" jbig2_groups = defaultdict(list) for pageno, xref_ext in extract_images(pike, root, options, extract_image_jbig2): group = pageno // options.jbig2_page_group_size @@ -318,7 +315,7 @@ def extract_images_jbig2(pike: Pdf, root: Path, options) -> dict[int, list[XrefE def _produce_jbig2_images( jbig2_groups: dict[int, list[XrefExt]], root: Path, options, executor: Executor ) -> None: - """Produce JBIG2 images from their groups""" + """Produce JBIG2 images from their groups.""" def jbig2_group_args(root: Path, groups: dict[int, list[XrefExt]]): for group, xref_exts in groups.items(): @@ -674,7 +671,7 @@ def main(infile, outfile, level, jobs=1): from tempfile import TemporaryDirectory # pylint: disable=import-outside-toplevel class OptimizeOptions: - """Emulate ocrmypdf's options""" + """Emulate ocrmypdf's options.""" def __init__( self, input_file, jobs, optimize_, jpeg_quality, png_quality, jb2lossy diff --git a/src/ocrmypdf/pdfa.py b/src/ocrmypdf/pdfa.py index 58637f4a..199c02f1 100644 --- a/src/ocrmypdf/pdfa.py +++ b/src/ocrmypdf/pdfa.py @@ -1,9 +1,7 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MPL-2.0 -""" -Utilities for PDF/A production and confirmation with Ghostspcript. -""" +"""Utilities for PDF/A production and confirmation with Ghostspcript.""" from __future__ import annotations @@ -75,7 +73,7 @@ def _make_postscript(icc_name: str, icc_data: bytes, colors: int) -> Iterator[st def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'): - """Create a Postscript PDFMARK file for Ghostscript PDF/A conversion + """Create a Postscript PDFMARK file for Ghostscript PDF/A conversion. pdfmark is an extension to the Postscript language that describes some PDF features like bookmarks and annotations. It was originally specified Adobe @@ -118,7 +116,6 @@ def file_claims_pdfa(filename: Path): This only checks if the XMP metadata contains a PDF/A marker. It does not do full PDF/A validation. """ - with pikepdf.open(filename) as pdf: pdfmeta = pdf.open_metadata() if not pdfmeta.pdfa_status: diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py index 76602e6f..5e1632fb 100644 --- a/src/ocrmypdf/pdfinfo/info.py +++ b/src/ocrmypdf/pdfinfo/info.py @@ -164,7 +164,7 @@ class TextMarker: def _normalize_stack(graphobjs): - """Convert runs of qQ's in the stack into single graphobjs""" + """Convert runs of qQ's in the stack into single graphobjs.""" for operands, operator in graphobjs: operator = str(operator) if re.match(r'Q*q+$', operator): # Zero or more Q, one or more q @@ -200,7 +200,6 @@ def _interpret_contents(contentstream: Object, initial_shorthand=UNIT_SQUARE): undefined in the spec, but we just pretend nothing happened and leave the CTM unchanged. """ - stack = [] ctm = PdfMatrix(initial_shorthand) xobject_settings: list[XobjectSettings] = [] @@ -307,7 +306,6 @@ def _get_dpi(ctm_shorthand, image_size) -> Resolution: /MediaBox. """ - a, b, c, d, _, _ = ctm_shorthand # pylint: disable=invalid-name # Calculate the width and height of the image in PDF units @@ -451,8 +449,7 @@ class ImageInfo: def _find_inline_images(contentsinfo: ContentsInfo) -> Iterator[ImageInfo]: - "Find inline images in the contentstream" - + "Find inline images in the contentstream." for n, inline in enumerate(contentsinfo.inline_images): yield ImageInfo( name=f'inline-{n:02d}', shorthand=inline.shorthand, inline=inline.iimage @@ -460,7 +457,7 @@ def _find_inline_images(contentsinfo: ContentsInfo) -> Iterator[ImageInfo]: def _image_xobjects(container) -> Iterator[tuple[Object, str]]: - """Search for all XObject-based images in the container + """Search for all XObject-based images in the container. Usually the container is a page, but it could also be a Form XObject that contains images. Filter out the Form XObjects which are dealt with @@ -471,7 +468,6 @@ def _image_xobjects(container) -> Iterator[tuple[Object, str]]: since the object does not know its own name. """ - if '/Resources' not in container: return resources = container['/Resources'] @@ -488,14 +484,13 @@ def _image_xobjects(container) -> Iterator[tuple[Object, str]]: def _find_regular_images( container: Object, contentsinfo: ContentsInfo ) -> Iterator[ImageInfo]: - """Find images stored in the container's /Resources /XObject + """Find images stored in the container's /Resources /XObject. Usually the container is a page, but it could also be a Form XObject that contains images. Generates images with their DPI at time of drawing. """ - for pdfimage, xobj in _image_xobjects(container): if xobj not in contentsinfo.name_index: continue @@ -512,7 +507,7 @@ def _find_regular_images( def _find_form_xobject_images(pdf: Pdf, container: Object, contentsinfo: ContentsInfo): - """Find any images that are in Form XObjects in the container + """Find any images that are in Form XObjects in the container. The container may be a page, or a parent Form XObject. @@ -546,7 +541,7 @@ def _find_form_xobject_images(pdf: Pdf, container: Object, contentsinfo: Content def _process_content_streams( *, pdf: Pdf, container: Object, shorthand=None ) -> Iterator[VectorMarker | TextMarker | ImageInfo]: - """Find all individual instances of images drawn in the container + """Find all individual instances of images drawn in the container. Usually the container is a page, but it may also be a Form XObject. @@ -563,7 +558,6 @@ def _process_content_streams( downsampling. """ - if container.get('/Type') == '/Page' and '/Contents' in container: initial_shorthand = shorthand or UNIT_SQUARE elif container.get('/Type') == '/XObject' and container['/Subtype'] == '/Form': @@ -595,8 +589,7 @@ def _process_content_streams( def _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) -> bool: - """Smarter text detection that ignores text in margins""" - + """Smarter text detection that ignores text in margins.""" pw, ph = float(page_width), float(page_height) # pylint: disable=invalid-name margin_ratio = 0.125 @@ -608,10 +601,9 @@ def _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) -> ) def rects_intersect(a: FloatRect, b: FloatRect) -> bool: - """ - Where (a,b) are 4-tuple rects (left-0, top-1, right-2, bottom-3) + """Where (a,b) are 4-tuple rects (left-0, top-1, right-2, bottom-3) https://stackoverflow.com/questions/306316/determine-if-two-rectangles-overlap-each-other - Formula assumes all boxes are in first quadrant + Formula assumes all boxes are in first quadrant. """ return a[0] < b[2] and a[2] > b[0] and a[1] > b[3] and a[3] < b[1] @@ -624,7 +616,7 @@ def _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) -> def simplify_textboxes(miner, textbox_getter) -> Iterator[TextboxInfo]: - """Extract only limited content from text boxes + """Extract only limited content from text boxes. We do this to save memory and ensure that our objects are pickleable. """ @@ -910,7 +902,7 @@ DEFAULT_EXECUTOR = SerialExecutor() class PdfInfo: - """Get summary information about a PDF""" + """Get summary information about a PDF.""" def __init__( self, diff --git a/src/ocrmypdf/pdfinfo/layout.py b/src/ocrmypdf/pdfinfo/layout.py index d586fbfe..b240a32c 100644 --- a/src/ocrmypdf/pdfinfo/layout.py +++ b/src/ocrmypdf/pdfinfo/layout.py @@ -63,7 +63,7 @@ def pdftype3font__pscript5_get_ascent(self): class LTStateAwareChar(LTChar): - """A subclass of LTChar that tracks text render mode at time of drawing""" + """A subclass of LTChar that tracks text render mode at time of drawing.""" __slots__ = ( 'rendermode', @@ -111,7 +111,7 @@ class LTStateAwareChar(LTChar): self.rendermode = textstate.render def is_compatible(self, obj): - """Check if characters can be combined into a textline + """Check if characters can be combined into a textline. We consider characters compatible if: - the Unicode mapping is known, and both have the same render mode @@ -146,7 +146,7 @@ class LTStateAwareChar(LTChar): class TextPositionTracker(PDFLayoutAnalyzer): - """A page layout analyzer that pays attention to text visibility""" + """A page layout analyzer that pays attention to text visibility.""" def __init__(self, rsrcmgr, pageno=1, laparams=None): super().__init__(rsrcmgr, pageno, laparams) diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py index afb46194..acbbc07a 100644 --- a/src/ocrmypdf/pluginspec.py +++ b/src/ocrmypdf/pluginspec.py @@ -99,6 +99,8 @@ def check_options(options: Namespace) -> None: ocrmypdf.exceptions.ExitCodeException: If options are not acceptable and the application should terminate gracefully with an informative message and error code. + + Note: This hook will be called from the main process, and may modify global state before child worker processes are forked. @@ -127,6 +129,8 @@ def get_executor(progressbar_class) -> Executor: Note: This hook will be called from the main process, and may modify global state before child worker processes are forked. + + Note: This is a :ref:`firstresult hook`. """ @@ -159,7 +163,6 @@ def get_progressbar_class(): Here is how OCRmyPDF will use the progress bar: Example: - pbar_class = pm.hook.get_progressbar_class() with pbar_class(**tqdm_kwargs) as pbar: ... @@ -181,6 +184,8 @@ def validate(pdfinfo: PdfInfo, options: Namespace) -> None: ocrmypdf.exceptions.ExitCodeException: If options or pdfinfo are not acceptable and the application should terminate gracefully with an informative message and error code. + + Note: This hook will be called from the main process, and may modify global state before child worker processes are forked. @@ -218,6 +223,8 @@ def rasterize_pdf_page( Note: This hook will be called from child processes. Modifying global state will not affect the main process or other child processes. + + Note: This is a :ref:`firstresult hook`. """ @@ -245,6 +252,8 @@ def filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image: Note: This hook will be called from child processes. Modifying global state will not affect the main process or other child processes. + + Note: This is a :ref:`firstresult hook`. """ @@ -281,6 +290,8 @@ def filter_page_image(page: PageContext, image_filename: Path) -> Path: Note: This hook will be called from child processes. Modifying global state will not affect the main process or other child processes. + + Note: This is a :ref:`firstresult hook`. """ @@ -323,6 +334,8 @@ def filter_pdf_page(page: PageContext, image_filename: Path, output_pdf: Path) - Note: This hook will be called from child processes. Modifying global state will not affect the main process or other child processes. + + Note: This is a :ref:`firstresult hook`. """ @@ -381,7 +394,8 @@ class OcrEngine(ABC): """Returns the set of all languages that are supported by the engine. Languages are typically given in 3-letter ISO 3166-1 codes, but actually - can be any value understood by the OCR engine.""" + can be any value understood by the OCR engine. + """ @staticmethod @abstractmethod @@ -474,7 +488,7 @@ def generate_pdfa( Note: This is a :ref:`firstresult hook`. - See also: + See Also: https://github.com/tqdm/tqdm """ diff --git a/src/ocrmypdf/quality.py b/src/ocrmypdf/quality.py index 8cc48249..0ceb718d 100644 --- a/src/ocrmypdf/quality.py +++ b/src/ocrmypdf/quality.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MPL-2.0 -"""Utilities to measure OCR quality""" +"""Utilities to measure OCR quality.""" from __future__ import annotations diff --git a/src/ocrmypdf/subprocess/__init__.py b/src/ocrmypdf/subprocess/__init__.py index 9b09f391..58e51ba2 100644 --- a/src/ocrmypdf/subprocess/__init__.py +++ b/src/ocrmypdf/subprocess/__init__.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MPL-2.0 -"""Wrappers to manage subprocess calls""" +"""Wrappers to manage subprocess calls.""" from __future__ import annotations @@ -34,7 +34,7 @@ def run( check: bool = False, **kwargs, ) -> CompletedProcess: - """Wrapper around :py:func:`subprocess.run` + """Wrapper around :py:func:`subprocess.run`. The main purpose of this wrapper is to log subprocess output in an orderly fashion that identifies the responsible subprocess. An additional @@ -141,7 +141,7 @@ def get_version( regex=r'(\d+(\.\d+)*)', env: OsEnviron | None = None, ) -> str: - """Get the version of the specified program + """Get the version of the specified program. Arguments: program: The program to version check. @@ -323,7 +323,6 @@ def check_external_program( version_parser: A class that should be used to parse and compare version numbers. Used when version numbers do not follow standard conventions. """ - try: found_version = version_checker() except (CalledProcessError, FileNotFoundError) as e: diff --git a/src/ocrmypdf/subprocess/_windows.py b/src/ocrmypdf/subprocess/_windows.py index 46f68931..b199427d 100644 --- a/src/ocrmypdf/subprocess/_windows.py +++ b/src/ocrmypdf/subprocess/_windows.py @@ -169,8 +169,7 @@ SHIMS = [ def fix_windows_args(program: str, args, env): - """Adjust our desired program and command line arguments for use on Windows""" - + """Adjust our desired program and command line arguments for use on Windows.""" # If we are running a .py on Windows, ensure we call it with this Python # (to support test suite shims) if program.lower().endswith('.py'): diff --git a/tests/conftest.py b/tests/conftest.py index d3b86061..893cfb34 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,12 +3,10 @@ from __future__ import annotations -import os import platform import sys from pathlib import Path -from subprocess import PIPE, CompletedProcess, run -from typing import List +from subprocess import CompletedProcess, run import pytest @@ -71,10 +69,13 @@ def outtxt(tmp_path) -> Path: @pytest.fixture(scope="function") def no_outpdf(tmp_path) -> Path: - """This just documents the fact that a test is not expected to produce + """Document fact that a test is not expected to produce output. + + This just documents the fact that a test is not expected to produce output. Unfortunately an assertion failure inside a test fixture produces an error rather than a test failure, so no testing is done. It's up to - the test to confirm that no output file was created.""" + the test to confirm that no output file was created. + """ return tmp_path / 'no_output.pdf' @@ -110,7 +111,6 @@ def run_ocrmypdf_api(input_file: Path, output_file: Path, *args) -> ExitCode: The return code must always be checked or the test may declare a failure to be pass. """ - api_args = [str(input_file), str(output_file)] + [ str(arg) for arg in args if arg is not None ] @@ -128,7 +128,6 @@ def run_ocrmypdf( If an exception is thrown this fact will be returned as part of the result text and return code rather than exception objects. """ - p_args = ( [sys.executable, '-m', 'ocrmypdf'] + [str(arg) for arg in args if arg is not None] diff --git a/tests/plugins/tesseract_badutf8.py b/tests/plugins/tesseract_badutf8.py index 93f14f06..9e1db8e4 100644 --- a/tests/plugins/tesseract_badutf8.py +++ b/tests/plugins/tesseract_badutf8.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MIT -"""Tesseract bad utf8 +"""Tesseract bad utf8. In some cases, some versions of Tesseract can output binary gibberish or data that is not UTF-8 compatible, so we are forced to check that we can convert it diff --git a/tests/plugins/tesseract_cache.py b/tests/plugins/tesseract_cache.py index eec8f4c9..368b8bb0 100644 --- a/tests/plugins/tesseract_cache.py +++ b/tests/plugins/tesseract_cache.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MIT -"""Cache output of tesseract to speed up test suite +"""Cache output of tesseract to speed up test suite. The cache is keyed by by the input test file The input arguments are slugged into a hideous filename that more or less represents them literally. Joined diff --git a/tests/plugins/tesseract_debug_rotate.py b/tests/plugins/tesseract_debug_rotate.py index 65655e2b..305ced83 100644 --- a/tests/plugins/tesseract_debug_rotate.py +++ b/tests/plugins/tesseract_debug_rotate.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MIT -"""Tesseract no-op/fixed rotate plugin +"""Tesseract no-op/fixed rotate plugin. To quickly run tests where getting OCR output is not necessary and we want to test the rotation pipeline. diff --git a/tests/plugins/tesseract_noop.py b/tests/plugins/tesseract_noop.py index 5bb66053..92ac8500 100644 --- a/tests/plugins/tesseract_noop.py +++ b/tests/plugins/tesseract_noop.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MIT -"""Tesseract no-op plugin +"""Tesseract no-op plugin. To quickly run tests where getting OCR output is not necessary. diff --git a/tests/test_check_pdf.py b/tests/test_check_pdf.py index 3127d1f2..816746f9 100644 --- a/tests/test_check_pdf.py +++ b/tests/test_check_pdf.py @@ -3,8 +3,6 @@ from __future__ import annotations -import pytest - from ocrmypdf.helpers import check_pdf diff --git a/tests/test_completion.py b/tests/test_completion.py index 74106473..aff017fd 100644 --- a/tests/test_completion.py +++ b/tests/test_completion.py @@ -4,7 +4,7 @@ from __future__ import annotations import os -from subprocess import PIPE, run +from subprocess import run import pytest diff --git a/tests/test_graft.py b/tests/test_graft.py index 8e73622f..5507aff1 100644 --- a/tests/test_graft.py +++ b/tests/test_graft.py @@ -6,7 +6,6 @@ from __future__ import annotations from unittest.mock import patch import pikepdf -import pytest import ocrmypdf diff --git a/tests/test_logging.py b/tests/test_logging.py index 7e03b081..896d1810 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -5,8 +5,6 @@ from __future__ import annotations import logging -import pytest - from ocrmypdf._sync import configure_debug_logging diff --git a/tests/test_main.py b/tests/test_main.py index 682f1a63..06038809 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -446,7 +446,7 @@ def test_linearized_pdf_and_indirect_object(resources, outpdf): def test_very_high_dpi(resources, outpdf): - "Checks for a Decimal quantize error with high DPI, etc" + "Checks for a Decimal quantize error with high DPI, etc." check_ocrmypdf( resources / '2400dpi.pdf', outpdf, diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 80b97c78..1adb60fd 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -231,17 +231,6 @@ def test_xml_metadata_preserved( 'pdf:keywords', ] acquired_properties = ['dc:format'] - might_change_properties = [ - 'dc:date', - 'pdf:pdfversion', - 'pdf:Producer', - 'xmp:CreateDate', - 'xmp:ModifyDate', - 'xmp:MetadataDate', - 'xmp:CreatorTool', - 'xmpMM:DocumentId', - 'xmpMM:DnstanceId', - ] # Cleanup messy data structure # Top level is key-value mapping of namespaces to keys under namespace, diff --git a/tests/test_quality.py b/tests/test_quality.py index af7acff6..56852727 100644 --- a/tests/test_quality.py +++ b/tests/test_quality.py @@ -3,8 +3,6 @@ from __future__ import annotations -import pytest - from ocrmypdf import quality as qual diff --git a/tests/test_rotation.py b/tests/test_rotation.py index 22ff1bde..ced1839e 100644 --- a/tests/test_rotation.py +++ b/tests/test_rotation.py @@ -11,12 +11,13 @@ from os import fspath import img2pdf import pikepdf import pytest +from PIL import Image, ImageChops +from reportlab.pdfgen.canvas import Canvas + from ocrmypdf._exec import ghostscript from ocrmypdf._plugin_manager import get_plugin_manager from ocrmypdf.helpers import IMG2PDF_KWARGS, Resolution from ocrmypdf.pdfinfo import PdfInfo -from PIL import Image, ImageChops -from reportlab.pdfgen.canvas import Canvas from .conftest import check_ocrmypdf, run_ocrmypdf @@ -152,13 +153,14 @@ def test_autorotate_threshold(threshold, op, comparison_threshold, resources, ou def test_rotated_skew_timeout(resources, outpdf): - """This document contains an image that is rotated 90 into place with a + """Check rotated skew timeout. + + This document contains an image that is rotated 90 into place with a /Rotate tag and intentionally skewed by altering the transformation matrix. This tests for a bug where the combination of preprocessing and a tesseract timeout produced a page whose dimensions did not match the original's. """ - input_file = resources / 'rotated_skew.pdf' in_pageinfo = PdfInfo(input_file)[0] diff --git a/tests/test_stdio.py b/tests/test_stdio.py index 577ec607..6ff07bc7 100644 --- a/tests/test_stdio.py +++ b/tests/test_stdio.py @@ -4,13 +4,10 @@ from __future__ import annotations import os -import sys -from pathlib import Path -from subprocess import DEVNULL, PIPE, Popen, run +from subprocess import DEVNULL, PIPE, run import pytest -from ocrmypdf.exceptions import ExitCode from ocrmypdf.helpers import check_pdf from .conftest import run_ocrmypdf diff --git a/tests/test_unpaper.py b/tests/test_unpaper.py index ad6df549..7009698c 100644 --- a/tests/test_unpaper.py +++ b/tests/test_unpaper.py @@ -8,14 +8,13 @@ from os import fspath from unittest.mock import patch import pytest -from PIL import Image from ocrmypdf._exec import unpaper from ocrmypdf._plugin_manager import get_parser_options_plugins from ocrmypdf._validation import check_options from ocrmypdf.exceptions import ExitCode, MissingDependencyError -from .conftest import check_ocrmypdf, have_unpaper, ocrmypdf_exec, run_ocrmypdf +from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf # pylint: disable=redefined-outer-name