diff --git a/pyproject.toml b/pyproject.toml index 0722fe2b..a933a4fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -99,3 +99,8 @@ module = [ 'libxmp.utils' ] ignore_missing_imports = true + +[tool.pylint.basic] +good-names = ["i", "j", "k", "ex", "Run", "_", "e", "p", "im", "w", "h", "m", "x", "y", "a", "b", "fp", "n", "f", "s", "v", "q", "dx", "dy"] +logging-format-style = "old" +disable = ["raw-checker-failed", "bad-inline-option", "locally-disabled", "file-ignored", "suppressed-message", "useless-suppression", "deprecated-pragma", "use-symbolic-message-instead", "logging-fstring-interpolation", "missing-function-docstring", "too-few-public-methods"] diff --git a/setup.py b/setup.py index d9e7048e..772b4bfa 100644 --- a/setup.py +++ b/setup.py @@ -4,6 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""setup.py to support older setuptools and pip.""" from setuptools import setup diff --git a/src/ocrmypdf/__init__.py b/src/ocrmypdf/__init__.py index 5e566389..a8e5911e 100644 --- a/src/ocrmypdf/__init__.py +++ b/src/ocrmypdf/__init__.py @@ -4,6 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""Adds OCR layer to PDFs.""" from pluggy import HookimplMarker as _HookimplMarker diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py index c5948e95..ce4612b6 100755 --- a/src/ocrmypdf/__main__.py +++ b/src/ocrmypdf/__main__.py @@ -5,6 +5,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""ocrmypdf command line entrypoint.""" import logging import os diff --git a/src/ocrmypdf/_concurrent.py b/src/ocrmypdf/_concurrent.py index af505eb3..5918d2cb 100644 --- a/src/ocrmypdf/_concurrent.py +++ b/src/ocrmypdf/_concurrent.py @@ -4,6 +4,8 @@ # License, v. 2.0. 
If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""OCRmyPDF concurrency abstractions.""" + import threading from abc import ABC, abstractmethod from typing import Callable, Iterable, Optional @@ -14,6 +16,8 @@ def _task_noop(*_args, **_kwargs): class NullProgressBar: + """Progress bar API that takes no actions.""" + def __init__(self, **kwargs): pass @@ -28,6 +32,8 @@ class NullProgressBar: class Executor(ABC): + """Abstract concurrent executor.""" + pool_lock = threading.Lock() pbar_class = NullProgressBar diff --git a/src/ocrmypdf/_exec/ghostscript.py b/src/ocrmypdf/_exec/ghostscript.py index 111ccf42..4da65483 100644 --- a/src/ocrmypdf/_exec/ghostscript.py +++ b/src/ocrmypdf/_exec/ghostscript.py @@ -14,13 +14,12 @@ import sys from io import BytesIO from os import fspath from pathlib import Path -from shutil import which from subprocess import PIPE, CalledProcessError from typing import Optional from PIL import Image, UnidentifiedImageError -from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError +from ocrmypdf.exceptions import SubprocessOutputError from ocrmypdf.helpers import Resolution from ocrmypdf.subprocess import get_version, run, run_polling_stderr @@ -33,29 +32,18 @@ except AttributeError: log = logging.getLogger(__name__) -missing_gs_error = """ ---------------------------------------------------------------------- -This error normally occurs when ocrmypdf find can't Ghostscript. -Please ensure Ghostscript is installed and its location is added to -the system PATH environment variable. 
- -For details see: - https://ocrmypdf.readthedocs.io/en/latest/installation.html ---------------------------------------------------------------------- -""" - # Most reliable what to get the bitness of Python interpreter, according to Python docs -_is_64bit = sys.maxsize > 2**32 +_IS_64BIT = sys.maxsize > 2**32 -_gswin = None +_GSWIN = None if os.name == 'nt': - if _is_64bit: - _gswin = 'gswin64c' + if _IS_64BIT: + _GSWIN = 'gswin64c' else: - _gswin = 'gswin32c' + _GSWIN = 'gswin32c' -GS = _gswin if _gswin else 'gs' -del _gswin +GS = _GSWIN if _GSWIN else 'gs' +del _GSWIN def version(): @@ -126,7 +114,7 @@ def rasterize_pdf( p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True) except CalledProcessError as e: log.error(e.stderr.decode(errors='replace')) - raise SubprocessOutputError('Ghostscript rasterizing failed') + raise SubprocessOutputError('Ghostscript rasterizing failed') from e else: stderr = p.stderr.decode(errors='replace') if _gs_error_reported(stderr): @@ -156,6 +144,8 @@ def rasterize_pdf( class GhostscriptFollower: + """Parses the output of Ghostscript and uses it to update the progress bar.""" + re_process = re.compile(r"Processing pages \d+ through (\d+).") re_page = re.compile(r"Page (\d+)") diff --git a/src/ocrmypdf/_exec/tesseract.py b/src/ocrmypdf/_exec/tesseract.py index 0e032026..01177cac 100644 --- a/src/ocrmypdf/_exec/tesseract.py +++ b/src/ocrmypdf/_exec/tesseract.py @@ -13,7 +13,7 @@ from math import pi from os import fspath from pathlib import Path from subprocess import PIPE, STDOUT, CalledProcessError, TimeoutExpired -from typing import Dict, Iterator, List, Optional +from typing import Dict, List, Optional from packaging.version import Version from PIL import Image @@ -55,6 +55,8 @@ TESSERACT_THRESHOLDING_METHODS: Dict[str, int] = { class TesseractLoggerAdapter(logging.LoggerAdapter): + "Prepend [tesseract] to messages emitted from tesseract" + def process(self, msg, kwargs): kwargs['extra'] = self.extra return f'[tesseract] {msg}', 
kwargs @@ -105,6 +107,7 @@ TESSERACT_VERSION_PATTERN = r""" class TesseractVersion(Version): + "Modify standard packaging.Version regex to support Tesseract idiosyncrasies." _regex = re.compile( r"^\s*" + TESSERACT_VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE ) @@ -169,14 +172,14 @@ def tess_base_args(langs: List[str], engine_mode: Optional[int]) -> List[str]: def _parse_tesseract_output(binary_output: bytes) -> Dict[str, str]: - def g(): + def gen(): for line in binary_output.decode().splitlines(): line = line.strip() parts = line.split(':', maxsplit=2) if len(parts) == 2: yield parts[0].strip(), parts[1].strip() - return {k: v for k, v in g()} + return dict(gen()) def get_orientation( @@ -205,10 +208,10 @@ def get_orientation( osd = _parse_tesseract_output(p.stdout) angle = int(osd.get('Orientation in degrees', 0)) - oc = OrientationConfidence( + orient_conf = OrientationConfidence( angle=angle, confidence=float(osd.get('Orientation confidence', 0)) ) - return oc + return orient_conf def get_deskew( diff --git a/src/ocrmypdf/_exec/unpaper.py b/src/ocrmypdf/_exec/unpaper.py index 9c2c8821..479959ef 100644 --- a/src/ocrmypdf/_exec/unpaper.py +++ b/src/ocrmypdf/_exec/unpaper.py @@ -30,12 +30,16 @@ if sys.version_info >= (3, 10): else: from tempfile import TemporaryDirectory as _TemporaryDirectory - # Consume the ignore_cleanup_errors kwarg in Python 3.9 and older, without acting - # on this keyword. Users who need this issue full resolved should upgrade to Python - # 3.10. - # See: https://github.com/python/cpython/pull/24793 - class TemporaryDirectory(_TemporaryDirectory): + """Shim to consume ignore_cleanup_errors kwarg on Python 3.9 and older. + + The argument is consumed without action. If users are getting errors related + to temporary file cleanup, they should upgrade to Python 3.10 which properly + cleans up temporary directories on Windows.
+ + See: https://github.com/python/cpython/pull/24793 + """ + def __init__(self, ignore_cleanup_errors=False, **kwargs): super().__init__(**kwargs) @@ -50,6 +54,8 @@ log = logging.getLogger(__name__) class UnpaperImageTooLargeError(Exception): + """To capture details when an image is too large for unpaper.""" + def __init__( self, w, @@ -66,8 +72,10 @@ def version() -> str: return get_version('unpaper') +SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'} + + def _convert_image(im: Image.Image) -> Tuple[Image.Image, bool, str]: - SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'} im_modified = False if im.mode not in SUFFIXES: diff --git a/src/ocrmypdf/_graft.py b/src/ocrmypdf/_graft.py index 29369280..8c9b3c32 100644 --- a/src/ocrmypdf/_graft.py +++ b/src/ocrmypdf/_graft.py @@ -4,6 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""For grafting text-only PDF pages onto freeform PDF pages.""" import logging import uuid @@ -16,7 +17,6 @@ from pikepdf import ( Name, Object, Operator, - Page, Pdf, PdfError, PdfMatrix, @@ -81,6 +81,8 @@ def strip_invisible_text(pdf, page): class OcrGrafter: + """Manages grafting text-only PDFs onto regular PDFs.""" + def __init__(self, context): self.context = context self.path_base = context.origin @@ -236,6 +238,8 @@ class OcrGrafter: ): """Insert the text layer from text page 0 on to pdf_base at page_num""" + # pylint: disable=invalid-name + log.debug("Grafting") if Path(textpdf).stat().st_size == 0: return diff --git a/src/ocrmypdf/_jobcontext.py b/src/ocrmypdf/_jobcontext.py index 11f04cf2..36d51309 100644 --- a/src/ocrmypdf/_jobcontext.py +++ b/src/ocrmypdf/_jobcontext.py @@ -4,6 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+"""Defines context objects that are passed to child processes/threads.""" import os import shutil diff --git a/src/ocrmypdf/_logging.py b/src/ocrmypdf/_logging.py index 1aac857f..e1767b5c 100644 --- a/src/ocrmypdf/_logging.py +++ b/src/ocrmypdf/_logging.py @@ -4,15 +4,17 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""Logging support classes.""" import logging -import sys from contextlib import suppress from tqdm import tqdm class PageNumberFilter(logging.Filter): + """Insert PDF page number that emitted log message to log record.""" + def filter(self, record): pageno = getattr(record, 'pageno', None) if isinstance(pageno, int): diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index f0c690ca..70ab7660 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -4,6 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+"""OCRmyPDF page processing pipeline functions.""" import logging import os @@ -332,7 +333,8 @@ def is_ocr_required(page_context: PageContext): ocr_required = False log.warning( "page too big, skipping OCR " - f"({(pixel_count / 1_000_000):.1f} MPixels > {options.skip_big:.1f} MPixels --skip-big)" + f"({(pixel_count / 1_000_000):.1f} MPixels > " + f"{options.skip_big:.1f} MPixels --skip-big)" ) return ocr_required @@ -430,8 +432,8 @@ def rasterize( output_file = page_context.get_path(f'rasterize{output_tag}.png') pageinfo = page_context.pageinfo - def at_least(cs): - return max(device_idx, colorspaces.index(cs)) + def at_least(colorspace): + return max(device_idx, colorspaces.index(colorspace)) for image in pageinfo.images: if image.type_ != 'image': @@ -471,10 +473,10 @@ def rasterize( def preprocess_remove_background(input_file: Path, page_context: PageContext): if any(image.bpc > 1 for image in page_context.pageinfo.images): - output_file = page_context.get_path('pp_rm_bg.png') - # leptonica.remove_background(input_file, output_file) raise NotImplementedError("--remove-background is temporarily not implemented") - return output_file + # output_file = page_context.get_path('pp_rm_bg.png') + # leptonica.remove_background(input_file, output_file) + # return output_file else: log.info("background removal skipped on mono page") return input_file @@ -858,8 +860,8 @@ def enumerate_compress_ranges(iterable): def merge_sidecars(txt_files: Iterable[Optional[Path]], context: PdfContext): output_file = context.get_path('sidecar.txt') with open(output_file, 'w', encoding="utf-8") as stream: - for (frm, to), txt_file in enumerate_compress_ranges(txt_files): - if frm != 1: + for (from_, to_), txt_file in enumerate_compress_ranges(txt_files): + if from_ != 1: stream.write('\f') # Form feed between pages if txt_file: with open(txt_file, encoding="utf-8") as in_: @@ -872,10 +874,10 @@ def merge_sidecars(txt_files: Iterable[Optional[Path]], context: PdfContext): else: 
stream.write(txt) else: - if frm != to: - pages = f'{frm}-{to}' + if from_ != to_: + pages = f'{from_}-{to_}' else: - pages = f'{frm}' + pages = f'{from_}' stream.write(f'[OCR skipped on page(s) {pages}]') return output_file diff --git a/src/ocrmypdf/_plugin_manager.py b/src/ocrmypdf/_plugin_manager.py index 3e28048d..516adbaf 100644 --- a/src/ocrmypdf/_plugin_manager.py +++ b/src/ocrmypdf/_plugin_manager.py @@ -4,6 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""Plugin manager using pluggy.""" import argparse import importlib @@ -101,12 +102,11 @@ class OcrmypdfPluginManager(pluggy.PluginManager): def get_plugin_manager(plugins: List[Union[str, Path]], builtins=True): - pm = OcrmypdfPluginManager( + return OcrmypdfPluginManager( project_name='ocrmypdf', plugins=plugins, builtins=builtins, ) - return pm def get_parser_options_plugins( diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index 7958674c..93213434 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -4,6 +4,8 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""Implements the concurrent and page synchronous parts of the pipeline.""" + import argparse import logging @@ -68,7 +70,9 @@ from ocrmypdf.pdfa import file_claims_pdfa log = logging.getLogger(__name__) -class PageResult(NamedTuple): # pylint: disable=inherit-non-class +class PageResult(NamedTuple): + """Result when a page is finished processing.""" + pageno: int pdf_page_from_image: Optional[Path] ocr: Optional[Path] @@ -425,7 +429,7 @@ def run_pipeline( else: log.error(type(e).__name__) return e.exit_code - except (PIL.Image.DecompressionBombError if not api else NeverRaise) as e: + except (PIL.Image.DecompressionBombError if not api else NeverRaise): log.exception( "A decompression bomb error was encountered while executing the " "pipeline. 
Use the argument --max-image-mpixels to raise the maximum " @@ -435,7 +439,7 @@ def run_pipeline( except ( BrokenProcessPool if not api else NeverRaise, BrokenThreadPool if not api else NeverRaise, - ) as e: + ): log.exception( "A worker process was terminated unexpectedly. This is known to occur if " "processing your file takes all available swap space and RAM. It may " diff --git a/src/ocrmypdf/_validation.py b/src/ocrmypdf/_validation.py index 613e1c9f..3ddda46c 100644 --- a/src/ocrmypdf/_validation.py +++ b/src/ocrmypdf/_validation.py @@ -5,6 +5,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""Validate a work order from API or command line.""" import locale import logging @@ -25,7 +26,7 @@ from ocrmypdf.exceptions import ( MissingDependencyError, OutputFileAccessError, ) -from ocrmypdf.helpers import is_file_writable, monotonic, safe_symlink, samefile +from ocrmypdf.helpers import is_file_writable, monotonic, safe_symlink from ocrmypdf.hocrtransform import HOCR_OK_LANGS from ocrmypdf.subprocess import check_external_program @@ -146,13 +147,13 @@ def check_options_preprocessing(options): def _pages_from_ranges(ranges: str) -> Set[int]: pages: List[int] = [] page_groups = ranges.replace(' ', '').split(',') - for g in page_groups: - if not g: + for group in page_groups: + if not group: continue try: - start, end = g.split('-') + start, end = group.split('-') except ValueError: - pages.append(int(g) - 1) + pages.append(int(group) - 1) else: try: new_pages = list(range(int(start) - 1, int(end))) @@ -162,7 +163,7 @@ def _pages_from_ranges(ranges: str) -> Set[int]: ) from None pages.extend(new_pages) except ValueError: - raise BadArgsError(f"invalid page subrange '{g}'") from None + raise BadArgsError(f"invalid page subrange '{group}'") from None if not pages: raise BadArgsError( @@ -237,13 +238,13 @@ def check_options_advanced(options): def check_options_metadata(options): 
docinfo = [options.title, options.author, options.keywords, options.subject] for s in (m for m in docinfo if m): - for c in s: - if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000: + for char in s: + if unicodedata.category(char) == 'Co' or ord(char) >= 0x10000: + hexchar = hex(ord(char))[2:].upper() raise ValueError( "One of the metadata strings contains " - "an unsupported Unicode character: '{}' (U+{})".format( - c, hex(ord(c))[2:].upper() - ) + "an unsupported Unicode character: " + f"{char} (U+{hexchar})" ) @@ -293,7 +294,7 @@ def create_input_file(options, work_folder: Path) -> Tuple[Path, str]: target = work_folder / 'origin' safe_symlink(options.input_file, target) return target, os.fspath(options.input_file) - except FileNotFoundError: + except FileNotFoundError as e: msg = f"File not found - {options.input_file}" if Path('/.dockerenv').exists(): # pragma: no cover msg += ( @@ -304,7 +305,7 @@ def create_input_file(options, work_folder: Path) -> Tuple[Path, str]: "\n" "\tdocker run -i --rm jbarlow83/ocrmypdf - - output.pdf\n" ) - raise InputFileError(msg) + raise InputFileError(msg) from e def check_requested_output_file(options): @@ -324,7 +325,9 @@ def check_requested_output_file(options): ) -def report_output_file_size(options, input_file, output_file): +def report_output_file_size( + options, input_file, output_file, file_overhead=4000, page_overhead=3000 +): try: output_size = Path(output_file).stat().st_size input_size = Path(input_file).stat().st_size @@ -333,9 +336,7 @@ def report_output_file_size(options, input_file, output_file): with pikepdf.open(output_file) as p: # Overhead constants obtained by estimating amount of data added by OCR # PDF/A conversion, and possible XMP metadata addition, with compression - FILE_OVERHEAD = 4000 - OCR_PER_PAGE_OVERHEAD = 3000 - reasonable_overhead = FILE_OVERHEAD + OCR_PER_PAGE_OVERHEAD * len(p.pages) + reasonable_overhead = file_overhead + page_overhead * len(p.pages) ratio = output_size / input_size 
reasonable_ratio = output_size / (input_size + reasonable_overhead) if reasonable_ratio < 1.35 or input_size < 25000: diff --git a/src/ocrmypdf/_version.py b/src/ocrmypdf/_version.py index 3cf5ff8b..2925dbb4 100644 --- a/src/ocrmypdf/_version.py +++ b/src/ocrmypdf/_version.py @@ -4,6 +4,10 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""Get version by introspecting package information. + +OCRmyPDF uses setuptools_scm to derive version from git tags. +""" try: from importlib_metadata import version as _package_version diff --git a/src/ocrmypdf/builtin_plugins/concurrency.py b/src/ocrmypdf/builtin_plugins/concurrency.py index f5d92bcf..feb718f8 100644 --- a/src/ocrmypdf/builtin_plugins/concurrency.py +++ b/src/ocrmypdf/builtin_plugins/concurrency.py @@ -11,6 +11,8 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""OCRmyPDF's multiprocessing/multithreading abstraction layer.""" + import logging import logging.handlers import multiprocessing @@ -21,7 +23,6 @@ import sys import threading from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed from contextlib import suppress -from multiprocessing.pool import Pool, ThreadPool from typing import Callable, Iterable, Type, Union from tqdm import tqdm @@ -44,7 +45,8 @@ def log_listener(q: Queue): should actually write to sys.stderr or whatever we're using, so if this is made into a process the main application needs to be directed to it. 
- See https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes + See: + https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes """ while True: @@ -89,6 +91,8 @@ def process_init(q: Queue, user_init: UserInit, loglevel) -> None: def thread_init(q: Queue, user_init: UserInit, loglevel) -> None: + del q # unused but required argument + del loglevel # unused but required argument # As a thread, block SIGBUS so the main thread deals with it... with suppress(AttributeError): signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGBUS}) @@ -98,6 +102,8 @@ def thread_init(q: Queue, user_init: UserInit, loglevel) -> None: class StandardExecutor(Executor): + """Standard OCRmyPDF concurrent task executor.""" + def _execute( self, *, diff --git a/src/ocrmypdf/builtin_plugins/default_filters.py b/src/ocrmypdf/builtin_plugins/default_filters.py index da3f3aae..cd985a05 100644 --- a/src/ocrmypdf/builtin_plugins/default_filters.py +++ b/src/ocrmypdf/builtin_plugins/default_filters.py @@ -4,6 +4,8 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""OCRmyPDF automatically installs these filters as plugins.""" + from ocrmypdf import hookimpl diff --git a/src/ocrmypdf/builtin_plugins/ghostscript.py b/src/ocrmypdf/builtin_plugins/ghostscript.py index c16e5714..3d6fbbc7 100644 --- a/src/ocrmypdf/builtin_plugins/ghostscript.py +++ b/src/ocrmypdf/builtin_plugins/ghostscript.py @@ -5,6 +5,8 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+"""Built-in plugin to implement PDF page rasterization and PDF/A production.""" + import logging from ocrmypdf import hookimpl diff --git a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py index a0de9841..4bc9a58b 100644 --- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py +++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py @@ -4,6 +4,8 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""Built-in plugin to implement OCR using Tesseract.""" + import logging import os @@ -138,6 +140,8 @@ def validate(pdfinfo, options): class TesseractOcrEngine(OcrEngine): + """Implements OCR with Tesseract.""" + @staticmethod def version(): return tesseract.version() diff --git a/src/ocrmypdf/cli.py b/src/ocrmypdf/cli.py index 17f54e56..01bcb841 100644 --- a/src/ocrmypdf/cli.py +++ b/src/ocrmypdf/cli.py @@ -4,6 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""Command line interface customization and validation.""" import argparse from typing import Any, Callable, Mapping, Optional, TypeVar @@ -42,7 +43,7 @@ def str_to_int(mapping: Mapping[str, int]): except KeyError: raise argparse.ArgumentTypeError( f"{s!r} must be one of: {', '.join(mapping.keys())}" - ) + ) from None return _str_to_int @@ -51,6 +52,11 @@ class ArgumentParser(argparse.ArgumentParser): """Override parser's default behavior of calling sys.exit() https://stackoverflow.com/questions/5943249/python-argparse-and-controlling-overriding-the-exit-status-code + + OCRmyPDF began as a CLI but eventually acquired an API. The API works inside out, + by synthesizing a command line argument. So we subclass the standard parser with + one that doesn't call sys.exit(). Obviously this is not the ideal way to do things + but it works for us. 
""" def __init__(self, *args, **kwargs): @@ -65,6 +71,8 @@ class ArgumentParser(argparse.ArgumentParser): class LanguageSetAction(argparse.Action): + """Manages a list of languages.""" + def __init__(self, option_strings, dest, default=None, **kwargs): if default is None: default = set() diff --git a/src/ocrmypdf/exceptions.py b/src/ocrmypdf/exceptions.py index 5228b241..f4e8680f 100644 --- a/src/ocrmypdf/exceptions.py +++ b/src/ocrmypdf/exceptions.py @@ -4,12 +4,16 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""OCRmyPDF's exceptions.""" from enum import IntEnum from textwrap import dedent class ExitCode(IntEnum): + """OCRmyPDF's exit codes.""" + + # pylint: disable=invalid-name ok = 0 bad_args = 1 input_file = 2 @@ -26,6 +30,8 @@ class ExitCode(IntEnum): class ExitCodeException(Exception): + """An exception which should return an exit code with sys.exit().""" + exit_code = ExitCode.other_error message = "" @@ -37,17 +43,24 @@ class ExitCodeException(Exception): class BadArgsError(ExitCodeException): + """Invalid arguments on the command line or API.""" + exit_code = ExitCode.bad_args -class PdfMergeFailedError(ExitCodeException): +class PdfMergeFailedError(ExitCodeException): # deprecated + """An intermediate PDF can't be merged. + + No longer in use. + """ + exit_code = ExitCode.input_file message = dedent( '''\ Failed to merge PDF image layer with OCR layer Usually this happens because the input PDF file is malformed and - ocrmypdf cannot automatically correct the problem on its own. + ocrmypdf cannot correct the problem on its own. Try using ocrmypdf --pdf-renderer sandwich [..other args..] 
@@ -56,34 +69,50 @@ class PdfMergeFailedError(ExitCodeException): class MissingDependencyError(ExitCodeException): + """A third-party dependency is missing.""" + exit_code = ExitCode.missing_dependency class UnsupportedImageFormatError(ExitCodeException): + """The image format is not supported.""" + exit_code = ExitCode.input_file class DpiError(ExitCodeException): + """Missing information about input image DPI.""" + exit_code = ExitCode.input_file class OutputFileAccessError(ExitCodeException): + """Cannot access the intended output file path.""" + exit_code = ExitCode.file_access_error class PriorOcrFoundError(ExitCodeException): + """This file already has OCR.""" + exit_code = ExitCode.already_done_ocr class InputFileError(ExitCodeException): + """Something is wrong with the input file.""" + exit_code = ExitCode.input_file class SubprocessOutputError(ExitCodeException): + """A subprocess returned an unexpected error.""" + exit_code = ExitCode.child_process_error class EncryptedPdfError(ExitCodeException): + """Input PDF is encrypted.""" + exit_code = ExitCode.encrypted_pdf message = dedent( '''\ @@ -100,5 +129,7 @@ class EncryptedPdfError(ExitCodeException): class TesseractConfigError(ExitCodeException): + """Tesseract config can't be parsed.""" + exit_code = ExitCode.invalid_config message = "Error occurred while parsing a Tesseract configuration file" diff --git a/src/ocrmypdf/extra_plugins/semfree.py b/src/ocrmypdf/extra_plugins/semfree.py index 186206d9..f7ecc081 100644 --- a/src/ocrmypdf/extra_plugins/semfree.py +++ b/src/ocrmypdf/extra_plugins/semfree.py @@ -37,9 +37,11 @@ from ocrmypdf.helpers import remove_all_log_handlers class MessageType(Enum): - exception = auto() - result = auto() - complete = auto() + """Implement basic IPC messaging.""" + + exception = auto() # pylint: disable=invalid-name + result = auto() # pylint: disable=invalid-name + complete = auto() # pylint: disable=invalid-name def split_every(n: int, iterable: Iterable) -> Iterator: @@ 
-59,6 +61,8 @@ def process_sigbus(*args): class ConnectionLogHandler(logging.handlers.QueueHandler): + """Handler used by child processes to forward log messages to parent.""" + def __init__(self, conn: Connection) -> None: # sets the parent's queue to None - parent only touches queue # in enqueue() which we override @@ -91,7 +95,7 @@ def process_loop( for args in task_args: try: result = task(args) - except Exception as e: + except Exception as e: # pylint: disable=broad-except conn.send((MessageType.exception, e)) break else: @@ -103,6 +107,8 @@ def process_loop( class LambdaExecutor(Executor): + """Executor for AWS Lambda or similar environments that lack semaphores.""" + def _execute( self, *, @@ -153,13 +159,13 @@ class LambdaExecutor(Executor): with self.pbar_class(**tqdm_kwargs) as pbar: while connections: - for r in wait(connections): - if not isinstance(r, Connection): + for result in wait(connections): + if not isinstance(result, Connection): raise NotImplementedError("We only support Connection()") try: - msg_type, msg = r.recv() + msg_type, msg = result.recv() except EOFError: - connections.remove(r) + connections.remove(result) continue if msg_type == MessageType.result: @@ -170,7 +176,7 @@ class LambdaExecutor(Executor): logger = logging.getLogger(record.name) logger.handle(record) elif msg_type == MessageType.complete: - connections.remove(r) + connections.remove(result) elif msg_type == MessageType.exception: for process in processes: process.terminate() diff --git a/src/ocrmypdf/helpers.py b/src/ocrmypdf/helpers.py index 2169e64a..6e8c224b 100644 --- a/src/ocrmypdf/helpers.py +++ b/src/ocrmypdf/helpers.py @@ -4,6 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+"""Support functions.""" import logging import multiprocessing @@ -137,11 +138,11 @@ def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike): os.symlink(os.path.abspath(input_file), soft_link_name) -def samefile(f1: os.PathLike, f2: os.PathLike): +def samefile(file1: os.PathLike, file2: os.PathLike): if os.name == 'nt': - return f1 == f2 + return file1 == file2 else: - return os.path.samefile(f1, f2) + return os.path.samefile(file1, file2) def is_iterable_notstr(thing: Any) -> bool: @@ -149,9 +150,9 @@ def is_iterable_notstr(thing: Any) -> bool: return isinstance(thing, Iterable) and not isinstance(thing, str) -def monotonic(L: Sequence) -> bool: +def monotonic(seq: Sequence) -> bool: """Does this sequence increase monotonically?""" - return all(b > a for a, b in zip(L, L[1:])) + return all(b > a for a, b in zip(seq, seq[1:])) def page_number(input_file: os.PathLike) -> int: @@ -166,7 +167,7 @@ def available_cpu_count() -> int: except NotImplementedError: pass warnings.warn( - "Could not get CPU count. Assuming one (1) CPU." "Use -j N to set manually." + "Could not get CPU count. Assuming one (1) CPU. Use -j N to set manually." ) return 1 @@ -190,16 +191,16 @@ def is_file_writable(test_file: os.PathLike) -> bool: os.W_OK, effective_ids=(os.access in os.supports_effective_ids), ) + + try: + fp = p.open('wb') + except OSError: + return False else: - try: - fp = p.open('wb') - except OSError: - return False - else: - fp.close() - with suppress(OSError): - p.unlink() - return True + fp.close() + with suppress(OSError): + p.unlink() + return True except (OSError, RuntimeError) as e: log.debug(e) log.error(str(e)) diff --git a/src/ocrmypdf/hocrtransform.py b/src/ocrmypdf/hocrtransform.py index 90895b4b..d2bae3bc 100755 --- a/src/ocrmypdf/hocrtransform.py +++ b/src/ocrmypdf/hocrtransform.py @@ -28,6 +28,8 @@ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+"""Transform .hocr and page image to text PDF.""" + import argparse import os import re @@ -99,7 +101,7 @@ HOCR_OK_LANGS = frozenset( Element = ElementTree.Element -class Rect(NamedTuple): # pylint: disable=inherit-non-class +class Rect(NamedTuple): """A rectangle for managing PDF coordinates.""" x1: Any @@ -109,7 +111,7 @@ class Rect(NamedTuple): # pylint: disable=inherit-non-class class HocrTransformError(Exception): - pass + """Error while applying hOCR transform.""" class HocrTransform: @@ -287,7 +289,7 @@ class HocrTransform: continue pxl_coords = self.element_coordinates(elem) - pt = self.pt_from_pixel(pxl_coords) + pt = self.pt_from_pixel(pxl_coords) # pylint: disable=invalid-name # draw the bbox border if show_bounding_boxes: # pragma: no cover diff --git a/src/ocrmypdf/optimize.py b/src/ocrmypdf/optimize.py index 10cc002d..1b23af8b 100644 --- a/src/ocrmypdf/optimize.py +++ b/src/ocrmypdf/optimize.py @@ -4,6 +4,8 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+"""Post-processing image optimization of OCR PDFs.""" + import logging import sys @@ -55,7 +57,9 @@ DEFAULT_PNG_QUALITY = 70 Xref = NewType('Xref', int) -class XrefExt(NamedTuple): # pylint: disable=inherit-non-class +class XrefExt(NamedTuple): + """A PDF xref and image extension pair.""" + xref: Xref ext: str @@ -466,7 +470,7 @@ def _find_deflatable_jpeg( result = extract_image_filter(pike, root, image, xref) if result is None: return None - pim, filtdp = result + _pim, filtdp = result if filtdp[0] == Name.DCTDecode and not filtdp[1] and options.optimize >= 1: return XrefExt(xref, '.memory') @@ -707,9 +711,9 @@ def main(infile, outfile, level, jobs=1): jb2lossy=False, ) - with TemporaryDirectory() as td: - context = PdfContext(options, td, infile, None, None) - tmpout = Path(td) / 'out.pdf' + with TemporaryDirectory() as tmpdir: + context = PdfContext(options, tmpdir, infile, None, None) + tmpout = Path(tmpdir) / 'out.pdf' optimize( infile, tmpout, diff --git a/src/ocrmypdf/pdfa.py b/src/ocrmypdf/pdfa.py index 68b6f39a..23ef7ab9 100644 --- a/src/ocrmypdf/pdfa.py +++ b/src/ocrmypdf/pdfa.py @@ -97,7 +97,8 @@ def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'): target_filename: filename to save icc: ICC identifier such as 'sRGB' References: - Adobe PDFMARK Reference: https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdfmark_reference.pdf + Adobe PDFMARK Reference: + https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdfmark_reference.pdf """ if icc != 'sRGB': raise NotImplementedError("Only supporting sRGB") @@ -105,11 +106,11 @@ def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'): bytes_icc_profile = ( package_files('ocrmypdf.data') / SRGB_ICC_PROFILE_NAME ).read_bytes() - ps = '\n'.join(_make_postscript(icc, bytes_icc_profile, 3)) + postscript = '\n'.join(_make_postscript(icc, bytes_icc_profile, 3)) # We should have encoded everything to pure ASCII by this point, and # to be safe, only allow ASCII in PostScript - 
Path(target_filename).write_text(ps, encoding='ascii') + Path(target_filename).write_text(postscript, encoding='ascii') return target_filename diff --git a/src/ocrmypdf/pdfinfo/__init__.py b/src/ocrmypdf/pdfinfo/__init__.py index 2c9a1be9..34a1d6c6 100644 --- a/src/ocrmypdf/pdfinfo/__init__.py +++ b/src/ocrmypdf/pdfinfo/__init__.py @@ -6,4 +6,6 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""For extracting information about PDFs prior to OCR.""" + from ocrmypdf.pdfinfo.info import Colorspace, Encoding, PdfInfo diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py index d39085dc..132f9033 100644 --- a/src/ocrmypdf/pdfinfo/info.py +++ b/src/ocrmypdf/pdfinfo/info.py @@ -6,13 +6,15 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. +"""Extract information about the content of a PDF.""" + import atexit import logging import re from collections import defaultdict from contextlib import ExitStack from decimal import Decimal -from enum import Enum +from enum import Enum, auto from functools import partial from math import hypot, inf, isclose from os import PathLike @@ -20,6 +22,7 @@ from pathlib import Path from typing import ( Container, Dict, + Iterable, Iterator, List, Mapping, @@ -48,11 +51,39 @@ from ocrmypdf.pdfinfo.layout import get_page_analysis, get_text_boxes logger = logging.getLogger() -Colorspace = Enum('Colorspace', 'gray rgb cmyk lab icc index sep devn pattern jpeg2000') -Encoding = Enum( - 'Encoding', 'ccitt jpeg jpeg2000 jbig2 asciihex ascii85 lzw flate runlength' -) +class Colorspace(Enum): + """Description of common image colorspaces in a PDF.""" + + # pylint: disable=invalid-name + gray = auto() + rgb = auto() + cmyk = auto() + lab = auto() + icc = auto() + index = auto() + sep = auto() + devn = auto() + pattern = auto() + jpeg2000 = auto() + + +class Encoding(Enum): + """Description of common image encodings in a PDF.""" + + # pylint: disable=invalid-name + ccitt = auto() + jpeg = auto() + jpeg2000 = 
auto() + jbig2 = auto() + asciihex = auto() + ascii85 = auto() + lzw = auto() + flate = auto() + runlength = auto() + + +FloatRect = Tuple[float, float, float, float] FRIENDLY_COLORSPACE: Dict[str, Colorspace] = { '/DeviceGray': Colorspace.gray, @@ -105,18 +136,24 @@ def _is_unit_square(shorthand): class XobjectSettings(NamedTuple): + """Info about an XObject found in a PDF.""" + name: str shorthand: Tuple[float, float, float, float, float, float] stack_depth: int class InlineSettings(NamedTuple): + """Info about an inline image found in a PDF.""" + iimage: PdfInlineImage shorthand: Tuple[float, float, float, float, float, float] stack_depth: int class ContentsInfo(NamedTuple): + """Info about various objects found in a PDF.""" + xobject_settings: List[XobjectSettings] inline_images: List[InlineSettings] found_vector: bool @@ -125,17 +162,19 @@ class ContentsInfo(NamedTuple): class TextboxInfo(NamedTuple): + """Info about a text box found in a PDF.""" + bbox: Tuple[float, float, float, float] is_visible: bool is_corrupt: bool class VectorMarker: - pass + """Sentinel indicating vector drawing operations were found on a page.""" class TextMarker: - pass + """Sentinel indicating text drawing operations were found on a page.""" def _normalize_stack(graphobjs): @@ -197,7 +236,7 @@ def _interpret_contents(contentstream: Object, initial_shorthand=UNIT_SQUARE): if len(stack) > 32: # See docstring if len(stack) > 128: raise RuntimeError( - "PDF graphics stack overflowed hard limit, operator %i" % n + f"PDF graphics stack overflowed hard limit at operator {n}" ) warn("PDF graphics stack overflowed spec limit") elif operator == 'Q': @@ -283,7 +322,7 @@ def _get_dpi(ctm_shorthand, image_size) -> Resolution: """ - a, b, c, d, _, _ = ctm_shorthand + a, b, c, d, _, _ = ctm_shorthand # pylint: disable=invalid-name # Calculate the width and height of the image in PDF units image_drawn = hypot(a, b), hypot(c, d) @@ -299,6 +338,8 @@ def _get_dpi(ctm_shorthand, image_size) -> 
Resolution: class ImageInfo: + """Information about an image found in a PDF.""" + DPI_PREC = Decimal('1.000') _comp: Optional[int] @@ -428,7 +469,7 @@ def _find_inline_images(contentsinfo: ContentsInfo) -> Iterator[ImageInfo]: for n, inline in enumerate(contentsinfo.inline_images): yield ImageInfo( - name='inline-%02d' % n, shorthand=inline.shorthand, inline=inline.iimage + name=f'inline-{n:02d}', shorthand=inline.shorthand, inline=inline.iimage ) @@ -569,10 +610,10 @@ def _process_content_streams( yield from _find_form_xobject_images(pdf, container, contentsinfo) -def _page_has_text(text_blocks, page_width, page_height) -> bool: +def _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) -> bool: """Smarter text detection that ignores text in margins""" - pw, ph = float(page_width), float(page_height) + pw, ph = float(page_width), float(page_height) # pylint: disable=invalid-name margin_ratio = 0.125 interior_bbox = ( @@ -582,7 +623,7 @@ def _page_has_text(text_blocks, page_width, page_height) -> bool: margin_ratio * ph, # bottom (first quadrant: bottom < top) ) - def rects_intersect(a, b) -> bool: + def rects_intersect(a: FloatRect, b: FloatRect) -> bool: """ Where (a,b) are 4-tuple rects (left-0, top-1, right-2, bottom-3) https://stackoverflow.com/questions/306316/determine-if-two-rectangles-overlap-each-other @@ -604,19 +645,19 @@ def simplify_textboxes(miner, textbox_getter) -> Iterator[TextboxInfo]: We do this to save memory and ensure that our objects are pickleable. 
""" for box in textbox_getter(miner): - first_line = box._objs[0] - first_char = first_line._objs[0] + first_line = box._objs[0] # pylint: disable=protected-access + first_char = first_line._objs[0] # pylint: disable=protected-access visible = first_char.rendermode != 3 corrupt = first_char.get_text() == '\ufffd' yield TextboxInfo(box.bbox, visible, corrupt) -worker_pdf = None +worker_pdf = None # pylint: disable=invalid-name def _pdf_pageinfo_sync_init(pdf: Pdf, infile: Path, pdfminer_loglevel): - global worker_pdf # pylint: disable=global-statement + global worker_pdf # pylint: disable=global-statement,invalid-name pikepdf_enable_mmap() logging.getLogger('pdfminer').setLevel(pdfminer_loglevel) @@ -701,6 +742,8 @@ def _pdf_pageinfo_concurrent( class PageInfo: + """Information about type of contents on each page in a PDF.""" + _has_text: Optional[bool] _has_vector: Optional[bool] _images: List[ImageInfo] @@ -762,15 +805,15 @@ class PageInfo: self._has_vector = False self._has_text = False self._images = [] - for ci in _process_content_streams( + for info in _process_content_streams( pdf=pdf, container=page, shorthand=userunit_shorthand ): - if isinstance(ci, VectorMarker): + if isinstance(info, VectorMarker): self._has_vector = True - elif isinstance(ci, TextMarker): + elif isinstance(info, TextMarker): self._has_text = True - elif isinstance(ci, ImageInfo): - self._images.append(ci) + elif isinstance(info, ImageInfo): + self._images.append(info) else: raise NotImplementedError() else: diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py index 9db079e2..e5e290ca 100644 --- a/src/ocrmypdf/pluginspec.py +++ b/src/ocrmypdf/pluginspec.py @@ -4,6 +4,7 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+"""OCRmyPDF pluggy plugin specification.""" from abc import ABC, abstractmethod from argparse import ArgumentParser, Namespace diff --git a/src/ocrmypdf/subprocess/__init__.py b/src/ocrmypdf/subprocess/__init__.py index 4ba85c14..30f63520 100644 --- a/src/ocrmypdf/subprocess/__init__.py +++ b/src/ocrmypdf/subprocess/__init__.py @@ -28,7 +28,12 @@ log = logging.getLogger(__name__) def run( - args, *, env=None, logs_errors_to_stdout: bool = False, **kwargs + args, + *, + env=None, + logs_errors_to_stdout: bool = False, + check: bool = False, + **kwargs, ) -> CompletedProcess: """Wrapper around :py:func:`subprocess.run` @@ -50,7 +55,7 @@ def run( stderr = None stderr_name = 'stderr' if not logs_errors_to_stdout else 'stdout' try: - proc = subprocess_run(args, env=env, **kwargs) + proc = subprocess_run(args, env=env, check=check, **kwargs) except CalledProcessError as e: stderr = getattr(e, stderr_name, None) raise @@ -111,6 +116,7 @@ def _fix_process_args(args, env, kwargs): program = str(args[0]) if os.name == 'nt': + # pylint: disable=import-outside-toplevel from ocrmypdf.subprocess._windows import fix_windows_args args = fix_windows_args(program, args, env) @@ -171,42 +177,42 @@ def get_version( return version -missing_program = ''' +MISSING_PROGRAM = ''' The program '{program}' could not be executed or was not found on your system PATH. ''' -missing_optional_program = ''' +MISSING_OPTIONAL_PROGRAM = ''' The program '{program}' could not be executed or was not found on your system PATH. This program is required when you use the {required_for} arguments. You could try omitting these arguments, or install the package. ''' -missing_recommend_program = ''' +MISSING_RECOMMEND_PROGRAM = ''' The program '{program}' could not be executed or was not found on your system PATH. This program is recommended when using the {required_for} arguments, but not required, so we will proceed. For best results, install the program. 
''' -old_version = ''' +OLD_VERSION = ''' OCRmyPDF requires '{program}' {need_version} or higher. Your system appears to have {found_version}. Please update this program. ''' -old_version_required_for = ''' +OLD_VERSION_REQUIRED_FOR = ''' OCRmyPDF requires '{program}' {need_version} or higher when run with the {required_for} arguments. If you omit these arguments, OCRmyPDF may be able to proceed. For best results, install the program. ''' -osx_install_advice = ''' +OSX_INSTALL_ADVICE = ''' If you have homebrew installed, try these command to install the missing package: brew install {package} ''' -linux_install_advice = ''' +LINUX_INSTALL_ADVICE = ''' On systems with the aptitude package manager (Debian, Ubuntu), try these commands: sudo apt-get update @@ -216,7 +222,7 @@ On RPM-based systems (Red Hat, Fedora), search for instructions on installing the RPM for {program}. ''' -windows_install_advice = ''' +WINDOWS_INSTALL_ADVICE = ''' If not already installed, install the Chocolatey package manager. 
Then use a command prompt to install the missing package: choco install {package} @@ -234,32 +240,35 @@ def _get_platform(): def _error_trailer(program, package, **kwargs): + del kwargs if isinstance(package, Mapping): package = package.get(_get_platform(), program) if _get_platform() == 'darwin': - log.info(osx_install_advice.format(**locals())) + log.info(OSX_INSTALL_ADVICE.format(**locals())) elif _get_platform() == 'linux': - log.info(linux_install_advice.format(**locals())) + log.info(LINUX_INSTALL_ADVICE.format(**locals())) elif _get_platform() == 'windows': - log.info(windows_install_advice.format(**locals())) + log.info(WINDOWS_INSTALL_ADVICE.format(**locals())) def _error_missing_program(program, package, required_for, recommended): + # pylint: disable=unused-argument if recommended: - log.warning(missing_recommend_program.format(**locals())) + log.warning(MISSING_RECOMMEND_PROGRAM.format(**locals())) elif required_for: - log.error(missing_optional_program.format(**locals())) + log.error(MISSING_OPTIONAL_PROGRAM.format(**locals())) else: - log.error(missing_program.format(**locals())) + log.error(MISSING_PROGRAM.format(**locals())) _error_trailer(**locals()) def _error_old_version(program, package, need_version, found_version, required_for): + # pylint: disable=unused-argument if required_for: - log.error(old_version_required_for.format(**locals())) + log.error(OLD_VERSION_REQUIRED_FOR.format(**locals())) else: - log.error(old_version.format(**locals())) + log.error(OLD_VERSION.format(**locals())) _error_trailer(**locals()) @@ -294,10 +303,15 @@ def check_external_program( found_version = version_checker() else: # deprecated found_version = version_checker - except (CalledProcessError, FileNotFoundError, MissingDependencyError): + except (CalledProcessError, FileNotFoundError) as e: _error_missing_program(program, package, required_for, recommended) if not recommended: - raise MissingDependencyError(program) + raise MissingDependencyError(program) from e + 
return + except MissingDependencyError: + _error_missing_program(program, package, required_for, recommended) + if not recommended: + raise return def remove_leading_v(s): diff --git a/src/ocrmypdf/subprocess/_windows.py b/src/ocrmypdf/subprocess/_windows.py index d6a9d63b..10b2d2be 100644 --- a/src/ocrmypdf/subprocess/_windows.py +++ b/src/ocrmypdf/subprocess/_windows.py @@ -7,6 +7,8 @@ # type: ignore # Non-Windows mypy now breaks when trying to typecheck winreg +"""Find Tesseract and Ghostscript binaries on Windows using the registry.""" + import logging import os import shutil @@ -17,9 +19,9 @@ from typing import Any, Callable, Iterable, Iterator, Set, Tuple, TypeVar try: import winreg -except ModuleNotFoundError as e: - raise ModuleNotFoundError("This module is for Windows only") from e - +except ModuleNotFoundError as _notfound_ex: + raise ModuleNotFoundError("This module is for Windows only") from _notfound_ex +del _notfound_ex log = logging.getLogger(__name__) @@ -40,15 +42,15 @@ def ghostscript_version_key(s: str) -> Tuple[int, int, int]: def registry_enum( key: winreg.HKEYType, enum_fn: Callable[[winreg.HKEYType, int], T] ) -> Iterator[T]: - LIMIT = 999 + limit = 999 n = 0 - while n < LIMIT: + while n < limit: try: yield enum_fn(key, n) n += 1 except OSError: break - if n == LIMIT: + if n == limit: raise ValueError(f"Too many registry keys under {key}") @@ -61,6 +63,7 @@ def registry_values(key: winreg.HKEYType) -> Iterator[Tuple[str, Any, int]]: def registry_path_ghostscript(env=None) -> Iterator[Path]: + del env # unused (but needed for protocol) try: with winreg.OpenKey( winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Artifex\GPL Ghostscript" @@ -78,6 +81,7 @@ def registry_path_ghostscript(env=None) -> Iterator[Path]: def registry_path_tesseract(env=None) -> Iterator[Path]: + del env # unused (but needed for protocol) try: with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Tesseract-OCR") as k: for subkey, val, _valtype in registry_values(k): diff --git 
a/tests/test_metadata.py b/tests/test_metadata.py index e2f4d091..65a1d255 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -8,7 +8,6 @@ import datetime import warnings from datetime import timezone -from os import fspath from shutil import copyfile import pikepdf @@ -326,6 +325,9 @@ def test_metadata_fixup_warning(resources, outdir, caplog): assert any(record.levelname == 'WARNING' for record in caplog.records) +XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d' + + def test_prevent_gs_invalid_xml(resources, outdir): generate_pdfa_ps(outdir / 'pdfa.ps') copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf') @@ -352,7 +354,7 @@ def test_prevent_gs_invalid_xml(resources, outdir): contents = (outdir / 'pdfa.pdf').read_bytes() # Since the XML may be invalid, we scan instead of actually feeding it # to a parser. - XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d' + xmp_start = contents.find(XMP_MAGIC) xmp_end = contents.rfind(b'