Configure pylint in pyproject and delint

This commit is contained in:
James R. Barlow
2022-06-11 01:15:30 -07:00
parent d640c2ded3
commit b17fb61389
33 changed files with 323 additions and 157 deletions

View File

@@ -99,3 +99,8 @@ module = [
'libxmp.utils'
]
ignore_missing_imports = true
[tool.pylint.basic]
good-names = ["i", "j", "k", "ex", "Run", "_", "e", "p", "im", "w", "h", "m", "x", "y", "a", "b", "fp", "n", "f", "s", "v", "q", "dx", "dy"]
logging-format-style = "old"
disable = ["raw-checker-failed", "bad-inline-option", "locally-disabled", "file-ignored", "suppressed-message", "useless-suppression", "deprecated-pragma", "use-symbolic-message-instead", "logging-fstring-interpolation", "missing-function-docstring", "too-few-public-methods"]

View File

@@ -4,6 +4,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""setup.py to support older setuptools and pip."""
from setuptools import setup

View File

@@ -4,6 +4,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Adds OCR layer to PDFs."""
from pluggy import HookimplMarker as _HookimplMarker

View File

@@ -5,6 +5,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""ocrmypdf command line entrypoint."""
import logging
import os

View File

@@ -4,6 +4,8 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""OCRmyPDF concurrency abstractions."""
import threading
from abc import ABC, abstractmethod
from typing import Callable, Iterable, Optional
@@ -14,6 +16,8 @@ def _task_noop(*_args, **_kwargs):
class NullProgressBar:
"""Progress bar API that takes no actions."""
def __init__(self, **kwargs):
pass
@@ -28,6 +32,8 @@ class NullProgressBar:
class Executor(ABC):
"""Abstract concurrent executor."""
pool_lock = threading.Lock()
pbar_class = NullProgressBar

View File

@@ -14,13 +14,12 @@ import sys
from io import BytesIO
from os import fspath
from pathlib import Path
from shutil import which
from subprocess import PIPE, CalledProcessError
from typing import Optional
from PIL import Image, UnidentifiedImageError
from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError
from ocrmypdf.exceptions import SubprocessOutputError
from ocrmypdf.helpers import Resolution
from ocrmypdf.subprocess import get_version, run, run_polling_stderr
@@ -33,29 +32,18 @@ except AttributeError:
log = logging.getLogger(__name__)
missing_gs_error = """
---------------------------------------------------------------------
This error normally occurs when ocrmypdf find can't Ghostscript.
Please ensure Ghostscript is installed and its location is added to
the system PATH environment variable.
For details see:
https://ocrmypdf.readthedocs.io/en/latest/installation.html
---------------------------------------------------------------------
"""
# Most reliable way to get the bitness of Python interpreter, according to Python docs
_is_64bit = sys.maxsize > 2**32
_IS_64BIT = sys.maxsize > 2**32
_gswin = None
_GSWIN = None
if os.name == 'nt':
if _is_64bit:
_gswin = 'gswin64c'
if _IS_64BIT:
_GSWIN = 'gswin64c'
else:
_gswin = 'gswin32c'
_GSWIN = 'gswin32c'
GS = _gswin if _gswin else 'gs'
del _gswin
GS = _GSWIN if _GSWIN else 'gs'
del _GSWIN
def version():
@@ -126,7 +114,7 @@ def rasterize_pdf(
p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
except CalledProcessError as e:
log.error(e.stderr.decode(errors='replace'))
raise SubprocessOutputError('Ghostscript rasterizing failed')
raise SubprocessOutputError('Ghostscript rasterizing failed') from e
else:
stderr = p.stderr.decode(errors='replace')
if _gs_error_reported(stderr):
@@ -156,6 +144,8 @@ def rasterize_pdf(
class GhostscriptFollower:
"""Parses the output of Ghostscript and uses it to update the progress bar."""
re_process = re.compile(r"Processing pages \d+ through (\d+).")
re_page = re.compile(r"Page (\d+)")

View File

@@ -13,7 +13,7 @@ from math import pi
from os import fspath
from pathlib import Path
from subprocess import PIPE, STDOUT, CalledProcessError, TimeoutExpired
from typing import Dict, Iterator, List, Optional
from typing import Dict, List, Optional
from packaging.version import Version
from PIL import Image
@@ -55,6 +55,8 @@ TESSERACT_THRESHOLDING_METHODS: Dict[str, int] = {
class TesseractLoggerAdapter(logging.LoggerAdapter):
"Prepend [tesseract] to messages emitted from tesseract"
def process(self, msg, kwargs):
kwargs['extra'] = self.extra
return f'[tesseract] {msg}', kwargs
@@ -105,6 +107,7 @@ TESSERACT_VERSION_PATTERN = r"""
class TesseractVersion(Version):
"Modify standard packaging.Version regex to support Tesseract idiosyncrasies."
_regex = re.compile(
r"^\s*" + TESSERACT_VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE
)
@@ -169,14 +172,14 @@ def tess_base_args(langs: List[str], engine_mode: Optional[int]) -> List[str]:
def _parse_tesseract_output(binary_output: bytes) -> Dict[str, str]:
def g():
def gen():
for line in binary_output.decode().splitlines():
line = line.strip()
parts = line.split(':', maxsplit=2)
if len(parts) == 2:
yield parts[0].strip(), parts[1].strip()
return {k: v for k, v in g()}
return dict(gen())
def get_orientation(
@@ -205,10 +208,10 @@ def get_orientation(
osd = _parse_tesseract_output(p.stdout)
angle = int(osd.get('Orientation in degrees', 0))
oc = OrientationConfidence(
orient_conf = OrientationConfidence(
angle=angle, confidence=float(osd.get('Orientation confidence', 0))
)
return oc
return orient_conf
def get_deskew(

View File

@@ -30,12 +30,16 @@ if sys.version_info >= (3, 10):
else:
from tempfile import TemporaryDirectory as _TemporaryDirectory
# Consume the ignore_cleanup_errors kwarg in Python 3.9 and older, without acting
# on this keyword. Users who need this issue full resolved should upgrade to Python
# 3.10.
# See: https://github.com/python/cpython/pull/24793
class TemporaryDirectory(_TemporaryDirectory):
"""Shim to consume ignore_cleanup_errors kwarg on Python 3.9 and older.
The argument is consumed without action. If users are getting errors related
to temporary file cleanup, they should upgrade to Python 3.10 which properly
cleans up temporary directories on Windows.
See: https://github.com/python/cpython/pull/24793
"""
def __init__(self, ignore_cleanup_errors=False, **kwargs):
super().__init__(**kwargs)
@@ -50,6 +54,8 @@ log = logging.getLogger(__name__)
class UnpaperImageTooLargeError(Exception):
"""To capture details when an image is too large for unpaper."""
def __init__(
self,
w,
@@ -66,8 +72,10 @@ def version() -> str:
return get_version('unpaper')
SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
def _convert_image(im: Image.Image) -> Tuple[Image.Image, bool, str]:
SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
im_modified = False
if im.mode not in SUFFIXES:

View File

@@ -4,6 +4,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""For grafting text-only PDF pages onto freeform PDF pages."""
import logging
import uuid
@@ -16,7 +17,6 @@ from pikepdf import (
Name,
Object,
Operator,
Page,
Pdf,
PdfError,
PdfMatrix,
@@ -81,6 +81,8 @@ def strip_invisible_text(pdf, page):
class OcrGrafter:
"""Manages grafting text-only PDFs onto regular PDFs."""
def __init__(self, context):
self.context = context
self.path_base = context.origin
@@ -236,6 +238,8 @@ class OcrGrafter:
):
"""Insert the text layer from text page 0 on to pdf_base at page_num"""
# pylint: disable=invalid-name
log.debug("Grafting")
if Path(textpdf).stat().st_size == 0:
return

View File

@@ -4,6 +4,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Defines context objects that are passed to child processes/threads."""
import os
import shutil

View File

@@ -4,15 +4,17 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Logging support classes."""
import logging
import sys
from contextlib import suppress
from tqdm import tqdm
class PageNumberFilter(logging.Filter):
"""Insert PDF page number that emitted log message to log record."""
def filter(self, record):
pageno = getattr(record, 'pageno', None)
if isinstance(pageno, int):

View File

@@ -4,6 +4,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""OCRmyPDF page processing pipeline functions."""
import logging
import os
@@ -332,7 +333,8 @@ def is_ocr_required(page_context: PageContext):
ocr_required = False
log.warning(
"page too big, skipping OCR "
f"({(pixel_count / 1_000_000):.1f} MPixels > {options.skip_big:.1f} MPixels --skip-big)"
f"({(pixel_count / 1_000_000):.1f} MPixels > "
f"{options.skip_big:.1f} MPixels --skip-big)"
)
return ocr_required
@@ -430,8 +432,8 @@ def rasterize(
output_file = page_context.get_path(f'rasterize{output_tag}.png')
pageinfo = page_context.pageinfo
def at_least(cs):
return max(device_idx, colorspaces.index(cs))
def at_least(colorspace):
return max(device_idx, colorspaces.index(colorspace))
for image in pageinfo.images:
if image.type_ != 'image':
@@ -471,10 +473,10 @@ def rasterize(
def preprocess_remove_background(input_file: Path, page_context: PageContext):
if any(image.bpc > 1 for image in page_context.pageinfo.images):
output_file = page_context.get_path('pp_rm_bg.png')
# leptonica.remove_background(input_file, output_file)
raise NotImplementedError("--remove-background is temporarily not implemented")
return output_file
# output_file = page_context.get_path('pp_rm_bg.png')
# leptonica.remove_background(input_file, output_file)
# return output_file
else:
log.info("background removal skipped on mono page")
return input_file
@@ -858,8 +860,8 @@ def enumerate_compress_ranges(iterable):
def merge_sidecars(txt_files: Iterable[Optional[Path]], context: PdfContext):
output_file = context.get_path('sidecar.txt')
with open(output_file, 'w', encoding="utf-8") as stream:
for (frm, to), txt_file in enumerate_compress_ranges(txt_files):
if frm != 1:
for (from_, to_), txt_file in enumerate_compress_ranges(txt_files):
if from_ != 1:
stream.write('\f') # Form feed between pages
if txt_file:
with open(txt_file, encoding="utf-8") as in_:
@@ -872,10 +874,10 @@ def merge_sidecars(txt_files: Iterable[Optional[Path]], context: PdfContext):
else:
stream.write(txt)
else:
if frm != to:
pages = f'{frm}-{to}'
if from_ != to_:
pages = f'{from_}-{to_}'
else:
pages = f'{frm}'
pages = f'{from_}'
stream.write(f'[OCR skipped on page(s) {pages}]')
return output_file

View File

@@ -4,6 +4,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Plugin manager using pluggy."""
import argparse
import importlib
@@ -101,12 +102,11 @@ class OcrmypdfPluginManager(pluggy.PluginManager):
def get_plugin_manager(plugins: List[Union[str, Path]], builtins=True):
pm = OcrmypdfPluginManager(
return OcrmypdfPluginManager(
project_name='ocrmypdf',
plugins=plugins,
builtins=builtins,
)
return pm
def get_parser_options_plugins(

View File

@@ -4,6 +4,8 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Implements the concurrent and page synchronous parts of the pipeline."""
import argparse
import logging
@@ -68,7 +70,9 @@ from ocrmypdf.pdfa import file_claims_pdfa
log = logging.getLogger(__name__)
class PageResult(NamedTuple): # pylint: disable=inherit-non-class
class PageResult(NamedTuple):
"""Result when a page is finished processing."""
pageno: int
pdf_page_from_image: Optional[Path]
ocr: Optional[Path]
@@ -425,7 +429,7 @@ def run_pipeline(
else:
log.error(type(e).__name__)
return e.exit_code
except (PIL.Image.DecompressionBombError if not api else NeverRaise) as e:
except (PIL.Image.DecompressionBombError if not api else NeverRaise):
log.exception(
"A decompression bomb error was encountered while executing the "
"pipeline. Use the argument --max-image-mpixels to raise the maximum "
@@ -435,7 +439,7 @@ def run_pipeline(
except (
BrokenProcessPool if not api else NeverRaise,
BrokenThreadPool if not api else NeverRaise,
) as e:
):
log.exception(
"A worker process was terminated unexpectedly. This is known to occur if "
"processing your file takes all available swap space and RAM. It may "

View File

@@ -5,6 +5,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Validate a work order from API or command line."""
import locale
import logging
@@ -25,7 +26,7 @@ from ocrmypdf.exceptions import (
MissingDependencyError,
OutputFileAccessError,
)
from ocrmypdf.helpers import is_file_writable, monotonic, safe_symlink, samefile
from ocrmypdf.helpers import is_file_writable, monotonic, safe_symlink
from ocrmypdf.hocrtransform import HOCR_OK_LANGS
from ocrmypdf.subprocess import check_external_program
@@ -146,13 +147,13 @@ def check_options_preprocessing(options):
def _pages_from_ranges(ranges: str) -> Set[int]:
pages: List[int] = []
page_groups = ranges.replace(' ', '').split(',')
for g in page_groups:
if not g:
for group in page_groups:
if not group:
continue
try:
start, end = g.split('-')
start, end = group.split('-')
except ValueError:
pages.append(int(g) - 1)
pages.append(int(group) - 1)
else:
try:
new_pages = list(range(int(start) - 1, int(end)))
@@ -162,7 +163,7 @@ def _pages_from_ranges(ranges: str) -> Set[int]:
) from None
pages.extend(new_pages)
except ValueError:
raise BadArgsError(f"invalid page subrange '{g}'") from None
raise BadArgsError(f"invalid page subrange '{group}'") from None
if not pages:
raise BadArgsError(
@@ -237,13 +238,13 @@ def check_options_advanced(options):
def check_options_metadata(options):
docinfo = [options.title, options.author, options.keywords, options.subject]
for s in (m for m in docinfo if m):
for c in s:
if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000:
for char in s:
if unicodedata.category(char) == 'Co' or ord(char) >= 0x10000:
hexchar = hex(ord(char))[2:].upper()
raise ValueError(
"One of the metadata strings contains "
"an unsupported Unicode character: '{}' (U+{})".format(
c, hex(ord(c))[2:].upper()
)
"an unsupported Unicode character: "
f"{char} (U+{hexchar})"
)
@@ -293,7 +294,7 @@ def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
target = work_folder / 'origin'
safe_symlink(options.input_file, target)
return target, os.fspath(options.input_file)
except FileNotFoundError:
except FileNotFoundError as e:
msg = f"File not found - {options.input_file}"
if Path('/.dockerenv').exists(): # pragma: no cover
msg += (
@@ -304,7 +305,7 @@ def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
"\n"
"\tdocker run -i --rm jbarlow83/ocrmypdf - - <input.pdf >output.pdf\n"
)
raise InputFileError(msg)
raise InputFileError(msg) from e
def check_requested_output_file(options):
@@ -324,7 +325,9 @@ def check_requested_output_file(options):
)
def report_output_file_size(options, input_file, output_file):
def report_output_file_size(
options, input_file, output_file, file_overhead=4000, page_overhead=3000
):
try:
output_size = Path(output_file).stat().st_size
input_size = Path(input_file).stat().st_size
@@ -333,9 +336,7 @@ def report_output_file_size(options, input_file, output_file):
with pikepdf.open(output_file) as p:
# Overhead constants obtained by estimating amount of data added by OCR,
# PDF/A conversion, and possible XMP metadata addition, with compression
FILE_OVERHEAD = 4000
OCR_PER_PAGE_OVERHEAD = 3000
reasonable_overhead = FILE_OVERHEAD + OCR_PER_PAGE_OVERHEAD * len(p.pages)
reasonable_overhead = file_overhead + page_overhead * len(p.pages)
ratio = output_size / input_size
reasonable_ratio = output_size / (input_size + reasonable_overhead)
if reasonable_ratio < 1.35 or input_size < 25000:

View File

@@ -4,6 +4,10 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Get version by introspecting package information.
OCRmyPDF uses setuptools_scm to derive version from git tags.
"""
try:
from importlib_metadata import version as _package_version

View File

@@ -11,6 +11,8 @@
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""OCRmyPDF's multiprocessing/multithreading abstraction layer."""
import logging
import logging.handlers
import multiprocessing
@@ -21,7 +23,6 @@ import sys
import threading
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from contextlib import suppress
from multiprocessing.pool import Pool, ThreadPool
from typing import Callable, Iterable, Type, Union
from tqdm import tqdm
@@ -44,7 +45,8 @@ def log_listener(q: Queue):
should actually write to sys.stderr or whatever we're using, so if this is
made into a process the main application needs to be directed to it.
See https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
See:
https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
"""
while True:
@@ -89,6 +91,8 @@ def process_init(q: Queue, user_init: UserInit, loglevel) -> None:
def thread_init(q: Queue, user_init: UserInit, loglevel) -> None:
del q # unused but required argument
del loglevel # unused but required argument
# As a thread, block SIGBUS so the main thread deals with it...
with suppress(AttributeError):
signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGBUS})
@@ -98,6 +102,8 @@ def thread_init(q: Queue, user_init: UserInit, loglevel) -> None:
class StandardExecutor(Executor):
"""Standard OCRmyPDF concurrent task executor."""
def _execute(
self,
*,

View File

@@ -4,6 +4,8 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""OCRmyPDF automatically installs these filters as plugins."""
from ocrmypdf import hookimpl

View File

@@ -5,6 +5,8 @@
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Built-in plugin to implement PDF page rasterization and PDF/A production."""
import logging
from ocrmypdf import hookimpl

View File

@@ -4,6 +4,8 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Built-in plugin to implement OCR using Tesseract."""
import logging
import os
@@ -138,6 +140,8 @@ def validate(pdfinfo, options):
class TesseractOcrEngine(OcrEngine):
"""Implements OCR with Tesseract."""
@staticmethod
def version():
return tesseract.version()

View File

@@ -4,6 +4,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Command line interface customization and validation."""
import argparse
from typing import Any, Callable, Mapping, Optional, TypeVar
@@ -42,7 +43,7 @@ def str_to_int(mapping: Mapping[str, int]):
except KeyError:
raise argparse.ArgumentTypeError(
f"{s!r} must be one of: {', '.join(mapping.keys())}"
)
) from None
return _str_to_int
@@ -51,6 +52,11 @@ class ArgumentParser(argparse.ArgumentParser):
"""Override parser's default behavior of calling sys.exit()
https://stackoverflow.com/questions/5943249/python-argparse-and-controlling-overriding-the-exit-status-code
OCRmyPDF began as a CLI but eventually acquired an API. The API works inside out,
by synthesizing a command line argument. So we subclass the standard parser with
one that doesn't call sys.exit(). Obviously this is not the ideal way to do things
but it works for us.
"""
def __init__(self, *args, **kwargs):
@@ -65,6 +71,8 @@ class ArgumentParser(argparse.ArgumentParser):
class LanguageSetAction(argparse.Action):
"""Manages a list of languages."""
def __init__(self, option_strings, dest, default=None, **kwargs):
if default is None:
default = set()

View File

@@ -4,12 +4,16 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""OCRmyPDF's exceptions."""
from enum import IntEnum
from textwrap import dedent
class ExitCode(IntEnum):
"""OCRmyPDF's exit codes."""
# pylint: disable=invalid-name
ok = 0
bad_args = 1
input_file = 2
@@ -26,6 +30,8 @@ class ExitCode(IntEnum):
class ExitCodeException(Exception):
"""An exception which should return an exit code with sys.exit()."""
exit_code = ExitCode.other_error
message = ""
@@ -37,17 +43,24 @@ class ExitCodeException(Exception):
class BadArgsError(ExitCodeException):
"""Invalid arguments on the command line or API."""
exit_code = ExitCode.bad_args
class PdfMergeFailedError(ExitCodeException):
class PdfMergeFailedError(ExitCodeException): # deprecated
"""An intermediate PDF can't be merged.
No longer in use.
"""
exit_code = ExitCode.input_file
message = dedent(
'''\
Failed to merge PDF image layer with OCR layer
Usually this happens because the input PDF file is malformed and
ocrmypdf cannot automatically correct the problem on its own.
ocrmypdf cannot correct the problem on its own.
Try using
ocrmypdf --pdf-renderer sandwich [..other args..]
@@ -56,34 +69,50 @@ class PdfMergeFailedError(ExitCodeException):
class MissingDependencyError(ExitCodeException):
"""A third-party dependency is missing."""
exit_code = ExitCode.missing_dependency
class UnsupportedImageFormatError(ExitCodeException):
"""The image format is not supported."""
exit_code = ExitCode.input_file
class DpiError(ExitCodeException):
"""Missing information about input image DPI."""
exit_code = ExitCode.input_file
class OutputFileAccessError(ExitCodeException):
"""Cannot access the intended output file path."""
exit_code = ExitCode.file_access_error
class PriorOcrFoundError(ExitCodeException):
"""This file already has OCR."""
exit_code = ExitCode.already_done_ocr
class InputFileError(ExitCodeException):
"""Something is wrong with the input file."""
exit_code = ExitCode.input_file
class SubprocessOutputError(ExitCodeException):
"""A subprocess returned an unexpected error."""
exit_code = ExitCode.child_process_error
class EncryptedPdfError(ExitCodeException):
"""Input PDF is encrypted."""
exit_code = ExitCode.encrypted_pdf
message = dedent(
'''\
@@ -100,5 +129,7 @@ class EncryptedPdfError(ExitCodeException):
class TesseractConfigError(ExitCodeException):
"""Tesseract config can't be parsed."""
exit_code = ExitCode.invalid_config
message = "Error occurred while parsing a Tesseract configuration file"

View File

@@ -37,9 +37,11 @@ from ocrmypdf.helpers import remove_all_log_handlers
class MessageType(Enum):
exception = auto()
result = auto()
complete = auto()
"""Implement basic IPC messaging."""
exception = auto() # pylint: disable=invalid-name
result = auto() # pylint: disable=invalid-name
complete = auto() # pylint: disable=invalid-name
def split_every(n: int, iterable: Iterable) -> Iterator:
@@ -59,6 +61,8 @@ def process_sigbus(*args):
class ConnectionLogHandler(logging.handlers.QueueHandler):
"""Handler used by child processes to forward log messages to parent."""
def __init__(self, conn: Connection) -> None:
# sets the parent's queue to None - parent only touches queue
# in enqueue() which we override
@@ -91,7 +95,7 @@ def process_loop(
for args in task_args:
try:
result = task(args)
except Exception as e:
except Exception as e: # pylint: disable=broad-except
conn.send((MessageType.exception, e))
break
else:
@@ -103,6 +107,8 @@ def process_loop(
class LambdaExecutor(Executor):
"""Executor for AWS Lambda or similar environments that lack semaphores."""
def _execute(
self,
*,
@@ -153,13 +159,13 @@ class LambdaExecutor(Executor):
with self.pbar_class(**tqdm_kwargs) as pbar:
while connections:
for r in wait(connections):
if not isinstance(r, Connection):
for result in wait(connections):
if not isinstance(result, Connection):
raise NotImplementedError("We only support Connection()")
try:
msg_type, msg = r.recv()
msg_type, msg = result.recv()
except EOFError:
connections.remove(r)
connections.remove(result)
continue
if msg_type == MessageType.result:
@@ -170,7 +176,7 @@ class LambdaExecutor(Executor):
logger = logging.getLogger(record.name)
logger.handle(record)
elif msg_type == MessageType.complete:
connections.remove(r)
connections.remove(result)
elif msg_type == MessageType.exception:
for process in processes:
process.terminate()

View File

@@ -4,6 +4,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Support functions."""
import logging
import multiprocessing
@@ -137,11 +138,11 @@ def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike):
os.symlink(os.path.abspath(input_file), soft_link_name)
def samefile(f1: os.PathLike, f2: os.PathLike):
def samefile(file1: os.PathLike, file2: os.PathLike):
if os.name == 'nt':
return f1 == f2
return file1 == file2
else:
return os.path.samefile(f1, f2)
return os.path.samefile(file1, file2)
def is_iterable_notstr(thing: Any) -> bool:
@@ -149,9 +150,9 @@ def is_iterable_notstr(thing: Any) -> bool:
return isinstance(thing, Iterable) and not isinstance(thing, str)
def monotonic(L: Sequence) -> bool:
def monotonic(seq: Sequence) -> bool:
"""Does this sequence increase monotonically?"""
return all(b > a for a, b in zip(L, L[1:]))
return all(b > a for a, b in zip(seq, seq[1:]))
def page_number(input_file: os.PathLike) -> int:
@@ -166,7 +167,7 @@ def available_cpu_count() -> int:
except NotImplementedError:
pass
warnings.warn(
"Could not get CPU count. Assuming one (1) CPU." "Use -j N to set manually."
"Could not get CPU count. Assuming one (1) CPU. Use -j N to set manually."
)
return 1
@@ -190,16 +191,16 @@ def is_file_writable(test_file: os.PathLike) -> bool:
os.W_OK,
effective_ids=(os.access in os.supports_effective_ids),
)
try:
fp = p.open('wb')
except OSError:
return False
else:
try:
fp = p.open('wb')
except OSError:
return False
else:
fp.close()
with suppress(OSError):
p.unlink()
return True
fp.close()
with suppress(OSError):
p.unlink()
return True
except (OSError, RuntimeError) as e:
log.debug(e)
log.error(str(e))

View File

@@ -28,6 +28,8 @@
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""Transform .hocr and page image to text PDF."""
import argparse
import os
import re
@@ -99,7 +101,7 @@ HOCR_OK_LANGS = frozenset(
Element = ElementTree.Element
class Rect(NamedTuple): # pylint: disable=inherit-non-class
class Rect(NamedTuple):
"""A rectangle for managing PDF coordinates."""
x1: Any
@@ -109,7 +111,7 @@ class Rect(NamedTuple): # pylint: disable=inherit-non-class
class HocrTransformError(Exception):
pass
"""Error while applying hOCR transform."""
class HocrTransform:
@@ -287,7 +289,7 @@ class HocrTransform:
continue
pxl_coords = self.element_coordinates(elem)
pt = self.pt_from_pixel(pxl_coords)
pt = self.pt_from_pixel(pxl_coords) # pylint: disable=invalid-name
# draw the bbox border
if show_bounding_boxes: # pragma: no cover

View File

@@ -4,6 +4,8 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Post-processing image optimization of OCR PDFs."""
import logging
import sys
@@ -55,7 +57,9 @@ DEFAULT_PNG_QUALITY = 70
Xref = NewType('Xref', int)
class XrefExt(NamedTuple): # pylint: disable=inherit-non-class
class XrefExt(NamedTuple):
"""A PDF xref and image extension pair."""
xref: Xref
ext: str
@@ -466,7 +470,7 @@ def _find_deflatable_jpeg(
result = extract_image_filter(pike, root, image, xref)
if result is None:
return None
pim, filtdp = result
_pim, filtdp = result
if filtdp[0] == Name.DCTDecode and not filtdp[1] and options.optimize >= 1:
return XrefExt(xref, '.memory')
@@ -707,9 +711,9 @@ def main(infile, outfile, level, jobs=1):
jb2lossy=False,
)
with TemporaryDirectory() as td:
context = PdfContext(options, td, infile, None, None)
tmpout = Path(td) / 'out.pdf'
with TemporaryDirectory() as tmpdir:
context = PdfContext(options, tmpdir, infile, None, None)
tmpout = Path(tmpdir) / 'out.pdf'
optimize(
infile,
tmpout,

View File

@@ -97,7 +97,8 @@ def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
target_filename: filename to save
icc: ICC identifier such as 'sRGB'
References:
Adobe PDFMARK Reference: https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdfmark_reference.pdf
Adobe PDFMARK Reference:
https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdfmark_reference.pdf
"""
if icc != 'sRGB':
raise NotImplementedError("Only supporting sRGB")
@@ -105,11 +106,11 @@ def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
bytes_icc_profile = (
package_files('ocrmypdf.data') / SRGB_ICC_PROFILE_NAME
).read_bytes()
ps = '\n'.join(_make_postscript(icc, bytes_icc_profile, 3))
postscript = '\n'.join(_make_postscript(icc, bytes_icc_profile, 3))
# We should have encoded everything to pure ASCII by this point, and
# to be safe, only allow ASCII in PostScript
Path(target_filename).write_text(ps, encoding='ascii')
Path(target_filename).write_text(postscript, encoding='ascii')
return target_filename

View File

@@ -6,4 +6,6 @@
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""For extracting information about PDFs prior to OCR."""
from ocrmypdf.pdfinfo.info import Colorspace, Encoding, PdfInfo

View File

@@ -6,13 +6,15 @@
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Extract information about the content of a PDF."""
import atexit
import logging
import re
from collections import defaultdict
from contextlib import ExitStack
from decimal import Decimal
from enum import Enum
from enum import Enum, auto
from functools import partial
from math import hypot, inf, isclose
from os import PathLike
@@ -20,6 +22,7 @@ from pathlib import Path
from typing import (
Container,
Dict,
Iterable,
Iterator,
List,
Mapping,
@@ -48,11 +51,39 @@ from ocrmypdf.pdfinfo.layout import get_page_analysis, get_text_boxes
logger = logging.getLogger()
Colorspace = Enum('Colorspace', 'gray rgb cmyk lab icc index sep devn pattern jpeg2000')
Encoding = Enum(
'Encoding', 'ccitt jpeg jpeg2000 jbig2 asciihex ascii85 lzw flate runlength'
)
class Colorspace(Enum):
"""Description of common image colorspaces in a PDF."""
# pylint: disable=invalid-name
gray = auto()
rgb = auto()
cmyk = auto()
lab = auto()
icc = auto()
index = auto()
sep = auto()
devn = auto()
pattern = auto()
jpeg2000 = auto()
class Encoding(Enum):
"""Description of common image encodings in a PDF."""
# pylint: disable=invalid-name
ccitt = auto()
jpeg = auto()
jpeg2000 = auto()
jbig2 = auto()
asciihex = auto()
ascii85 = auto()
lzw = auto()
flate = auto()
runlength = auto()
FloatRect = Tuple[float, float, float, float]
FRIENDLY_COLORSPACE: Dict[str, Colorspace] = {
'/DeviceGray': Colorspace.gray,
@@ -105,18 +136,24 @@ def _is_unit_square(shorthand):
class XobjectSettings(NamedTuple):
"""Info about an XObject found in a PDF."""
name: str
shorthand: Tuple[float, float, float, float, float, float]
stack_depth: int
class InlineSettings(NamedTuple):
"""Info about an inline image found in a PDF."""
iimage: PdfInlineImage
shorthand: Tuple[float, float, float, float, float, float]
stack_depth: int
class ContentsInfo(NamedTuple):
"""Info about various objects found in a PDF."""
xobject_settings: List[XobjectSettings]
inline_images: List[InlineSettings]
found_vector: bool
@@ -125,17 +162,19 @@ class ContentsInfo(NamedTuple):
class TextboxInfo(NamedTuple):
"""Info about a text box found in a PDF."""
bbox: Tuple[float, float, float, float]
is_visible: bool
is_corrupt: bool
class VectorMarker:
pass
"""Sentinel indicating vector drawing operations were found on a page."""
class TextMarker:
pass
"""Sentinel indicating text drawing operations were found on a page."""
def _normalize_stack(graphobjs):
@@ -197,7 +236,7 @@ def _interpret_contents(contentstream: Object, initial_shorthand=UNIT_SQUARE):
if len(stack) > 32: # See docstring
if len(stack) > 128:
raise RuntimeError(
"PDF graphics stack overflowed hard limit, operator %i" % n
f"PDF graphics stack overflowed hard limit at operator {n}"
)
warn("PDF graphics stack overflowed spec limit")
elif operator == 'Q':
@@ -283,7 +322,7 @@ def _get_dpi(ctm_shorthand, image_size) -> Resolution:
"""
a, b, c, d, _, _ = ctm_shorthand
a, b, c, d, _, _ = ctm_shorthand # pylint: disable=invalid-name
# Calculate the width and height of the image in PDF units
image_drawn = hypot(a, b), hypot(c, d)
@@ -299,6 +338,8 @@ def _get_dpi(ctm_shorthand, image_size) -> Resolution:
class ImageInfo:
"""Information about an image found in a PDF."""
DPI_PREC = Decimal('1.000')
_comp: Optional[int]
@@ -428,7 +469,7 @@ def _find_inline_images(contentsinfo: ContentsInfo) -> Iterator[ImageInfo]:
for n, inline in enumerate(contentsinfo.inline_images):
yield ImageInfo(
name='inline-%02d' % n, shorthand=inline.shorthand, inline=inline.iimage
name=f'inline-{n:02d}', shorthand=inline.shorthand, inline=inline.iimage
)
@@ -569,10 +610,10 @@ def _process_content_streams(
yield from _find_form_xobject_images(pdf, container, contentsinfo)
def _page_has_text(text_blocks, page_width, page_height) -> bool:
def _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) -> bool:
"""Smarter text detection that ignores text in margins"""
pw, ph = float(page_width), float(page_height)
pw, ph = float(page_width), float(page_height) # pylint: disable=invalid-name
margin_ratio = 0.125
interior_bbox = (
@@ -582,7 +623,7 @@ def _page_has_text(text_blocks, page_width, page_height) -> bool:
margin_ratio * ph, # bottom (first quadrant: bottom < top)
)
def rects_intersect(a, b) -> bool:
def rects_intersect(a: FloatRect, b: FloatRect) -> bool:
"""
Where (a,b) are 4-tuple rects (left-0, top-1, right-2, bottom-3)
https://stackoverflow.com/questions/306316/determine-if-two-rectangles-overlap-each-other
@@ -604,19 +645,19 @@ def simplify_textboxes(miner, textbox_getter) -> Iterator[TextboxInfo]:
We do this to save memory and ensure that our objects are pickleable.
"""
for box in textbox_getter(miner):
first_line = box._objs[0]
first_char = first_line._objs[0]
first_line = box._objs[0] # pylint: disable=protected-access
first_char = first_line._objs[0] # pylint: disable=protected-access
visible = first_char.rendermode != 3
corrupt = first_char.get_text() == '\ufffd'
yield TextboxInfo(box.bbox, visible, corrupt)
worker_pdf = None
worker_pdf = None # pylint: disable=invalid-name
def _pdf_pageinfo_sync_init(pdf: Pdf, infile: Path, pdfminer_loglevel):
global worker_pdf # pylint: disable=global-statement
global worker_pdf # pylint: disable=global-statement,invalid-name
pikepdf_enable_mmap()
logging.getLogger('pdfminer').setLevel(pdfminer_loglevel)
@@ -701,6 +742,8 @@ def _pdf_pageinfo_concurrent(
class PageInfo:
"""Information about type of contents on each page in a PDF."""
_has_text: Optional[bool]
_has_vector: Optional[bool]
_images: List[ImageInfo]
@@ -762,15 +805,15 @@ class PageInfo:
self._has_vector = False
self._has_text = False
self._images = []
for ci in _process_content_streams(
for info in _process_content_streams(
pdf=pdf, container=page, shorthand=userunit_shorthand
):
if isinstance(ci, VectorMarker):
if isinstance(info, VectorMarker):
self._has_vector = True
elif isinstance(ci, TextMarker):
elif isinstance(info, TextMarker):
self._has_text = True
elif isinstance(ci, ImageInfo):
self._images.append(ci)
elif isinstance(info, ImageInfo):
self._images.append(info)
else:
raise NotImplementedError()
else:

View File

@@ -4,6 +4,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""OCRmyPDF pluggy plugin specification."""
from abc import ABC, abstractmethod
from argparse import ArgumentParser, Namespace

View File

@@ -28,7 +28,12 @@ log = logging.getLogger(__name__)
def run(
args, *, env=None, logs_errors_to_stdout: bool = False, **kwargs
args,
*,
env=None,
logs_errors_to_stdout: bool = False,
check: bool = False,
**kwargs,
) -> CompletedProcess:
"""Wrapper around :py:func:`subprocess.run`
@@ -50,7 +55,7 @@ def run(
stderr = None
stderr_name = 'stderr' if not logs_errors_to_stdout else 'stdout'
try:
proc = subprocess_run(args, env=env, **kwargs)
proc = subprocess_run(args, env=env, check=check, **kwargs)
except CalledProcessError as e:
stderr = getattr(e, stderr_name, None)
raise
@@ -111,6 +116,7 @@ def _fix_process_args(args, env, kwargs):
program = str(args[0])
if os.name == 'nt':
# pylint: disable=import-outside-toplevel
from ocrmypdf.subprocess._windows import fix_windows_args
args = fix_windows_args(program, args, env)
@@ -171,42 +177,42 @@ def get_version(
return version
missing_program = '''
MISSING_PROGRAM = '''
The program '{program}' could not be executed or was not found on your
system PATH.
'''
missing_optional_program = '''
MISSING_OPTIONAL_PROGRAM = '''
The program '{program}' could not be executed or was not found on your
system PATH. This program is required when you use the
{required_for} arguments. You could try omitting these arguments, or install
the package.
'''
missing_recommend_program = '''
MISSING_RECOMMEND_PROGRAM = '''
The program '{program}' could not be executed or was not found on your
system PATH. This program is recommended when using the {required_for} arguments,
but not required, so we will proceed. For best results, install the program.
'''
old_version = '''
OLD_VERSION = '''
OCRmyPDF requires '{program}' {need_version} or higher. Your system appears
to have {found_version}. Please update this program.
'''
old_version_required_for = '''
OLD_VERSION_REQUIRED_FOR = '''
OCRmyPDF requires '{program}' {need_version} or higher when run with the
{required_for} arguments. If you omit these arguments, OCRmyPDF may be able to
proceed. For best results, install the program.
'''
osx_install_advice = '''
OSX_INSTALL_ADVICE = '''
If you have homebrew installed, try these command to install the missing
package:
brew install {package}
'''
linux_install_advice = '''
LINUX_INSTALL_ADVICE = '''
On systems with the aptitude package manager (Debian, Ubuntu), try these
commands:
sudo apt-get update
@@ -216,7 +222,7 @@ On RPM-based systems (Red Hat, Fedora), search for instructions on
installing the RPM for {program}.
'''
windows_install_advice = '''
WINDOWS_INSTALL_ADVICE = '''
If not already installed, install the Chocolatey package manager. Then use
a command prompt to install the missing package:
choco install {package}
@@ -234,32 +240,35 @@ def _get_platform():
def _error_trailer(program, package, **kwargs):
    """Log platform-specific advice for installing a missing program.

    Args:
        program: Name of the external program that could not be used.
        package: Package name to suggest, or a mapping from platform name to
            package name. When a mapping has no entry for the current
            platform, ``program`` is used as the package name.
        **kwargs: Ignored. Accepted so callers may forward extra context.

    Nothing is logged when the platform reported by ``_get_platform()`` is
    not one of 'darwin', 'linux' or 'windows'.
    """
    del kwargs  # unused; accepted only for the callers' convenience

    # NOTE(review): assumes _get_platform() returns the same value on every
    # call, so it is queried once here -- confirm.
    platform = _get_platform()
    if isinstance(package, Mapping):
        package = package.get(platform, program)

    advice_by_platform = {
        'darwin': OSX_INSTALL_ADVICE,
        'linux': LINUX_INSTALL_ADVICE,
        'windows': WINDOWS_INSTALL_ADVICE,
    }
    advice = advice_by_platform.get(platform)
    if advice is not None:
        # Explicit keywords instead of **locals(): the templates only ever
        # reference {program} and {package}.
        log.info(advice.format(program=program, package=package))
def _error_missing_program(program, package, required_for, recommended):
    """Log that an external program is missing, then log install advice.

    Args:
        program: Name of the external program that could not be executed.
        package: Package name (or platform-to-package mapping) passed on to
            ``_error_trailer`` for the install advice.
        required_for: Description of the arguments that need the program;
            falsy when the program is needed unconditionally.
        recommended: When truthy, the program is optional and the problem is
            logged as a warning instead of an error.
    """
    if recommended:
        log.warning(
            MISSING_RECOMMEND_PROGRAM.format(
                program=program, required_for=required_for
            )
        )
    elif required_for:
        log.error(
            MISSING_OPTIONAL_PROGRAM.format(
                program=program, required_for=required_for
            )
        )
    else:
        log.error(MISSING_PROGRAM.format(program=program))
    _error_trailer(program=program, package=package)
def _error_old_version(program, package, need_version, found_version, required_for):
    """Log that an external program is too old, then log install advice.

    Args:
        program: Name of the external program.
        package: Package name (or platform-to-package mapping) passed on to
            ``_error_trailer`` for the install advice.
        need_version: Minimum version that is required.
        found_version: Version that was detected on the system.
        required_for: Description of the arguments that need this version;
            falsy when the version is needed unconditionally.
    """
    if required_for:
        message = OLD_VERSION_REQUIRED_FOR.format(
            program=program,
            need_version=need_version,
            required_for=required_for,
        )
    else:
        message = OLD_VERSION.format(
            program=program,
            need_version=need_version,
            found_version=found_version,
        )
    log.error(message)
    _error_trailer(program=program, package=package)
@@ -294,10 +303,15 @@ def check_external_program(
found_version = version_checker()
else: # deprecated
found_version = version_checker
except (CalledProcessError, FileNotFoundError, MissingDependencyError):
except (CalledProcessError, FileNotFoundError) as e:
_error_missing_program(program, package, required_for, recommended)
if not recommended:
raise MissingDependencyError(program)
raise MissingDependencyError(program) from e
return
except MissingDependencyError:
_error_missing_program(program, package, required_for, recommended)
if not recommended:
raise
return
def remove_leading_v(s):

View File

@@ -7,6 +7,8 @@
# type: ignore
# Non-Windows mypy now breaks when trying to typecheck winreg
"""Find Tesseract and Ghostscript binaries on Windows using the registry."""
import logging
import os
import shutil
@@ -17,9 +19,9 @@ from typing import Any, Callable, Iterable, Iterator, Set, Tuple, TypeVar
try:
import winreg
except ModuleNotFoundError as e:
raise ModuleNotFoundError("This module is for Windows only") from e
except ModuleNotFoundError as _notfound_ex:
raise ModuleNotFoundError("This module is for Windows only") from _notfound_ex
del _notfound_ex
log = logging.getLogger(__name__)
@@ -40,15 +42,15 @@ def ghostscript_version_key(s: str) -> Tuple[int, int, int]:
def registry_enum(
    key: winreg.HKEYType,
    enum_fn: Callable[[winreg.HKEYType, int], T],
    *,
    limit: int = 999,
) -> Iterator[T]:
    """Yield ``enum_fn(key, n)`` for n = 0, 1, 2, ... until it raises OSError.

    Args:
        key: Open registry key handed to ``enum_fn`` unchanged.
        enum_fn: Callable taking ``(key, index)`` and returning the item at
            that index; it signals the end of the sequence with ``OSError``.
        limit: Safety cap on the number of items enumerated.

    Yields:
        Each item returned by ``enum_fn``, in index order.

    Raises:
        ValueError: If ``limit`` items were produced without ``enum_fn``
            signalling the end of the sequence.
    """
    for index in range(limit):
        # Only the enumeration call is guarded, so the OSError handler
        # cannot swallow unrelated errors.
        try:
            item = enum_fn(key, index)
        except OSError:
            return
        yield item
    raise ValueError(f"Too many registry keys under {key}")
@@ -61,6 +63,7 @@ def registry_values(key: winreg.HKEYType) -> Iterator[Tuple[str, Any, int]]:
def registry_path_ghostscript(env=None) -> Iterator[Path]:
del env # unused (but needed for protocol)
try:
with winreg.OpenKey(
winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Artifex\GPL Ghostscript"
@@ -78,6 +81,7 @@ def registry_path_ghostscript(env=None) -> Iterator[Path]:
def registry_path_tesseract(env=None) -> Iterator[Path]:
del env # unused (but needed for protocol)
try:
with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Tesseract-OCR") as k:
for subkey, val, _valtype in registry_values(k):

View File

@@ -8,7 +8,6 @@
import datetime
import warnings
from datetime import timezone
from os import fspath
from shutil import copyfile
import pikepdf
@@ -326,6 +325,9 @@ def test_metadata_fixup_warning(resources, outdir, caplog):
assert any(record.levelname == 'WARNING' for record in caplog.records)
XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
def test_prevent_gs_invalid_xml(resources, outdir):
generate_pdfa_ps(outdir / 'pdfa.ps')
copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')
@@ -352,7 +354,7 @@ def test_prevent_gs_invalid_xml(resources, outdir):
contents = (outdir / 'pdfa.pdf').read_bytes()
# Since the XML may be invalid, we scan instead of actually feeding it
# to a parser.
XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
xmp_start = contents.find(XMP_MAGIC)
xmp_end = contents.rfind(b'<?xpacket end', xmp_start)
assert 0 < xmp_start < xmp_end