mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-04 04:35:49 -04:00
Configure pylint in pyproject and delint
This commit is contained in:
@@ -99,3 +99,8 @@ module = [
|
||||
'libxmp.utils'
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[tool.pylint.basic]
|
||||
good-names = ["i", "j", "k", "ex", "Run", "_", "e", "p", "im", "w", "h", "m", "x", "y", "a", "b", "fp", "n", "f", "s", "v", "q", "dx", "dy"]
|
||||
logging-format-style = "old"
|
||||
disable = ["raw-checker-failed", "bad-inline-option", "locally-disabled", "file-ignored", "suppressed-message", "useless-suppression", "deprecated-pragma", "use-symbolic-message-instead", "logging-fstring-interpolation", "missing-function-docstring", "too-few-public-methods"]
|
||||
|
||||
1
setup.py
1
setup.py
@@ -4,6 +4,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""setup.py to support older setuptools and pip."""
|
||||
|
||||
from setuptools import setup
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""Adds OCR layer to PDFs."""
|
||||
|
||||
from pluggy import HookimplMarker as _HookimplMarker
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""ocrmypdf command line entrypoint."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""OCRmyPDF concurrency abstractions."""
|
||||
|
||||
import threading
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Callable, Iterable, Optional
|
||||
@@ -14,6 +16,8 @@ def _task_noop(*_args, **_kwargs):
|
||||
|
||||
|
||||
class NullProgressBar:
|
||||
"""Progress bar API that takes no actions."""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
pass
|
||||
|
||||
@@ -28,6 +32,8 @@ class NullProgressBar:
|
||||
|
||||
|
||||
class Executor(ABC):
|
||||
"""Abstract concurrent executor."""
|
||||
|
||||
pool_lock = threading.Lock()
|
||||
pbar_class = NullProgressBar
|
||||
|
||||
|
||||
@@ -14,13 +14,12 @@ import sys
|
||||
from io import BytesIO
|
||||
from os import fspath
|
||||
from pathlib import Path
|
||||
from shutil import which
|
||||
from subprocess import PIPE, CalledProcessError
|
||||
from typing import Optional
|
||||
|
||||
from PIL import Image, UnidentifiedImageError
|
||||
|
||||
from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError
|
||||
from ocrmypdf.exceptions import SubprocessOutputError
|
||||
from ocrmypdf.helpers import Resolution
|
||||
from ocrmypdf.subprocess import get_version, run, run_polling_stderr
|
||||
|
||||
@@ -33,29 +32,18 @@ except AttributeError:
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
missing_gs_error = """
|
||||
---------------------------------------------------------------------
|
||||
This error normally occurs when ocrmypdf find can't Ghostscript.
|
||||
Please ensure Ghostscript is installed and its location is added to
|
||||
the system PATH environment variable.
|
||||
|
||||
For details see:
|
||||
https://ocrmypdf.readthedocs.io/en/latest/installation.html
|
||||
---------------------------------------------------------------------
|
||||
"""
|
||||
|
||||
# Most reliable way to get the bitness of Python interpreter, according to Python docs
|
||||
_is_64bit = sys.maxsize > 2**32
|
||||
_IS_64BIT = sys.maxsize > 2**32
|
||||
|
||||
_gswin = None
|
||||
_GSWIN = None
|
||||
if os.name == 'nt':
|
||||
if _is_64bit:
|
||||
_gswin = 'gswin64c'
|
||||
if _IS_64BIT:
|
||||
_GSWIN = 'gswin64c'
|
||||
else:
|
||||
_gswin = 'gswin32c'
|
||||
_GSWIN = 'gswin32c'
|
||||
|
||||
GS = _gswin if _gswin else 'gs'
|
||||
del _gswin
|
||||
GS = _GSWIN if _GSWIN else 'gs'
|
||||
del _GSWIN
|
||||
|
||||
|
||||
def version():
|
||||
@@ -126,7 +114,7 @@ def rasterize_pdf(
|
||||
p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
|
||||
except CalledProcessError as e:
|
||||
log.error(e.stderr.decode(errors='replace'))
|
||||
raise SubprocessOutputError('Ghostscript rasterizing failed')
|
||||
raise SubprocessOutputError('Ghostscript rasterizing failed') from e
|
||||
else:
|
||||
stderr = p.stderr.decode(errors='replace')
|
||||
if _gs_error_reported(stderr):
|
||||
@@ -156,6 +144,8 @@ def rasterize_pdf(
|
||||
|
||||
|
||||
class GhostscriptFollower:
|
||||
"""Parses the output of Ghostscript and uses it to update the progress bar."""
|
||||
|
||||
re_process = re.compile(r"Processing pages \d+ through (\d+).")
|
||||
re_page = re.compile(r"Page (\d+)")
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ from math import pi
|
||||
from os import fspath
|
||||
from pathlib import Path
|
||||
from subprocess import PIPE, STDOUT, CalledProcessError, TimeoutExpired
|
||||
from typing import Dict, Iterator, List, Optional
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from packaging.version import Version
|
||||
from PIL import Image
|
||||
@@ -55,6 +55,8 @@ TESSERACT_THRESHOLDING_METHODS: Dict[str, int] = {
|
||||
|
||||
|
||||
class TesseractLoggerAdapter(logging.LoggerAdapter):
|
||||
"Prepend [tesseract] to messages emitted from tesseract"
|
||||
|
||||
def process(self, msg, kwargs):
|
||||
kwargs['extra'] = self.extra
|
||||
return f'[tesseract] {msg}', kwargs
|
||||
@@ -105,6 +107,7 @@ TESSERACT_VERSION_PATTERN = r"""
|
||||
|
||||
|
||||
class TesseractVersion(Version):
|
||||
"Modify standard packaging.Version regex to support Tesseract idiosyncracies."
|
||||
_regex = re.compile(
|
||||
r"^\s*" + TESSERACT_VERSION_PATTERN + r"\s*$", re.VERBOSE | re.IGNORECASE
|
||||
)
|
||||
@@ -169,14 +172,14 @@ def tess_base_args(langs: List[str], engine_mode: Optional[int]) -> List[str]:
|
||||
|
||||
|
||||
def _parse_tesseract_output(binary_output: bytes) -> Dict[str, str]:
|
||||
def g():
|
||||
def gen():
|
||||
for line in binary_output.decode().splitlines():
|
||||
line = line.strip()
|
||||
parts = line.split(':', maxsplit=2)
|
||||
if len(parts) == 2:
|
||||
yield parts[0].strip(), parts[1].strip()
|
||||
|
||||
return {k: v for k, v in g()}
|
||||
return dict(gen())
|
||||
|
||||
|
||||
def get_orientation(
|
||||
@@ -205,10 +208,10 @@ def get_orientation(
|
||||
|
||||
osd = _parse_tesseract_output(p.stdout)
|
||||
angle = int(osd.get('Orientation in degrees', 0))
|
||||
oc = OrientationConfidence(
|
||||
orient_conf = OrientationConfidence(
|
||||
angle=angle, confidence=float(osd.get('Orientation confidence', 0))
|
||||
)
|
||||
return oc
|
||||
return orient_conf
|
||||
|
||||
|
||||
def get_deskew(
|
||||
|
||||
@@ -30,12 +30,16 @@ if sys.version_info >= (3, 10):
|
||||
else:
|
||||
from tempfile import TemporaryDirectory as _TemporaryDirectory
|
||||
|
||||
# Consume the ignore_cleanup_errors kwarg in Python 3.9 and older, without acting
|
||||
# on this keyword. Users who need this issue fully resolved should upgrade to Python
|
||||
# 3.10.
|
||||
# See: https://github.com/python/cpython/pull/24793
|
||||
|
||||
class TemporaryDirectory(_TemporaryDirectory):
|
||||
"""Shim to consume ignore_cleanup_errors kwarg on Python 3.9 and older.
|
||||
|
||||
The argument is consumed without action. If users are getting errors related
|
||||
to temporary file cleanup, they should upgrade to Python 3.10 which properly
|
||||
cleans up temporary directories on Windows.
|
||||
|
||||
See: https://github.com/python/cpython/pull/24793
|
||||
"""
|
||||
|
||||
def __init__(self, ignore_cleanup_errors=False, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -50,6 +54,8 @@ log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class UnpaperImageTooLargeError(Exception):
|
||||
"""To capture details when an image is too large for unpaper."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
w,
|
||||
@@ -66,8 +72,10 @@ def version() -> str:
|
||||
return get_version('unpaper')
|
||||
|
||||
|
||||
SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
|
||||
|
||||
|
||||
def _convert_image(im: Image.Image) -> Tuple[Image.Image, bool, str]:
|
||||
SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
|
||||
im_modified = False
|
||||
|
||||
if im.mode not in SUFFIXES:
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""For grafting text-only PDF pages onto freeform PDF pages."""
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
@@ -16,7 +17,6 @@ from pikepdf import (
|
||||
Name,
|
||||
Object,
|
||||
Operator,
|
||||
Page,
|
||||
Pdf,
|
||||
PdfError,
|
||||
PdfMatrix,
|
||||
@@ -81,6 +81,8 @@ def strip_invisible_text(pdf, page):
|
||||
|
||||
|
||||
class OcrGrafter:
|
||||
"""Manages grafting text-only PDFs onto regular PDFs."""
|
||||
|
||||
def __init__(self, context):
|
||||
self.context = context
|
||||
self.path_base = context.origin
|
||||
@@ -236,6 +238,8 @@ class OcrGrafter:
|
||||
):
|
||||
"""Insert the text layer from text page 0 on to pdf_base at page_num"""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
|
||||
log.debug("Grafting")
|
||||
if Path(textpdf).stat().st_size == 0:
|
||||
return
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""Defines context objects that are passed to child processes/threads."""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
|
||||
@@ -4,15 +4,17 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""Logging support classes."""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from contextlib import suppress
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
class PageNumberFilter(logging.Filter):
|
||||
"""Insert PDF page number that emitted log message to log record."""
|
||||
|
||||
def filter(self, record):
|
||||
pageno = getattr(record, 'pageno', None)
|
||||
if isinstance(pageno, int):
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""OCRmyPDF page processing pipeline functions."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
@@ -332,7 +333,8 @@ def is_ocr_required(page_context: PageContext):
|
||||
ocr_required = False
|
||||
log.warning(
|
||||
"page too big, skipping OCR "
|
||||
f"({(pixel_count / 1_000_000):.1f} MPixels > {options.skip_big:.1f} MPixels --skip-big)"
|
||||
f"({(pixel_count / 1_000_000):.1f} MPixels > "
|
||||
f"{options.skip_big:.1f} MPixels --skip-big)"
|
||||
)
|
||||
return ocr_required
|
||||
|
||||
@@ -430,8 +432,8 @@ def rasterize(
|
||||
output_file = page_context.get_path(f'rasterize{output_tag}.png')
|
||||
pageinfo = page_context.pageinfo
|
||||
|
||||
def at_least(cs):
|
||||
return max(device_idx, colorspaces.index(cs))
|
||||
def at_least(colorspace):
|
||||
return max(device_idx, colorspaces.index(colorspace))
|
||||
|
||||
for image in pageinfo.images:
|
||||
if image.type_ != 'image':
|
||||
@@ -471,10 +473,10 @@ def rasterize(
|
||||
|
||||
def preprocess_remove_background(input_file: Path, page_context: PageContext):
|
||||
if any(image.bpc > 1 for image in page_context.pageinfo.images):
|
||||
output_file = page_context.get_path('pp_rm_bg.png')
|
||||
# leptonica.remove_background(input_file, output_file)
|
||||
raise NotImplementedError("--remove-background is temporarily not implemented")
|
||||
return output_file
|
||||
# output_file = page_context.get_path('pp_rm_bg.png')
|
||||
# leptonica.remove_background(input_file, output_file)
|
||||
# return output_file
|
||||
else:
|
||||
log.info("background removal skipped on mono page")
|
||||
return input_file
|
||||
@@ -858,8 +860,8 @@ def enumerate_compress_ranges(iterable):
|
||||
def merge_sidecars(txt_files: Iterable[Optional[Path]], context: PdfContext):
|
||||
output_file = context.get_path('sidecar.txt')
|
||||
with open(output_file, 'w', encoding="utf-8") as stream:
|
||||
for (frm, to), txt_file in enumerate_compress_ranges(txt_files):
|
||||
if frm != 1:
|
||||
for (from_, to_), txt_file in enumerate_compress_ranges(txt_files):
|
||||
if from_ != 1:
|
||||
stream.write('\f') # Form feed between pages
|
||||
if txt_file:
|
||||
with open(txt_file, encoding="utf-8") as in_:
|
||||
@@ -872,10 +874,10 @@ def merge_sidecars(txt_files: Iterable[Optional[Path]], context: PdfContext):
|
||||
else:
|
||||
stream.write(txt)
|
||||
else:
|
||||
if frm != to:
|
||||
pages = f'{frm}-{to}'
|
||||
if from_ != to_:
|
||||
pages = f'{from_}-{to_}'
|
||||
else:
|
||||
pages = f'{frm}'
|
||||
pages = f'{from_}'
|
||||
stream.write(f'[OCR skipped on page(s) {pages}]')
|
||||
return output_file
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""Plugin manager using pluggy."""
|
||||
|
||||
import argparse
|
||||
import importlib
|
||||
@@ -101,12 +102,11 @@ class OcrmypdfPluginManager(pluggy.PluginManager):
|
||||
|
||||
|
||||
def get_plugin_manager(plugins: List[Union[str, Path]], builtins=True):
|
||||
pm = OcrmypdfPluginManager(
|
||||
return OcrmypdfPluginManager(
|
||||
project_name='ocrmypdf',
|
||||
plugins=plugins,
|
||||
builtins=builtins,
|
||||
)
|
||||
return pm
|
||||
|
||||
|
||||
def get_parser_options_plugins(
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""Implements the concurrent and page synchronous parts of the pipeline."""
|
||||
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
@@ -68,7 +70,9 @@ from ocrmypdf.pdfa import file_claims_pdfa
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PageResult(NamedTuple): # pylint: disable=inherit-non-class
|
||||
class PageResult(NamedTuple):
|
||||
"""Result when a page is finished processing."""
|
||||
|
||||
pageno: int
|
||||
pdf_page_from_image: Optional[Path]
|
||||
ocr: Optional[Path]
|
||||
@@ -425,7 +429,7 @@ def run_pipeline(
|
||||
else:
|
||||
log.error(type(e).__name__)
|
||||
return e.exit_code
|
||||
except (PIL.Image.DecompressionBombError if not api else NeverRaise) as e:
|
||||
except (PIL.Image.DecompressionBombError if not api else NeverRaise):
|
||||
log.exception(
|
||||
"A decompression bomb error was encountered while executing the "
|
||||
"pipeline. Use the argument --max-image-mpixels to raise the maximum "
|
||||
@@ -435,7 +439,7 @@ def run_pipeline(
|
||||
except (
|
||||
BrokenProcessPool if not api else NeverRaise,
|
||||
BrokenThreadPool if not api else NeverRaise,
|
||||
) as e:
|
||||
):
|
||||
log.exception(
|
||||
"A worker process was terminated unexpectedly. This is known to occur if "
|
||||
"processing your file takes all available swap space and RAM. It may "
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""Validate a work order from API or command line."""
|
||||
|
||||
import locale
|
||||
import logging
|
||||
@@ -25,7 +26,7 @@ from ocrmypdf.exceptions import (
|
||||
MissingDependencyError,
|
||||
OutputFileAccessError,
|
||||
)
|
||||
from ocrmypdf.helpers import is_file_writable, monotonic, safe_symlink, samefile
|
||||
from ocrmypdf.helpers import is_file_writable, monotonic, safe_symlink
|
||||
from ocrmypdf.hocrtransform import HOCR_OK_LANGS
|
||||
from ocrmypdf.subprocess import check_external_program
|
||||
|
||||
@@ -146,13 +147,13 @@ def check_options_preprocessing(options):
|
||||
def _pages_from_ranges(ranges: str) -> Set[int]:
|
||||
pages: List[int] = []
|
||||
page_groups = ranges.replace(' ', '').split(',')
|
||||
for g in page_groups:
|
||||
if not g:
|
||||
for group in page_groups:
|
||||
if not group:
|
||||
continue
|
||||
try:
|
||||
start, end = g.split('-')
|
||||
start, end = group.split('-')
|
||||
except ValueError:
|
||||
pages.append(int(g) - 1)
|
||||
pages.append(int(group) - 1)
|
||||
else:
|
||||
try:
|
||||
new_pages = list(range(int(start) - 1, int(end)))
|
||||
@@ -162,7 +163,7 @@ def _pages_from_ranges(ranges: str) -> Set[int]:
|
||||
) from None
|
||||
pages.extend(new_pages)
|
||||
except ValueError:
|
||||
raise BadArgsError(f"invalid page subrange '{g}'") from None
|
||||
raise BadArgsError(f"invalid page subrange '{group}'") from None
|
||||
|
||||
if not pages:
|
||||
raise BadArgsError(
|
||||
@@ -237,13 +238,13 @@ def check_options_advanced(options):
|
||||
def check_options_metadata(options):
|
||||
docinfo = [options.title, options.author, options.keywords, options.subject]
|
||||
for s in (m for m in docinfo if m):
|
||||
for c in s:
|
||||
if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000:
|
||||
for char in s:
|
||||
if unicodedata.category(char) == 'Co' or ord(char) >= 0x10000:
|
||||
hexchar = hex(ord(char))[2:].upper()
|
||||
raise ValueError(
|
||||
"One of the metadata strings contains "
|
||||
"an unsupported Unicode character: '{}' (U+{})".format(
|
||||
c, hex(ord(c))[2:].upper()
|
||||
)
|
||||
"an unsupported Unicode character: "
|
||||
f"{char} (U+{hexchar})"
|
||||
)
|
||||
|
||||
|
||||
@@ -293,7 +294,7 @@ def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
|
||||
target = work_folder / 'origin'
|
||||
safe_symlink(options.input_file, target)
|
||||
return target, os.fspath(options.input_file)
|
||||
except FileNotFoundError:
|
||||
except FileNotFoundError as e:
|
||||
msg = f"File not found - {options.input_file}"
|
||||
if Path('/.dockerenv').exists(): # pragma: no cover
|
||||
msg += (
|
||||
@@ -304,7 +305,7 @@ def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
|
||||
"\n"
|
||||
"\tdocker run -i --rm jbarlow83/ocrmypdf - - <input.pdf >output.pdf\n"
|
||||
)
|
||||
raise InputFileError(msg)
|
||||
raise InputFileError(msg) from e
|
||||
|
||||
|
||||
def check_requested_output_file(options):
|
||||
@@ -324,7 +325,9 @@ def check_requested_output_file(options):
|
||||
)
|
||||
|
||||
|
||||
def report_output_file_size(options, input_file, output_file):
|
||||
def report_output_file_size(
|
||||
options, input_file, output_file, file_overhead=4000, page_overhead=3000
|
||||
):
|
||||
try:
|
||||
output_size = Path(output_file).stat().st_size
|
||||
input_size = Path(input_file).stat().st_size
|
||||
@@ -333,9 +336,7 @@ def report_output_file_size(options, input_file, output_file):
|
||||
with pikepdf.open(output_file) as p:
|
||||
# Overhead constants obtained by estimating amount of data added by OCR
|
||||
# PDF/A conversion, and possible XMP metadata addition, with compression
|
||||
FILE_OVERHEAD = 4000
|
||||
OCR_PER_PAGE_OVERHEAD = 3000
|
||||
reasonable_overhead = FILE_OVERHEAD + OCR_PER_PAGE_OVERHEAD * len(p.pages)
|
||||
reasonable_overhead = file_overhead + page_overhead * len(p.pages)
|
||||
ratio = output_size / input_size
|
||||
reasonable_ratio = output_size / (input_size + reasonable_overhead)
|
||||
if reasonable_ratio < 1.35 or input_size < 25000:
|
||||
|
||||
@@ -4,6 +4,10 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""Get version by introspecting package information.
|
||||
|
||||
OCRmyPDF uses setuptools_scm to derive version from git tags.
|
||||
"""
|
||||
|
||||
try:
|
||||
from importlib_metadata import version as _package_version
|
||||
|
||||
@@ -11,6 +11,8 @@
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
"""OCRmyPDF's multiprocessing/multithreading abstraction layer."""
|
||||
|
||||
import logging
|
||||
import logging.handlers
|
||||
import multiprocessing
|
||||
@@ -21,7 +23,6 @@ import sys
|
||||
import threading
|
||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
|
||||
from contextlib import suppress
|
||||
from multiprocessing.pool import Pool, ThreadPool
|
||||
from typing import Callable, Iterable, Type, Union
|
||||
|
||||
from tqdm import tqdm
|
||||
@@ -44,7 +45,8 @@ def log_listener(q: Queue):
|
||||
should actually write to sys.stderr or whatever we're using, so if this is
|
||||
made into a process the main application needs to be directed to it.
|
||||
|
||||
See https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
|
||||
See:
|
||||
https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
|
||||
"""
|
||||
|
||||
while True:
|
||||
@@ -89,6 +91,8 @@ def process_init(q: Queue, user_init: UserInit, loglevel) -> None:
|
||||
|
||||
|
||||
def thread_init(q: Queue, user_init: UserInit, loglevel) -> None:
|
||||
del q # unused but required argument
|
||||
del loglevel # unused but required argument
|
||||
# As a thread, block SIGBUS so the main thread deals with it...
|
||||
with suppress(AttributeError):
|
||||
signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGBUS})
|
||||
@@ -98,6 +102,8 @@ def thread_init(q: Queue, user_init: UserInit, loglevel) -> None:
|
||||
|
||||
|
||||
class StandardExecutor(Executor):
|
||||
"""Standard OCRmyPDF concurrent task executor."""
|
||||
|
||||
def _execute(
|
||||
self,
|
||||
*,
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""OCRmyPDF automatically installs these filters as plugins."""
|
||||
|
||||
from ocrmypdf import hookimpl
|
||||
|
||||
|
||||
|
||||
@@ -5,6 +5,8 @@
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
"""Built-in plugin to implement PDF page rasterization and PDF/A production."""
|
||||
|
||||
import logging
|
||||
|
||||
from ocrmypdf import hookimpl
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""Built-in plugin to implement OCR using Tesseract."""
|
||||
|
||||
|
||||
import logging
|
||||
import os
|
||||
@@ -138,6 +140,8 @@ def validate(pdfinfo, options):
|
||||
|
||||
|
||||
class TesseractOcrEngine(OcrEngine):
|
||||
"""Implements OCR with Tesseract."""
|
||||
|
||||
@staticmethod
|
||||
def version():
|
||||
return tesseract.version()
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""Command line interface customization and validation."""
|
||||
|
||||
import argparse
|
||||
from typing import Any, Callable, Mapping, Optional, TypeVar
|
||||
@@ -42,7 +43,7 @@ def str_to_int(mapping: Mapping[str, int]):
|
||||
except KeyError:
|
||||
raise argparse.ArgumentTypeError(
|
||||
f"{s!r} must be one of: {', '.join(mapping.keys())}"
|
||||
)
|
||||
) from None
|
||||
|
||||
return _str_to_int
|
||||
|
||||
@@ -51,6 +52,11 @@ class ArgumentParser(argparse.ArgumentParser):
|
||||
"""Override parser's default behavior of calling sys.exit()
|
||||
|
||||
https://stackoverflow.com/questions/5943249/python-argparse-and-controlling-overriding-the-exit-status-code
|
||||
|
||||
OCRmyPDF began as a CLI but eventually acquired an API. The API works inside out,
|
||||
by synthesizing a command line argument. So we subclass the standard parser with
|
||||
one that doesn't call sys.exit(). Obviously this is not the ideal way to do things
|
||||
but it works for us.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
@@ -65,6 +71,8 @@ class ArgumentParser(argparse.ArgumentParser):
|
||||
|
||||
|
||||
class LanguageSetAction(argparse.Action):
|
||||
"""Manages a list of languages."""
|
||||
|
||||
def __init__(self, option_strings, dest, default=None, **kwargs):
|
||||
if default is None:
|
||||
default = set()
|
||||
|
||||
@@ -4,12 +4,16 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""OCRmyPDF's exceptions."""
|
||||
|
||||
from enum import IntEnum
|
||||
from textwrap import dedent
|
||||
|
||||
|
||||
class ExitCode(IntEnum):
|
||||
"""OCRmyPDF's exit codes."""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
ok = 0
|
||||
bad_args = 1
|
||||
input_file = 2
|
||||
@@ -26,6 +30,8 @@ class ExitCode(IntEnum):
|
||||
|
||||
|
||||
class ExitCodeException(Exception):
|
||||
"""An exception which should return an exit code with sys.exit()."""
|
||||
|
||||
exit_code = ExitCode.other_error
|
||||
message = ""
|
||||
|
||||
@@ -37,17 +43,24 @@ class ExitCodeException(Exception):
|
||||
|
||||
|
||||
class BadArgsError(ExitCodeException):
|
||||
"""Invalid arguments on the command line or API."""
|
||||
|
||||
exit_code = ExitCode.bad_args
|
||||
|
||||
|
||||
class PdfMergeFailedError(ExitCodeException):
|
||||
class PdfMergeFailedError(ExitCodeException): # deprecated
|
||||
"""An intermediate PDF can't be merged.
|
||||
|
||||
No longer in use.
|
||||
"""
|
||||
|
||||
exit_code = ExitCode.input_file
|
||||
message = dedent(
|
||||
'''\
|
||||
Failed to merge PDF image layer with OCR layer
|
||||
|
||||
Usually this happens because the input PDF file is malformed and
|
||||
ocrmypdf cannot automatically correct the problem on its own.
|
||||
ocrmypdf cannot correct the problem on its own.
|
||||
|
||||
Try using
|
||||
ocrmypdf --pdf-renderer sandwich [..other args..]
|
||||
@@ -56,34 +69,50 @@ class PdfMergeFailedError(ExitCodeException):
|
||||
|
||||
|
||||
class MissingDependencyError(ExitCodeException):
|
||||
"""A third-party dependency is missing."""
|
||||
|
||||
exit_code = ExitCode.missing_dependency
|
||||
|
||||
|
||||
class UnsupportedImageFormatError(ExitCodeException):
|
||||
"""The image format is not supported."""
|
||||
|
||||
exit_code = ExitCode.input_file
|
||||
|
||||
|
||||
class DpiError(ExitCodeException):
|
||||
"""Missing information about input image DPI."""
|
||||
|
||||
exit_code = ExitCode.input_file
|
||||
|
||||
|
||||
class OutputFileAccessError(ExitCodeException):
|
||||
"""Cannot access the intended output file path."""
|
||||
|
||||
exit_code = ExitCode.file_access_error
|
||||
|
||||
|
||||
class PriorOcrFoundError(ExitCodeException):
|
||||
"""This file already has OCR."""
|
||||
|
||||
exit_code = ExitCode.already_done_ocr
|
||||
|
||||
|
||||
class InputFileError(ExitCodeException):
|
||||
"""Something is wrong with the input file."""
|
||||
|
||||
exit_code = ExitCode.input_file
|
||||
|
||||
|
||||
class SubprocessOutputError(ExitCodeException):
|
||||
"""A subprocess returned an unexpected error."""
|
||||
|
||||
exit_code = ExitCode.child_process_error
|
||||
|
||||
|
||||
class EncryptedPdfError(ExitCodeException):
|
||||
"""Input PDF is encrypted."""
|
||||
|
||||
exit_code = ExitCode.encrypted_pdf
|
||||
message = dedent(
|
||||
'''\
|
||||
@@ -100,5 +129,7 @@ class EncryptedPdfError(ExitCodeException):
|
||||
|
||||
|
||||
class TesseractConfigError(ExitCodeException):
|
||||
"""Tesseract config can't be parsed."""
|
||||
|
||||
exit_code = ExitCode.invalid_config
|
||||
message = "Error occurred while parsing a Tesseract configuration file"
|
||||
|
||||
@@ -37,9 +37,11 @@ from ocrmypdf.helpers import remove_all_log_handlers
|
||||
|
||||
|
||||
class MessageType(Enum):
|
||||
exception = auto()
|
||||
result = auto()
|
||||
complete = auto()
|
||||
"""Implement basic IPC messaging."""
|
||||
|
||||
exception = auto() # pylint: disable=invalid-name
|
||||
result = auto() # pylint: disable=invalid-name
|
||||
complete = auto() # pylint: disable=invalid-name
|
||||
|
||||
|
||||
def split_every(n: int, iterable: Iterable) -> Iterator:
|
||||
@@ -59,6 +61,8 @@ def process_sigbus(*args):
|
||||
|
||||
|
||||
class ConnectionLogHandler(logging.handlers.QueueHandler):
|
||||
"""Handler used by child processes to forward log messages to parent."""
|
||||
|
||||
def __init__(self, conn: Connection) -> None:
|
||||
# sets the parent's queue to None - parent only touches queue
|
||||
# in enqueue() which we override
|
||||
@@ -91,7 +95,7 @@ def process_loop(
|
||||
for args in task_args:
|
||||
try:
|
||||
result = task(args)
|
||||
except Exception as e:
|
||||
except Exception as e: # pylint: disable=broad-except
|
||||
conn.send((MessageType.exception, e))
|
||||
break
|
||||
else:
|
||||
@@ -103,6 +107,8 @@ def process_loop(
|
||||
|
||||
|
||||
class LambdaExecutor(Executor):
|
||||
"""Executor for AWS Lambda or similar environments that lack semaphores."""
|
||||
|
||||
def _execute(
|
||||
self,
|
||||
*,
|
||||
@@ -153,13 +159,13 @@ class LambdaExecutor(Executor):
|
||||
|
||||
with self.pbar_class(**tqdm_kwargs) as pbar:
|
||||
while connections:
|
||||
for r in wait(connections):
|
||||
if not isinstance(r, Connection):
|
||||
for result in wait(connections):
|
||||
if not isinstance(result, Connection):
|
||||
raise NotImplementedError("We only support Connection()")
|
||||
try:
|
||||
msg_type, msg = r.recv()
|
||||
msg_type, msg = result.recv()
|
||||
except EOFError:
|
||||
connections.remove(r)
|
||||
connections.remove(result)
|
||||
continue
|
||||
|
||||
if msg_type == MessageType.result:
|
||||
@@ -170,7 +176,7 @@ class LambdaExecutor(Executor):
|
||||
logger = logging.getLogger(record.name)
|
||||
logger.handle(record)
|
||||
elif msg_type == MessageType.complete:
|
||||
connections.remove(r)
|
||||
connections.remove(result)
|
||||
elif msg_type == MessageType.exception:
|
||||
for process in processes:
|
||||
process.terminate()
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""Support functions."""
|
||||
|
||||
import logging
|
||||
import multiprocessing
|
||||
@@ -137,11 +138,11 @@ def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike):
|
||||
os.symlink(os.path.abspath(input_file), soft_link_name)
|
||||
|
||||
|
||||
def samefile(f1: os.PathLike, f2: os.PathLike):
|
||||
def samefile(file1: os.PathLike, file2: os.PathLike):
|
||||
if os.name == 'nt':
|
||||
return f1 == f2
|
||||
return file1 == file2
|
||||
else:
|
||||
return os.path.samefile(f1, f2)
|
||||
return os.path.samefile(file1, file2)
|
||||
|
||||
|
||||
def is_iterable_notstr(thing: Any) -> bool:
|
||||
@@ -149,9 +150,9 @@ def is_iterable_notstr(thing: Any) -> bool:
|
||||
return isinstance(thing, Iterable) and not isinstance(thing, str)
|
||||
|
||||
|
||||
def monotonic(L: Sequence) -> bool:
|
||||
def monotonic(seq: Sequence) -> bool:
|
||||
"""Does this sequence increase monotonically?"""
|
||||
return all(b > a for a, b in zip(L, L[1:]))
|
||||
return all(b > a for a, b in zip(seq, seq[1:]))
|
||||
|
||||
|
||||
def page_number(input_file: os.PathLike) -> int:
|
||||
@@ -166,7 +167,7 @@ def available_cpu_count() -> int:
|
||||
except NotImplementedError:
|
||||
pass
|
||||
warnings.warn(
|
||||
"Could not get CPU count. Assuming one (1) CPU." "Use -j N to set manually."
|
||||
"Could not get CPU count. Assuming one (1) CPU. Use -j N to set manually."
|
||||
)
|
||||
return 1
|
||||
|
||||
@@ -190,16 +191,16 @@ def is_file_writable(test_file: os.PathLike) -> bool:
|
||||
os.W_OK,
|
||||
effective_ids=(os.access in os.supports_effective_ids),
|
||||
)
|
||||
|
||||
try:
|
||||
fp = p.open('wb')
|
||||
except OSError:
|
||||
return False
|
||||
else:
|
||||
try:
|
||||
fp = p.open('wb')
|
||||
except OSError:
|
||||
return False
|
||||
else:
|
||||
fp.close()
|
||||
with suppress(OSError):
|
||||
p.unlink()
|
||||
return True
|
||||
fp.close()
|
||||
with suppress(OSError):
|
||||
p.unlink()
|
||||
return True
|
||||
except (OSError, RuntimeError) as e:
|
||||
log.debug(e)
|
||||
log.error(str(e))
|
||||
|
||||
@@ -28,6 +28,8 @@
|
||||
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
"""Transform .hocr and page image to text PDF."""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
@@ -99,7 +101,7 @@ HOCR_OK_LANGS = frozenset(
|
||||
Element = ElementTree.Element
|
||||
|
||||
|
||||
class Rect(NamedTuple): # pylint: disable=inherit-non-class
|
||||
class Rect(NamedTuple):
|
||||
"""A rectangle for managing PDF coordinates."""
|
||||
|
||||
x1: Any
|
||||
@@ -109,7 +111,7 @@ class Rect(NamedTuple): # pylint: disable=inherit-non-class
|
||||
|
||||
|
||||
class HocrTransformError(Exception):
|
||||
pass
|
||||
"""Error while applying hOCR transform."""
|
||||
|
||||
|
||||
class HocrTransform:
|
||||
@@ -287,7 +289,7 @@ class HocrTransform:
|
||||
continue
|
||||
|
||||
pxl_coords = self.element_coordinates(elem)
|
||||
pt = self.pt_from_pixel(pxl_coords)
|
||||
pt = self.pt_from_pixel(pxl_coords) # pylint: disable=invalid-name
|
||||
|
||||
# draw the bbox border
|
||||
if show_bounding_boxes: # pragma: no cover
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""Post-processing image optimization of OCR PDFs."""
|
||||
|
||||
|
||||
import logging
|
||||
import sys
|
||||
@@ -55,7 +57,9 @@ DEFAULT_PNG_QUALITY = 70
|
||||
Xref = NewType('Xref', int)
|
||||
|
||||
|
||||
class XrefExt(NamedTuple): # pylint: disable=inherit-non-class
|
||||
class XrefExt(NamedTuple):
|
||||
"""A PDF xref and image extension pair."""
|
||||
|
||||
xref: Xref
|
||||
ext: str
|
||||
|
||||
@@ -466,7 +470,7 @@ def _find_deflatable_jpeg(
|
||||
result = extract_image_filter(pike, root, image, xref)
|
||||
if result is None:
|
||||
return None
|
||||
pim, filtdp = result
|
||||
_pim, filtdp = result
|
||||
|
||||
if filtdp[0] == Name.DCTDecode and not filtdp[1] and options.optimize >= 1:
|
||||
return XrefExt(xref, '.memory')
|
||||
@@ -707,9 +711,9 @@ def main(infile, outfile, level, jobs=1):
|
||||
jb2lossy=False,
|
||||
)
|
||||
|
||||
with TemporaryDirectory() as td:
|
||||
context = PdfContext(options, td, infile, None, None)
|
||||
tmpout = Path(td) / 'out.pdf'
|
||||
with TemporaryDirectory() as tmpdir:
|
||||
context = PdfContext(options, tmpdir, infile, None, None)
|
||||
tmpout = Path(tmpdir) / 'out.pdf'
|
||||
optimize(
|
||||
infile,
|
||||
tmpout,
|
||||
|
||||
@@ -97,7 +97,8 @@ def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
|
||||
target_filename: filename to save
|
||||
icc: ICC identifier such as 'sRGB'
|
||||
References:
|
||||
Adobe PDFMARK Reference: https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdfmark_reference.pdf
|
||||
Adobe PDFMARK Reference:
|
||||
https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/pdfmark_reference.pdf
|
||||
"""
|
||||
if icc != 'sRGB':
|
||||
raise NotImplementedError("Only supporting sRGB")
|
||||
@@ -105,11 +106,11 @@ def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
|
||||
bytes_icc_profile = (
|
||||
package_files('ocrmypdf.data') / SRGB_ICC_PROFILE_NAME
|
||||
).read_bytes()
|
||||
ps = '\n'.join(_make_postscript(icc, bytes_icc_profile, 3))
|
||||
postscript = '\n'.join(_make_postscript(icc, bytes_icc_profile, 3))
|
||||
|
||||
# We should have encoded everything to pure ASCII by this point, and
|
||||
# to be safe, only allow ASCII in PostScript
|
||||
Path(target_filename).write_text(ps, encoding='ascii')
|
||||
Path(target_filename).write_text(postscript, encoding='ascii')
|
||||
return target_filename
|
||||
|
||||
|
||||
|
||||
@@ -6,4 +6,6 @@
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
"""For extracting information about PDFs prior to OCR."""
|
||||
|
||||
from ocrmypdf.pdfinfo.info import Colorspace, Encoding, PdfInfo
|
||||
|
||||
@@ -6,13 +6,15 @@
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
"""Extract information about the content of a PDF."""
|
||||
|
||||
import atexit
|
||||
import logging
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from contextlib import ExitStack
|
||||
from decimal import Decimal
|
||||
from enum import Enum
|
||||
from enum import Enum, auto
|
||||
from functools import partial
|
||||
from math import hypot, inf, isclose
|
||||
from os import PathLike
|
||||
@@ -20,6 +22,7 @@ from pathlib import Path
|
||||
from typing import (
|
||||
Container,
|
||||
Dict,
|
||||
Iterable,
|
||||
Iterator,
|
||||
List,
|
||||
Mapping,
|
||||
@@ -48,11 +51,39 @@ from ocrmypdf.pdfinfo.layout import get_page_analysis, get_text_boxes
|
||||
|
||||
logger = logging.getLogger()
|
||||
|
||||
Colorspace = Enum('Colorspace', 'gray rgb cmyk lab icc index sep devn pattern jpeg2000')
|
||||
|
||||
Encoding = Enum(
|
||||
'Encoding', 'ccitt jpeg jpeg2000 jbig2 asciihex ascii85 lzw flate runlength'
|
||||
)
|
||||
class Colorspace(Enum):
|
||||
"""Description of common image colorspaces in a PDF."""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
gray = auto()
|
||||
rgb = auto()
|
||||
cmyk = auto()
|
||||
lab = auto()
|
||||
icc = auto()
|
||||
index = auto()
|
||||
sep = auto()
|
||||
devn = auto()
|
||||
pattern = auto()
|
||||
jpeg2000 = auto()
|
||||
|
||||
|
||||
class Encoding(Enum):
|
||||
"""Description of common image encodings in a PDF."""
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
ccitt = auto()
|
||||
jpeg = auto()
|
||||
jpeg2000 = auto()
|
||||
jbig2 = auto()
|
||||
asciihex = auto()
|
||||
ascii85 = auto()
|
||||
lzw = auto()
|
||||
flate = auto()
|
||||
runlength = auto()
|
||||
|
||||
|
||||
FloatRect = Tuple[float, float, float, float]
|
||||
|
||||
FRIENDLY_COLORSPACE: Dict[str, Colorspace] = {
|
||||
'/DeviceGray': Colorspace.gray,
|
||||
@@ -105,18 +136,24 @@ def _is_unit_square(shorthand):
|
||||
|
||||
|
||||
class XobjectSettings(NamedTuple):
|
||||
"""Info about an XObject found in a PDF."""
|
||||
|
||||
name: str
|
||||
shorthand: Tuple[float, float, float, float, float, float]
|
||||
stack_depth: int
|
||||
|
||||
|
||||
class InlineSettings(NamedTuple):
|
||||
"""Info about an inline image found in a PDF."""
|
||||
|
||||
iimage: PdfInlineImage
|
||||
shorthand: Tuple[float, float, float, float, float, float]
|
||||
stack_depth: int
|
||||
|
||||
|
||||
class ContentsInfo(NamedTuple):
|
||||
"""Info about various objects found in a PDF."""
|
||||
|
||||
xobject_settings: List[XobjectSettings]
|
||||
inline_images: List[InlineSettings]
|
||||
found_vector: bool
|
||||
@@ -125,17 +162,19 @@ class ContentsInfo(NamedTuple):
|
||||
|
||||
|
||||
class TextboxInfo(NamedTuple):
|
||||
"""Info about a text box found in a PDF."""
|
||||
|
||||
bbox: Tuple[float, float, float, float]
|
||||
is_visible: bool
|
||||
is_corrupt: bool
|
||||
|
||||
|
||||
class VectorMarker:
|
||||
pass
|
||||
"""Sentinel indicating vector drawing operations were found on a page."""
|
||||
|
||||
|
||||
class TextMarker:
|
||||
pass
|
||||
"""Sentinel indicating text drawing operations were found on a page."""
|
||||
|
||||
|
||||
def _normalize_stack(graphobjs):
|
||||
@@ -197,7 +236,7 @@ def _interpret_contents(contentstream: Object, initial_shorthand=UNIT_SQUARE):
|
||||
if len(stack) > 32: # See docstring
|
||||
if len(stack) > 128:
|
||||
raise RuntimeError(
|
||||
"PDF graphics stack overflowed hard limit, operator %i" % n
|
||||
f"PDF graphics stack overflowed hard limit at operator {n}"
|
||||
)
|
||||
warn("PDF graphics stack overflowed spec limit")
|
||||
elif operator == 'Q':
|
||||
@@ -283,7 +322,7 @@ def _get_dpi(ctm_shorthand, image_size) -> Resolution:
|
||||
|
||||
"""
|
||||
|
||||
a, b, c, d, _, _ = ctm_shorthand
|
||||
a, b, c, d, _, _ = ctm_shorthand # pylint: disable=invalid-name
|
||||
|
||||
# Calculate the width and height of the image in PDF units
|
||||
image_drawn = hypot(a, b), hypot(c, d)
|
||||
@@ -299,6 +338,8 @@ def _get_dpi(ctm_shorthand, image_size) -> Resolution:
|
||||
|
||||
|
||||
class ImageInfo:
|
||||
"""Information about an image found in a PDF."""
|
||||
|
||||
DPI_PREC = Decimal('1.000')
|
||||
|
||||
_comp: Optional[int]
|
||||
@@ -428,7 +469,7 @@ def _find_inline_images(contentsinfo: ContentsInfo) -> Iterator[ImageInfo]:
|
||||
|
||||
for n, inline in enumerate(contentsinfo.inline_images):
|
||||
yield ImageInfo(
|
||||
name='inline-%02d' % n, shorthand=inline.shorthand, inline=inline.iimage
|
||||
name=f'inline-{n:02d}', shorthand=inline.shorthand, inline=inline.iimage
|
||||
)
|
||||
|
||||
|
||||
@@ -569,10 +610,10 @@ def _process_content_streams(
|
||||
yield from _find_form_xobject_images(pdf, container, contentsinfo)
|
||||
|
||||
|
||||
def _page_has_text(text_blocks, page_width, page_height) -> bool:
|
||||
def _page_has_text(text_blocks: Iterable[FloatRect], page_width, page_height) -> bool:
|
||||
"""Smarter text detection that ignores text in margins"""
|
||||
|
||||
pw, ph = float(page_width), float(page_height)
|
||||
pw, ph = float(page_width), float(page_height) # pylint: disable=invalid-name
|
||||
|
||||
margin_ratio = 0.125
|
||||
interior_bbox = (
|
||||
@@ -582,7 +623,7 @@ def _page_has_text(text_blocks, page_width, page_height) -> bool:
|
||||
margin_ratio * ph, # bottom (first quadrant: bottom < top)
|
||||
)
|
||||
|
||||
def rects_intersect(a, b) -> bool:
|
||||
def rects_intersect(a: FloatRect, b: FloatRect) -> bool:
|
||||
"""
|
||||
Where (a,b) are 4-tuple rects (left-0, top-1, right-2, bottom-3)
|
||||
https://stackoverflow.com/questions/306316/determine-if-two-rectangles-overlap-each-other
|
||||
@@ -604,19 +645,19 @@ def simplify_textboxes(miner, textbox_getter) -> Iterator[TextboxInfo]:
|
||||
We do this to save memory and ensure that our objects are pickleable.
|
||||
"""
|
||||
for box in textbox_getter(miner):
|
||||
first_line = box._objs[0]
|
||||
first_char = first_line._objs[0]
|
||||
first_line = box._objs[0] # pylint: disable=protected-access
|
||||
first_char = first_line._objs[0] # pylint: disable=protected-access
|
||||
|
||||
visible = first_char.rendermode != 3
|
||||
corrupt = first_char.get_text() == '\ufffd'
|
||||
yield TextboxInfo(box.bbox, visible, corrupt)
|
||||
|
||||
|
||||
worker_pdf = None
|
||||
worker_pdf = None # pylint: disable=invalid-name
|
||||
|
||||
|
||||
def _pdf_pageinfo_sync_init(pdf: Pdf, infile: Path, pdfminer_loglevel):
|
||||
global worker_pdf # pylint: disable=global-statement
|
||||
global worker_pdf # pylint: disable=global-statement,invalid-name
|
||||
pikepdf_enable_mmap()
|
||||
|
||||
logging.getLogger('pdfminer').setLevel(pdfminer_loglevel)
|
||||
@@ -701,6 +742,8 @@ def _pdf_pageinfo_concurrent(
|
||||
|
||||
|
||||
class PageInfo:
|
||||
"""Information about type of contents on each page in a PDF."""
|
||||
|
||||
_has_text: Optional[bool]
|
||||
_has_vector: Optional[bool]
|
||||
_images: List[ImageInfo]
|
||||
@@ -762,15 +805,15 @@ class PageInfo:
|
||||
self._has_vector = False
|
||||
self._has_text = False
|
||||
self._images = []
|
||||
for ci in _process_content_streams(
|
||||
for info in _process_content_streams(
|
||||
pdf=pdf, container=page, shorthand=userunit_shorthand
|
||||
):
|
||||
if isinstance(ci, VectorMarker):
|
||||
if isinstance(info, VectorMarker):
|
||||
self._has_vector = True
|
||||
elif isinstance(ci, TextMarker):
|
||||
elif isinstance(info, TextMarker):
|
||||
self._has_text = True
|
||||
elif isinstance(ci, ImageInfo):
|
||||
self._images.append(ci)
|
||||
elif isinstance(info, ImageInfo):
|
||||
self._images.append(info)
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
else:
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
"""OCRmyPDF pluggy plugin specification."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from argparse import ArgumentParser, Namespace
|
||||
|
||||
@@ -28,7 +28,12 @@ log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def run(
|
||||
args, *, env=None, logs_errors_to_stdout: bool = False, **kwargs
|
||||
args,
|
||||
*,
|
||||
env=None,
|
||||
logs_errors_to_stdout: bool = False,
|
||||
check: bool = False,
|
||||
**kwargs,
|
||||
) -> CompletedProcess:
|
||||
"""Wrapper around :py:func:`subprocess.run`
|
||||
|
||||
@@ -50,7 +55,7 @@ def run(
|
||||
stderr = None
|
||||
stderr_name = 'stderr' if not logs_errors_to_stdout else 'stdout'
|
||||
try:
|
||||
proc = subprocess_run(args, env=env, **kwargs)
|
||||
proc = subprocess_run(args, env=env, check=check, **kwargs)
|
||||
except CalledProcessError as e:
|
||||
stderr = getattr(e, stderr_name, None)
|
||||
raise
|
||||
@@ -111,6 +116,7 @@ def _fix_process_args(args, env, kwargs):
|
||||
program = str(args[0])
|
||||
|
||||
if os.name == 'nt':
|
||||
# pylint: disable=import-outside-toplevel
|
||||
from ocrmypdf.subprocess._windows import fix_windows_args
|
||||
|
||||
args = fix_windows_args(program, args, env)
|
||||
@@ -171,42 +177,42 @@ def get_version(
|
||||
return version
|
||||
|
||||
|
||||
missing_program = '''
|
||||
MISSING_PROGRAM = '''
|
||||
The program '{program}' could not be executed or was not found on your
|
||||
system PATH.
|
||||
'''
|
||||
|
||||
missing_optional_program = '''
|
||||
MISSING_OPTIONAL_PROGRAM = '''
|
||||
The program '{program}' could not be executed or was not found on your
|
||||
system PATH. This program is required when you use the
|
||||
{required_for} arguments. You could try omitting these arguments, or install
|
||||
the package.
|
||||
'''
|
||||
|
||||
missing_recommend_program = '''
|
||||
MISSING_RECOMMEND_PROGRAM = '''
|
||||
The program '{program}' could not be executed or was not found on your
|
||||
system PATH. This program is recommended when using the {required_for} arguments,
|
||||
but not required, so we will proceed. For best results, install the program.
|
||||
'''
|
||||
|
||||
old_version = '''
|
||||
OLD_VERSION = '''
|
||||
OCRmyPDF requires '{program}' {need_version} or higher. Your system appears
|
||||
to have {found_version}. Please update this program.
|
||||
'''
|
||||
|
||||
old_version_required_for = '''
|
||||
OLD_VERSION_REQUIRED_FOR = '''
|
||||
OCRmyPDF requires '{program}' {need_version} or higher when run with the
|
||||
{required_for} arguments. If you omit these arguments, OCRmyPDF may be able to
|
||||
proceed. For best results, install the program.
|
||||
'''
|
||||
|
||||
osx_install_advice = '''
|
||||
OSX_INSTALL_ADVICE = '''
|
||||
If you have homebrew installed, try these command to install the missing
|
||||
package:
|
||||
brew install {package}
|
||||
'''
|
||||
|
||||
linux_install_advice = '''
|
||||
LINUX_INSTALL_ADVICE = '''
|
||||
On systems with the aptitude package manager (Debian, Ubuntu), try these
|
||||
commands:
|
||||
sudo apt-get update
|
||||
@@ -216,7 +222,7 @@ On RPM-based systems (Red Hat, Fedora), search for instructions on
|
||||
installing the RPM for {program}.
|
||||
'''
|
||||
|
||||
windows_install_advice = '''
|
||||
WINDOWS_INSTALL_ADVICE = '''
|
||||
If not already installed, install the Chocolatey package manager. Then use
|
||||
a command prompt to install the missing package:
|
||||
choco install {package}
|
||||
@@ -234,32 +240,35 @@ def _get_platform():
|
||||
|
||||
|
||||
def _error_trailer(program, package, **kwargs):
|
||||
del kwargs
|
||||
if isinstance(package, Mapping):
|
||||
package = package.get(_get_platform(), program)
|
||||
|
||||
if _get_platform() == 'darwin':
|
||||
log.info(osx_install_advice.format(**locals()))
|
||||
log.info(OSX_INSTALL_ADVICE.format(**locals()))
|
||||
elif _get_platform() == 'linux':
|
||||
log.info(linux_install_advice.format(**locals()))
|
||||
log.info(LINUX_INSTALL_ADVICE.format(**locals()))
|
||||
elif _get_platform() == 'windows':
|
||||
log.info(windows_install_advice.format(**locals()))
|
||||
log.info(WINDOWS_INSTALL_ADVICE.format(**locals()))
|
||||
|
||||
|
||||
def _error_missing_program(program, package, required_for, recommended):
|
||||
# pylint: disable=unused-argument
|
||||
if recommended:
|
||||
log.warning(missing_recommend_program.format(**locals()))
|
||||
log.warning(MISSING_RECOMMEND_PROGRAM.format(**locals()))
|
||||
elif required_for:
|
||||
log.error(missing_optional_program.format(**locals()))
|
||||
log.error(MISSING_OPTIONAL_PROGRAM.format(**locals()))
|
||||
else:
|
||||
log.error(missing_program.format(**locals()))
|
||||
log.error(MISSING_PROGRAM.format(**locals()))
|
||||
_error_trailer(**locals())
|
||||
|
||||
|
||||
def _error_old_version(program, package, need_version, found_version, required_for):
|
||||
# pylint: disable=unused-argument
|
||||
if required_for:
|
||||
log.error(old_version_required_for.format(**locals()))
|
||||
log.error(OLD_VERSION_REQUIRED_FOR.format(**locals()))
|
||||
else:
|
||||
log.error(old_version.format(**locals()))
|
||||
log.error(OLD_VERSION.format(**locals()))
|
||||
_error_trailer(**locals())
|
||||
|
||||
|
||||
@@ -294,10 +303,15 @@ def check_external_program(
|
||||
found_version = version_checker()
|
||||
else: # deprecated
|
||||
found_version = version_checker
|
||||
except (CalledProcessError, FileNotFoundError, MissingDependencyError):
|
||||
except (CalledProcessError, FileNotFoundError) as e:
|
||||
_error_missing_program(program, package, required_for, recommended)
|
||||
if not recommended:
|
||||
raise MissingDependencyError(program)
|
||||
raise MissingDependencyError(program) from e
|
||||
return
|
||||
except MissingDependencyError:
|
||||
_error_missing_program(program, package, required_for, recommended)
|
||||
if not recommended:
|
||||
raise
|
||||
return
|
||||
|
||||
def remove_leading_v(s):
|
||||
|
||||
@@ -7,6 +7,8 @@
|
||||
# type: ignore
|
||||
# Non-Windows mypy now breaks when trying to typecheck winreg
|
||||
|
||||
"""Find Tesseract and Ghostscript binaries on Windows using the registry."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
@@ -17,9 +19,9 @@ from typing import Any, Callable, Iterable, Iterator, Set, Tuple, TypeVar
|
||||
|
||||
try:
|
||||
import winreg
|
||||
except ModuleNotFoundError as e:
|
||||
raise ModuleNotFoundError("This module is for Windows only") from e
|
||||
|
||||
except ModuleNotFoundError as _notfound_ex:
|
||||
raise ModuleNotFoundError("This module is for Windows only") from _notfound_ex
|
||||
del _notfound_ex
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@@ -40,15 +42,15 @@ def ghostscript_version_key(s: str) -> Tuple[int, int, int]:
|
||||
def registry_enum(
|
||||
key: winreg.HKEYType, enum_fn: Callable[[winreg.HKEYType, int], T]
|
||||
) -> Iterator[T]:
|
||||
LIMIT = 999
|
||||
limit = 999
|
||||
n = 0
|
||||
while n < LIMIT:
|
||||
while n < limit:
|
||||
try:
|
||||
yield enum_fn(key, n)
|
||||
n += 1
|
||||
except OSError:
|
||||
break
|
||||
if n == LIMIT:
|
||||
if n == limit:
|
||||
raise ValueError(f"Too many registry keys under {key}")
|
||||
|
||||
|
||||
@@ -61,6 +63,7 @@ def registry_values(key: winreg.HKEYType) -> Iterator[Tuple[str, Any, int]]:
|
||||
|
||||
|
||||
def registry_path_ghostscript(env=None) -> Iterator[Path]:
|
||||
del env # unused (but needed for protocol)
|
||||
try:
|
||||
with winreg.OpenKey(
|
||||
winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Artifex\GPL Ghostscript"
|
||||
@@ -78,6 +81,7 @@ def registry_path_ghostscript(env=None) -> Iterator[Path]:
|
||||
|
||||
|
||||
def registry_path_tesseract(env=None) -> Iterator[Path]:
|
||||
del env # unused (but needed for protocol)
|
||||
try:
|
||||
with winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE, r"SOFTWARE\Tesseract-OCR") as k:
|
||||
for subkey, val, _valtype in registry_values(k):
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
import datetime
|
||||
import warnings
|
||||
from datetime import timezone
|
||||
from os import fspath
|
||||
from shutil import copyfile
|
||||
|
||||
import pikepdf
|
||||
@@ -326,6 +325,9 @@ def test_metadata_fixup_warning(resources, outdir, caplog):
|
||||
assert any(record.levelname == 'WARNING' for record in caplog.records)
|
||||
|
||||
|
||||
XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
|
||||
|
||||
|
||||
def test_prevent_gs_invalid_xml(resources, outdir):
|
||||
generate_pdfa_ps(outdir / 'pdfa.ps')
|
||||
copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')
|
||||
@@ -352,7 +354,7 @@ def test_prevent_gs_invalid_xml(resources, outdir):
|
||||
contents = (outdir / 'pdfa.pdf').read_bytes()
|
||||
# Since the XML may be invalid, we scan instead of actually feeding it
|
||||
# to a parser.
|
||||
XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
|
||||
|
||||
xmp_start = contents.find(XMP_MAGIC)
|
||||
xmp_end = contents.rfind(b'<?xpacket end', xmp_start)
|
||||
assert 0 < xmp_start < xmp_end
|
||||
|
||||
Reference in New Issue
Block a user