From 51defa6d662855e2bb34ff053d4c69ca76783706 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Wed, 11 Oct 2017 12:58:35 -0700 Subject: [PATCH] Import cleanup and some pylint fixes --- ocrmypdf/__main__.py | 63 +++++++++++++++++++----------------- ocrmypdf/exceptions.py | 2 +- ocrmypdf/exec/__init__.py | 1 + ocrmypdf/exec/ghostscript.py | 2 +- ocrmypdf/exec/qpdf.py | 4 +-- ocrmypdf/exec/tesseract.py | 22 ++++++------- ocrmypdf/exec/unpaper.py | 6 ++-- ocrmypdf/pipeline.py | 31 +++++++----------- 8 files changed, 64 insertions(+), 67 deletions(-) diff --git a/ocrmypdf/__main__.py b/ocrmypdf/__main__.py index 5784fb6e..4d671ee5 100755 --- a/ocrmypdf/__main__.py +++ b/ocrmypdf/__main__.py @@ -1,29 +1,25 @@ #!/usr/bin/env python3 # © 2015-17 James R. Barlow: github.com/jbarlow83 -from contextlib import suppress from tempfile import mkdtemp from collections.abc import Sequence import sys import os import re -import shutil import warnings import multiprocessing import atexit import textwrap -import img2pdf import logging import argparse import PyPDF2 as pypdf -from PIL import Image import ruffus.ruffus_exceptions as ruffus_exceptions import ruffus.cmdline as cmdline import ruffus.proxy_logger as proxy_logger -from .pipeline import JobContext, JobContextManager, re_symlink, \ +from .pipeline import JobContext, JobContextManager, \ cleanup_working_files, build_pipeline from .pdfa import file_claims_pdfa from .helpers import is_iterable_notstr, re_symlink, is_file_writable @@ -445,9 +441,9 @@ def check_options_advanced(options, log): def check_options_metadata(options, log): import unicodedata - metadata = [options.title, options.author, options.keywords, - options.subject] - for s in (m for m in metadata if m): + docinfo = [options.title, options.author, options.keywords, + options.subject] + for s in (m for m in docinfo if m): for c in s: if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000: raise ValueError( @@ -529,31 +525,31 @@ def cleanup_ruffus_error_message(msg): def do_ruffus_exception(ruffus_five_tuple, options, log): """Replace the elaborate ruffus stack trace with a user friendly description of the error message that occurred.""" + exit_code = None task_name, job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple if exc_name == 'builtins.SystemExit': match = re.search(r"\.(.+?)\)", exc_value) exit_code_name = match.groups()[0] - exit_code = getattr(ExitCode, exit_code_name, 'other_error') - return exit_code + exit_code = getattr(ExitCode, exit_code_name, 'other_error') elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError': log.error(cleanup_ruffus_error_message(exc_value)) - return ExitCode.input_file + exit_code = ExitCode.input_file elif exc_name == 'builtins.TypeError': # Even though repair_pdf will fail, ruffus will still try # to call split_pages with no input files, likely due to a bug if task_name == 'split_pages': log.error("Input file '{0}' is not a valid PDF".format( options.input_file)) - return ExitCode.input_file + exit_code = ExitCode.input_file elif exc_name == 'builtins.KeyboardInterrupt': log.error("Interrupted by user") - return ExitCode.ctrl_c + exit_code = ExitCode.ctrl_c elif exc_name == 'subprocess.CalledProcessError': # It's up to the subprocess handler to report something useful msg = "Error occurred while running this command:" log.error(msg + '\n' + exc_value) - return ExitCode.child_process_error + exit_code = ExitCode.child_process_error elif exc_name == 'ocrmypdf.exceptions.PdfMergeFailedError': log.error(textwrap.dedent("""\ Failed to merge PDF image layer with OCR layer @@ -564,11 +560,11 @@ def do_ruffus_exception(ruffus_five_tuple, options, log): Try using ocrmypdf --pdf-renderer tesseract [..other args..] """)) - return ExitCode.input_file + exit_code = ExitCode.input_file elif exc_name.startswith('ocrmypdf.exceptions.'): base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '') exc_class = getattr(ocrmypdf_exceptions, base_exc_name) - return exc_class.exit_code + exit_code = exc_class.exit_code elif exc_name == 'PyPDF2.utils.PdfReadError' and \ 'not been decrypted' in exc_value: log.error(textwrap.dedent("""\ @@ -581,7 +577,10 @@ def do_ruffus_exception(ruffus_five_tuple, options, log): (Only algorithms "R = 1" and "R = 2" are supported.) """)) - return ExitCode.encrypted_pdf + exit_code = ExitCode.encrypted_pdf + + if exit_code is not None: + return exit_code if not options.verbose: log.error(exc_stack) @@ -647,6 +646,20 @@ def check_closed_streams(options): return True +def log_page_orientations(pdfinfo, _log): + direction = {0: 'n', 90: 'e', + 180: 's', 270: 'w'} + orientations = [] + for n, page in enumerate(pdfinfo): + angle = pdfinfo[n].rotation or 0 + if angle != 0: + orientations.append('{0}{1}'.format( + n + 1, + direction.get(angle, ''))) + if orientations: + _log.info('Page orientations detected: ' + ' '.join(orientations)) + + def run_pipeline(): options = parser.parse_args() options.verbose_abbreviated_path = 1 @@ -736,8 +749,7 @@ def run_pipeline(): _log.error("Unexpected ruffus exception: " + str(e)) _log.error(repr(e)) return ExitCode.other_error - else: - return exitcode + return exitcode except ExitCodeException as e: return e.exit_code except Exception as e: @@ -767,17 +779,8 @@ def run_pipeline(): if options.verbose: from pprint import pformat _log.debug(pformat(pdfinfo)) - direction = {0: 'n', 90: 'e', - 180: 's', 270: 'w'} - orientations = [] - for n, page in enumerate(pdfinfo): - angle = pdfinfo[n].rotation or 0 - if angle != 0: - orientations.append('{0}{1}'.format( - n + 1, - direction.get(angle, ''))) - if orientations: - _log.info('Page orientations detected: ' + ' '.join(orientations)) + + log_page_orientations(pdfinfo, _log) return ExitCode.ok diff --git a/ocrmypdf/exceptions.py b/ocrmypdf/exceptions.py index c68bcf9f..8031040f 100644 --- a/ocrmypdf/exceptions.py +++ b/ocrmypdf/exceptions.py @@ -20,7 +20,7 @@ class ExitCode(IntEnum): class ExitCodeException(Exception): - pass + exit_code = ExitCode.other_error class PdfMergeFailedError(ExitCodeException): diff --git a/ocrmypdf/exec/__init__.py b/ocrmypdf/exec/__init__.py index e4a29fa8..9959e87a 100644 --- a/ocrmypdf/exec/__init__.py +++ b/ocrmypdf/exec/__init__.py @@ -7,5 +7,6 @@ import os def get_program(name): + "Check environment variables for overrides to this program" envvar = 'OCRMYPDF_' + name.upper() return os.environ.get(envvar, name) diff --git a/ocrmypdf/exec/ghostscript.py b/ocrmypdf/exec/ghostscript.py index d53b867a..4190f9c4 100644 --- a/ocrmypdf/exec/ghostscript.py +++ b/ocrmypdf/exec/ghostscript.py @@ -7,9 +7,9 @@ from shutil import copy from functools import lru_cache import re import sys +from PIL import Image from . import get_program from ..exceptions import SubprocessOutputError, MissingDependencyError -from PIL import Image from ..helpers import fspath diff --git a/ocrmypdf/exec/qpdf.py b/ocrmypdf/exec/qpdf.py index 2ba8a5b6..d989a7bf 100644 --- a/ocrmypdf/exec/qpdf.py +++ b/ocrmypdf/exec/qpdf.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# © 2015 James R. Barlow: github.com/jbarlow83 +# © 2017 James R. Barlow: github.com/jbarlow83 from subprocess import CalledProcessError, STDOUT, PIPE, run, check_output from functools import lru_cache @@ -134,4 +134,4 @@ def merge(input_files, output_file, min_version=None): input_files[0], '--pages' ] + input_files + ['--', output_file] run(args_qpdf, check=True) - + \ No newline at end of file diff --git a/ocrmypdf/exec/tesseract.py b/ocrmypdf/exec/tesseract.py index 87d655e4..ccea07e5 100644 --- a/ocrmypdf/exec/tesseract.py +++ b/ocrmypdf/exec/tesseract.py @@ -1,21 +1,20 @@ #!/usr/bin/env python3 -# © 2015 James R. Barlow: github.com/jbarlow83 +# © 2017 James R. Barlow: github.com/jbarlow83 import sys import os import re import shutil from functools import lru_cache -from ..exceptions import MissingDependencyError, TesseractConfigError -from ..helpers import page_number -from . import get_program from collections import namedtuple from textwrap import dedent import PyPDF2 as pypdf +from subprocess import PIPE, CalledProcessError, \ + TimeoutExpired, check_output, STDOUT -from subprocess import Popen, PIPE, CalledProcessError, \ - TimeoutExpired, check_output, STDOUT, DEVNULL - +from ..exceptions import MissingDependencyError, TesseractConfigError +from ..helpers import page_number +from . import get_program OrientationConfidence = namedtuple( 'OrientationConfidence', @@ -60,7 +59,7 @@ def version(): def v4(): "Is this Tesseract v4.0?" - return (version() >= '4') + return version() >= '4' @lru_cache(maxsize=1) @@ -74,6 +73,7 @@ def has_textonly_pdf(): get_program('tesseract'), '--print-parameters' ] + params = '' try: params = check_output( args_tess, close_fds=True, universal_newlines=True, @@ -113,12 +113,12 @@ def languages(): return set(lang.strip() for lang in langs.splitlines()[1:]) -def tess_base_args(languages, engine_mode): +def tess_base_args(langs, engine_mode): args = [ get_program('tesseract'), ] - if languages: - args.extend(['-l', '+'.join(languages)]) + if langs: + args.extend(['-l', '+'.join(langs)]) if engine_mode is not None and v4(): args.extend(['--oem', str(engine_mode)]) return args diff --git a/ocrmypdf/exec/unpaper.py b/ocrmypdf/exec/unpaper.py index 57f9eb08..e1ceade8 100644 --- a/ocrmypdf/exec/unpaper.py +++ b/ocrmypdf/exec/unpaper.py @@ -3,7 +3,7 @@ # unpaper documentation: # https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md -from subprocess import CalledProcessError, STDOUT, check_output, check_call +from subprocess import CalledProcessError, STDOUT, check_output from tempfile import NamedTemporaryFile import sys import os @@ -18,10 +18,10 @@ def version(): get_program('unpaper'), '--version' ] - version = check_output( + ver = check_output( args_unpaper, close_fds=True, universal_newlines=True, stderr=STDOUT, timeout=5) - return version.strip() + return ver.strip() try: diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py index ff58da2f..487640c2 100644 --- a/ocrmypdf/pipeline.py +++ b/ocrmypdf/pipeline.py @@ -2,19 +2,10 @@ # © 2016 James R. Barlow: github.com/jbarlow83 from contextlib import suppress -from tempfile import mkdtemp -from functools import partial import sys import os -import re import shutil -import warnings -import multiprocessing -import atexit -import textwrap import img2pdf -import logging -import argparse import PyPDF2 as pypdf from PIL import Image @@ -23,7 +14,7 @@ from ruffus import formatter, regex, Pipeline, suffix from .hocrtransform import HocrTransform from .pdfinfo import PdfInfo, Encoding, Colorspace -from .pdfa import generate_pdfa_ps, file_claims_pdfa +from .pdfa import generate_pdfa_ps from .helpers import re_symlink, is_iterable_notstr, page_number from .exec import ghostscript, tesseract, qpdf from .exceptions import * @@ -52,6 +43,7 @@ class JobContext: def __init__(self): self.pdfinfo = None self.options = None + self.work_folder = None def generate_pdfinfo(self, infile): self.pdfinfo = PdfInfo(infile) @@ -190,7 +182,8 @@ def repair_pdf( pdfinfo = PdfInfo(output_file) if pdfinfo.has_userunit and options.output_type == 'pdfa': - log.error("This input file uses a PDF feature that is not supported " + log.error( + "This input file uses a PDF feature that is not supported " "by Ghostscript, so you cannot use --output-type=pdfa for this " "file. (Specifically, it uses the PDF-1.6 /UserUnit feature to " "support very large or small page sizes, and Ghostscript cannot " @@ -827,9 +820,9 @@ def get_pdfmark(base_pdf, options): renderer_tag = 'OCR' pdfmark['/Creator'] = '{0} {1} / Tesseract {2} {3}'.format( - PROGRAM_NAME, VERSION, - renderer_tag, - tesseract.version()) + PROGRAM_NAME, VERSION, + renderer_tag, + tesseract.version()) return pdfmark @@ -937,7 +930,6 @@ def merge_sidecars( output_file, log, context): - options = context.get_options() pdfinfo = context.get_pdfinfo() txt_files = [None] * len(pdfinfo) @@ -948,8 +940,8 @@ def merge_sidecars( txt_files[idx] = infile def write_pages(stream): - for page_number, txt_file in enumerate(txt_files): - if page_number != 0: + for page_num, txt_file in enumerate(txt_files): + if page_num != 0: stream.write('\f') # Form feed between pages if txt_file: with open(txt_file, 'r') as in_: @@ -965,7 +957,7 @@ def merge_sidecars( stream.write(in_.read()) else: stream.write('[OCR skipped on page {}]'.format( - page_number + 1)) + page_num + 1)) if output_file == '-': write_pages(sys.stdout) @@ -1149,7 +1141,8 @@ def build_pipeline(options, work_folder, log, context): output=os.path.join(work_folder, r'\1.rendered.pdf'), extras=[log, context]) task_combine_layers.graphviz(fillcolor='"#00cc66"') - task_combine_layers.active_if(options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich') + task_combine_layers.active_if(options.pdf_renderer == 'hocr' or + options.pdf_renderer == 'sandwich') # Tesseract OCR+PDF task_ocr_tesseract_and_render_pdf = main_pipeline.collate(