Import cleanup and some pylint fixes

2026-05-19 20:14:53 -04:00 · 2017-10-11 12:58:35 -07:00
parent 7d73098d6e
commit 51defa6d66
8 changed files with 64 additions and 67 deletions
--- a/ocrmypdf/main.py
+++ b/ocrmypdf/main.py
@@ -1,29 +1,25 @@
 #!/usr/bin/env python3
 # © 2015-17 James R. Barlow: github.com/jbarlow83

-from contextlib import suppress
 from tempfile import mkdtemp
 from collections.abc import Sequence
 import sys
 import os
 import re
-import shutil
 import warnings
 import multiprocessing
 import atexit
 import textwrap
-import img2pdf
 import logging
 import argparse

 import PyPDF2 as pypdf
-from PIL import Image

 import ruffus.ruffus_exceptions as ruffus_exceptions
 import ruffus.cmdline as cmdline
 import ruffus.proxy_logger as proxy_logger

-from .pipeline import JobContext, JobContextManager, re_symlink, \
+from .pipeline import JobContext, JobContextManager, \
    cleanup_working_files, build_pipeline
 from .pdfa import file_claims_pdfa
 from .helpers import is_iterable_notstr, re_symlink, is_file_writable
@@ -445,9 +441,9 @@ def check_options_advanced(options, log):

 def check_options_metadata(options, log):
    import unicodedata
-    metadata = [options.title, options.author, options.keywords,
-                options.subject]
-    for s in (m for m in metadata if m):
+    docinfo = [options.title, options.author, options.keywords,
+               options.subject]
+    for s in (m for m in docinfo if m):
        for c in s:
            if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000:
                raise ValueError(
@@ -529,31 +525,31 @@ def cleanup_ruffus_error_message(msg):
 def do_ruffus_exception(ruffus_five_tuple, options, log):
    """Replace the elaborate ruffus stack trace with a user friendly
    description of the error message that occurred."""
+    exit_code = None

    task_name, job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple
    if exc_name == 'builtins.SystemExit':
        match = re.search(r"\.(.+?)\)", exc_value)
        exit_code_name = match.groups()[0]
-        exit_code = getattr(ExitCode, exit_code_name, 'other_error')
-        return exit_code
+        exit_code = getattr(ExitCode, exit_code_name, 'other_error')        
    elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
        log.error(cleanup_ruffus_error_message(exc_value))
-        return ExitCode.input_file
+        exit_code = ExitCode.input_file
    elif exc_name == 'builtins.TypeError':
        # Even though repair_pdf will fail, ruffus will still try
        # to call split_pages with no input files, likely due to a bug
        if task_name == 'split_pages':
            log.error("Input file '{0}' is not a valid PDF".format(
                options.input_file))
-            return ExitCode.input_file
+            exit_code = ExitCode.input_file
    elif exc_name == 'builtins.KeyboardInterrupt':
        log.error("Interrupted by user")
-        return ExitCode.ctrl_c
+        exit_code = ExitCode.ctrl_c
    elif exc_name == 'subprocess.CalledProcessError':
        # It's up to the subprocess handler to report something useful
        msg = "Error occurred while running this command:"
        log.error(msg + '\n' + exc_value)
-        return ExitCode.child_process_error
+        exit_code = ExitCode.child_process_error
    elif exc_name == 'ocrmypdf.exceptions.PdfMergeFailedError':
        log.error(textwrap.dedent("""\
            Failed to merge PDF image layer with OCR layer
@@ -564,11 +560,11 @@ def do_ruffus_exception(ruffus_five_tuple, options, log):
            Try using
                ocrmypdf --pdf-renderer tesseract  [..other args..]
            """))
-        return ExitCode.input_file
+        exit_code = ExitCode.input_file
    elif exc_name.startswith('ocrmypdf.exceptions.'):
        base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
        exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
-        return exc_class.exit_code
+        exit_code = exc_class.exit_code
    elif exc_name == 'PyPDF2.utils.PdfReadError' and \
            'not been decrypted' in exc_value:
        log.error(textwrap.dedent("""\
@@ -581,7 +577,10 @@ def do_ruffus_exception(ruffus_five_tuple, options, log):
            (Only algorithms "R = 1" and "R = 2" are supported.)

            """))
-        return ExitCode.encrypted_pdf
+        exit_code = ExitCode.encrypted_pdf
+
+    if exit_code is not None:
+        return exit_code

    if not options.verbose:
        log.error(exc_stack)
@@ -647,6 +646,20 @@ def check_closed_streams(options):
    return True


+def log_page_orientations(pdfinfo, _log):
+    """Report rotated pages via _log.info, e.g. entries like '2e 4s'.
+
+    pdfinfo is iterated for pages exposing .rotation (falsy values skipped).
+    Entries are the 1-based page number plus n/e/s/w for 0/90/180/270;
+    any other angle is listed by page number alone.
+    """
+    direction = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
+    orientations = ['{0}{1}'.format(n, direction.get(page.rotation, ''))
+                    for n, page in enumerate(pdfinfo, 1) if page.rotation]
+    if orientations:
+        _log.info('Page orientations detected: ' + ' '.join(orientations))
+
+
 def run_pipeline():
    options = parser.parse_args()
    options.verbose_abbreviated_path = 1
@@ -736,8 +749,7 @@ def run_pipeline():
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
-        else:
-            return exitcode
+        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
@@ -767,17 +779,8 @@ def run_pipeline():
    if options.verbose:
        from pprint import pformat
        _log.debug(pformat(pdfinfo))
-    direction = {0: 'n', 90: 'e',
-                 180: 's', 270: 'w'}
-    orientations = []
-    for n, page in enumerate(pdfinfo):
-        angle = pdfinfo[n].rotation or 0
-        if angle != 0:
-            orientations.append('{0}{1}'.format(
-                n + 1,
-                direction.get(angle, '')))
-    if orientations:
-        _log.info('Page orientations detected: ' + ' '.join(orientations))
+
+    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok

--- a/ocrmypdf/exceptions.py
+++ b/ocrmypdf/exceptions.py
@@ -20,7 +20,7 @@ class ExitCode(IntEnum):


 class ExitCodeException(Exception):
-    pass
+    exit_code = ExitCode.other_error  # class-level fallback for e.exit_code


 class PdfMergeFailedError(ExitCodeException):
--- a/ocrmypdf/exec/__init__.py
+++ b/ocrmypdf/exec/__init__.py
@@ -7,5 +7,6 @@ import os


 def get_program(name):
+    "Return the OCRMYPDF_<NAME> environment variable if set, else name unchanged"
    envvar = 'OCRMYPDF_' + name.upper()
    return os.environ.get(envvar, name)
--- a/ocrmypdf/exec/ghostscript.py
+++ b/ocrmypdf/exec/ghostscript.py
@@ -7,9 +7,9 @@ from shutil import copy
 from functools import lru_cache
 import re
 import sys
+from PIL import Image
 from . import get_program
 from ..exceptions import SubprocessOutputError, MissingDependencyError
-from PIL import Image
 from ..helpers import fspath


--- a/ocrmypdf/exec/qpdf.py
+++ b/ocrmypdf/exec/qpdf.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# © 2015 James R. Barlow: github.com/jbarlow83
+# © 2017 James R. Barlow: github.com/jbarlow83

 from subprocess import CalledProcessError, STDOUT, PIPE, run, check_output
 from functools import lru_cache
@@ -134,4 +134,4 @@ def merge(input_files, output_file, min_version=None):
        input_files[0], '--pages'
    ] + input_files + ['--', output_file]
    run(args_qpdf, check=True)
-
+    
--- a/ocrmypdf/exec/tesseract.py
+++ b/ocrmypdf/exec/tesseract.py
@@ -1,21 +1,20 @@
 #!/usr/bin/env python3
-# © 2015 James R. Barlow: github.com/jbarlow83
+# © 2017 James R. Barlow: github.com/jbarlow83

 import sys
 import os
 import re
 import shutil
 from functools import lru_cache
-from ..exceptions import MissingDependencyError, TesseractConfigError
-from ..helpers import page_number
-from . import get_program
 from collections import namedtuple
 from textwrap import dedent
 import PyPDF2 as pypdf
+from subprocess import PIPE, CalledProcessError, \
+    TimeoutExpired, check_output, STDOUT

-from subprocess import Popen, PIPE, CalledProcessError, \
-    TimeoutExpired, check_output, STDOUT, DEVNULL
-
+from ..exceptions import MissingDependencyError, TesseractConfigError
+from ..helpers import page_number
+from . import get_program

 OrientationConfidence = namedtuple(
    'OrientationConfidence',
@@ -60,7 +59,7 @@ def version():

 def v4():
    "Is this Tesseract v4.0?"
-    return (version() >= '4')
+    return version() >= '4'  # NOTE(review): if version() is a str this is lexicographic ('10' < '4')


@lru_cache(maxsize=1)
@@ -74,6 +73,7 @@ def has_textonly_pdf():
        get_program('tesseract'),
        '--print-parameters'
    ]
+    params = ''
    try:
        params = check_output(
                args_tess, close_fds=True, universal_newlines=True,
@@ -113,12 +113,12 @@ def languages():
    return set(lang.strip() for lang in langs.splitlines()[1:])


-def tess_base_args(languages, engine_mode):
+def tess_base_args(langs, engine_mode):  # 'langs' avoids shadowing languages()
    args = [
        get_program('tesseract'),
    ]
-    if languages:
-        args.extend(['-l', '+'.join(languages)])
+    if langs:  # no -l argument at all when langs is empty/None
+        args.extend(['-l', '+'.join(langs)])  # single value: lang1+lang2+...
    if engine_mode is not None and v4():
        args.extend(['--oem', str(engine_mode)])
    return args
--- a/ocrmypdf/exec/unpaper.py
+++ b/ocrmypdf/exec/unpaper.py
@@ -3,7 +3,7 @@
 # unpaper documentation:
 # https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md

-from subprocess import CalledProcessError, STDOUT, check_output, check_call
+from subprocess import CalledProcessError, STDOUT, check_output
 from tempfile import NamedTemporaryFile
 import sys
 import os
@@ -18,10 +18,10 @@ def version():
        get_program('unpaper'),
        '--version'
    ]
-    version = check_output(
+    ver = check_output(
        args_unpaper, close_fds=True, universal_newlines=True,
        stderr=STDOUT, timeout=5)
-    return version.strip()
+    return ver.strip()


 try:
--- a/ocrmypdf/pipeline.py
+++ b/ocrmypdf/pipeline.py
@@ -2,19 +2,10 @@
 # © 2016 James R. Barlow: github.com/jbarlow83

 from contextlib import suppress
-from tempfile import mkdtemp
-from functools import partial
 import sys
 import os
-import re
 import shutil
-import warnings
-import multiprocessing
-import atexit
-import textwrap
 import img2pdf
-import logging
-import argparse

 import PyPDF2 as pypdf
 from PIL import Image
@@ -23,7 +14,7 @@ from ruffus import formatter, regex, Pipeline, suffix

 from .hocrtransform import HocrTransform
 from .pdfinfo import PdfInfo, Encoding, Colorspace
-from .pdfa import generate_pdfa_ps, file_claims_pdfa
+from .pdfa import generate_pdfa_ps
 from .helpers import re_symlink, is_iterable_notstr, page_number
 from .exec import ghostscript, tesseract, qpdf
 from .exceptions import *
@@ -52,6 +43,7 @@ class JobContext:
    def __init__(self):
        self.pdfinfo = None
        self.options = None
+        self.work_folder = None

    def generate_pdfinfo(self, infile):
        self.pdfinfo = PdfInfo(infile)
@@ -190,7 +182,8 @@ def repair_pdf(
    pdfinfo = PdfInfo(output_file)

    if pdfinfo.has_userunit and options.output_type == 'pdfa':
-        log.error("This input file uses a PDF feature that is not supported "
+        log.error(
+            "This input file uses a PDF feature that is not supported "
            "by Ghostscript, so you cannot use --output-type=pdfa for this "
            "file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
            "support very large or small page sizes, and Ghostscript cannot "
@@ -827,9 +820,9 @@ def get_pdfmark(base_pdf, options):
        renderer_tag = 'OCR'

    pdfmark['/Creator'] = '{0} {1} / Tesseract {2} {3}'.format(
-            PROGRAM_NAME, VERSION,
-            renderer_tag,
-            tesseract.version())
+        PROGRAM_NAME, VERSION,
+        renderer_tag,
+        tesseract.version())
    return pdfmark


@@ -937,7 +930,6 @@ def merge_sidecars(
        output_file,
        log,
        context):
-    options = context.get_options()
    pdfinfo = context.get_pdfinfo()

    txt_files = [None] * len(pdfinfo)
@@ -948,8 +940,8 @@ def merge_sidecars(
            txt_files[idx] = infile

    def write_pages(stream):
-        for page_number, txt_file in enumerate(txt_files):
-            if page_number != 0:
+        for page_num, txt_file in enumerate(txt_files):
+            if page_num != 0:
                stream.write('\f')  # Form feed between pages
            if txt_file:
                with open(txt_file, 'r') as in_:
@@ -965,7 +957,7 @@ def merge_sidecars(
                        stream.write(in_.read())
            else:
                stream.write('[OCR skipped on page {}]'.format(
-                        page_number + 1))
+                        page_num + 1))

    if output_file == '-':
        write_pages(sys.stdout)
@@ -1149,7 +1141,8 @@ def build_pipeline(options, work_folder, log, context):
        output=os.path.join(work_folder, r'\1.rendered.pdf'),
        extras=[log, context])
    task_combine_layers.graphviz(fillcolor='"#00cc66"')
-    task_combine_layers.active_if(options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich')
+    task_combine_layers.active_if(options.pdf_renderer == 'hocr' or 
+                                  options.pdf_renderer == 'sandwich')

    # Tesseract OCR+PDF
    task_ocr_tesseract_and_render_pdf = main_pipeline.collate(