From 51defa6d662855e2bb34ff053d4c69ca76783706 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Wed, 11 Oct 2017 12:58:35 -0700
Subject: [PATCH] Import cleanup and some pylint fixes

---
 ocrmypdf/__main__.py         | 63 +++++++++++++++++++-----------------
 ocrmypdf/exceptions.py       |  2 +-
 ocrmypdf/exec/__init__.py    |  1 +
 ocrmypdf/exec/ghostscript.py |  2 +-
 ocrmypdf/exec/qpdf.py        |  4 +--
 ocrmypdf/exec/tesseract.py   | 22 ++++++-------
 ocrmypdf/exec/unpaper.py     |  6 ++--
 ocrmypdf/pipeline.py         | 31 +++++++-----------
 8 files changed, 64 insertions(+), 67 deletions(-)

diff --git a/ocrmypdf/__main__.py b/ocrmypdf/__main__.py
index 5784fb6e..4d671ee5 100755
--- a/ocrmypdf/__main__.py
+++ b/ocrmypdf/__main__.py
@@ -1,29 +1,25 @@
 #!/usr/bin/env python3
 # © 2015-17 James R. Barlow: github.com/jbarlow83
 
-from contextlib import suppress
 from tempfile import mkdtemp
 from collections.abc import Sequence
 import sys
 import os
 import re
-import shutil
 import warnings
 import multiprocessing
 import atexit
 import textwrap
-import img2pdf
 import logging
 import argparse
 
 import PyPDF2 as pypdf
-from PIL import Image
 
 import ruffus.ruffus_exceptions as ruffus_exceptions
 import ruffus.cmdline as cmdline
 import ruffus.proxy_logger as proxy_logger
 
-from .pipeline import JobContext, JobContextManager, re_symlink, \
+from .pipeline import JobContext, JobContextManager, \
     cleanup_working_files, build_pipeline
 from .pdfa import file_claims_pdfa
 from .helpers import is_iterable_notstr, re_symlink, is_file_writable
@@ -445,9 +441,9 @@ def check_options_advanced(options, log):
 
 def check_options_metadata(options, log):
     import unicodedata
-    metadata = [options.title, options.author, options.keywords,
-                options.subject]
-    for s in (m for m in metadata if m):
+    docinfo = [options.title, options.author, options.keywords,
+               options.subject]
+    for s in (m for m in docinfo if m):
         for c in s:
             if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000:
                 raise ValueError(
@@ -529,31 +525,31 @@ def cleanup_ruffus_error_message(msg):
 def do_ruffus_exception(ruffus_five_tuple, options, log):
     """Replace the elaborate ruffus stack trace with a user friendly
     description of the error message that occurred."""
+    exit_code = None
 
     task_name, job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple
     if exc_name == 'builtins.SystemExit':
         match = re.search(r"\.(.+?)\)", exc_value)
         exit_code_name = match.groups()[0]
-        exit_code = getattr(ExitCode, exit_code_name, 'other_error')
-        return exit_code
+        exit_code = getattr(ExitCode, exit_code_name, 'other_error')
     elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
         log.error(cleanup_ruffus_error_message(exc_value))
-        return ExitCode.input_file
+        exit_code = ExitCode.input_file
     elif exc_name == 'builtins.TypeError':
         # Even though repair_pdf will fail, ruffus will still try
         # to call split_pages with no input files, likely due to a bug
         if task_name == 'split_pages':
             log.error("Input file '{0}' is not a valid PDF".format(
                 options.input_file))
-            return ExitCode.input_file
+            exit_code = ExitCode.input_file
     elif exc_name == 'builtins.KeyboardInterrupt':
         log.error("Interrupted by user")
-        return ExitCode.ctrl_c
+        exit_code = ExitCode.ctrl_c
     elif exc_name == 'subprocess.CalledProcessError':
         # It's up to the subprocess handler to report something useful
         msg = "Error occurred while running this command:"
         log.error(msg + '\n' + exc_value)
-        return ExitCode.child_process_error
+        exit_code = ExitCode.child_process_error
     elif exc_name == 'ocrmypdf.exceptions.PdfMergeFailedError':
         log.error(textwrap.dedent("""\
             Failed to merge PDF image layer with OCR layer
@@ -564,11 +560,11 @@ def do_ruffus_exception(ruffus_five_tuple, options, log):
             Try using
                 ocrmypdf --pdf-renderer tesseract  [..other args..]
             """))
-        return ExitCode.input_file
+        exit_code = ExitCode.input_file
     elif exc_name.startswith('ocrmypdf.exceptions.'):
         base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
         exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
-        return exc_class.exit_code
+        exit_code = exc_class.exit_code
     elif exc_name == 'PyPDF2.utils.PdfReadError' and \
             'not been decrypted' in exc_value:
         log.error(textwrap.dedent("""\
@@ -581,7 +577,10 @@ def do_ruffus_exception(ruffus_five_tuple, options, log):
             (Only algorithms "R = 1" and "R = 2" are supported.)
 
             """))
-        return ExitCode.encrypted_pdf
+        exit_code = ExitCode.encrypted_pdf
+
+    if exit_code is not None:
+        return exit_code
 
     if not options.verbose:
         log.error(exc_stack)
@@ -647,6 +646,20 @@ def check_closed_streams(options):
     return True
 
 
+def log_page_orientations(pdfinfo, _log):
+    direction = {0: 'n', 90: 'e',
+                 180: 's', 270: 'w'}
+    orientations = []
+    for n, page in enumerate(pdfinfo):
+        angle = pdfinfo[n].rotation or 0
+        if angle != 0:
+            orientations.append('{0}{1}'.format(
+                n + 1,
+                direction.get(angle, '')))
+    if orientations:
+        _log.info('Page orientations detected: ' + ' '.join(orientations))
+
+
 def run_pipeline():
     options = parser.parse_args()
     options.verbose_abbreviated_path = 1
@@ -736,8 +749,7 @@ def run_pipeline():
             _log.error("Unexpected ruffus exception: " + str(e))
             _log.error(repr(e))
             return ExitCode.other_error
-        else:
-            return exitcode
+        return exitcode
     except ExitCodeException as e:
         return e.exit_code
     except Exception as e:
@@ -767,17 +779,8 @@ def run_pipeline():
     if options.verbose:
         from pprint import pformat
         _log.debug(pformat(pdfinfo))
-    direction = {0: 'n', 90: 'e',
-                 180: 's', 270: 'w'}
-    orientations = []
-    for n, page in enumerate(pdfinfo):
-        angle = pdfinfo[n].rotation or 0
-        if angle != 0:
-            orientations.append('{0}{1}'.format(
-                n + 1,
-                direction.get(angle, '')))
-    if orientations:
-        _log.info('Page orientations detected: ' + ' '.join(orientations))
+
+    log_page_orientations(pdfinfo, _log)
 
     return ExitCode.ok
 
diff --git a/ocrmypdf/exceptions.py b/ocrmypdf/exceptions.py
index c68bcf9f..8031040f 100644
--- a/ocrmypdf/exceptions.py
+++ b/ocrmypdf/exceptions.py
@@ -20,7 +20,7 @@ class ExitCode(IntEnum):
 
 
 class ExitCodeException(Exception):
-    pass
+    exit_code = ExitCode.other_error
 
 
 class PdfMergeFailedError(ExitCodeException):
diff --git a/ocrmypdf/exec/__init__.py b/ocrmypdf/exec/__init__.py
index e4a29fa8..9959e87a 100644
--- a/ocrmypdf/exec/__init__.py
+++ b/ocrmypdf/exec/__init__.py
@@ -7,5 +7,6 @@ import os
 
 
 def get_program(name):
+    "Check environment variables for overrides to this program"
     envvar = 'OCRMYPDF_' + name.upper()
     return os.environ.get(envvar, name)
diff --git a/ocrmypdf/exec/ghostscript.py b/ocrmypdf/exec/ghostscript.py
index d53b867a..4190f9c4 100644
--- a/ocrmypdf/exec/ghostscript.py
+++ b/ocrmypdf/exec/ghostscript.py
@@ -7,9 +7,9 @@ from shutil import copy
 from functools import lru_cache
 import re
 import sys
+from PIL import Image
 from . import get_program
 from ..exceptions import SubprocessOutputError, MissingDependencyError
-from PIL import Image
 from ..helpers import fspath
 
 
diff --git a/ocrmypdf/exec/qpdf.py b/ocrmypdf/exec/qpdf.py
index 2ba8a5b6..d989a7bf 100644
--- a/ocrmypdf/exec/qpdf.py
+++ b/ocrmypdf/exec/qpdf.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# © 2015 James R. Barlow: github.com/jbarlow83
+# © 2017 James R. Barlow: github.com/jbarlow83
 
 from subprocess import CalledProcessError, STDOUT, PIPE, run, check_output
 from functools import lru_cache
@@ -134,4 +134,4 @@ def merge(input_files, output_file, min_version=None):
         input_files[0], '--pages'
     ] + input_files + ['--', output_file]
     run(args_qpdf, check=True)
-
+
diff --git a/ocrmypdf/exec/tesseract.py b/ocrmypdf/exec/tesseract.py
index 87d655e4..ccea07e5 100644
--- a/ocrmypdf/exec/tesseract.py
+++ b/ocrmypdf/exec/tesseract.py
@@ -1,21 +1,20 @@
 #!/usr/bin/env python3
-# © 2015 James R. Barlow: github.com/jbarlow83
+# © 2017 James R. Barlow: github.com/jbarlow83
 
 import sys
 import os
 import re
 import shutil
 from functools import lru_cache
-from ..exceptions import MissingDependencyError, TesseractConfigError
-from ..helpers import page_number
-from . import get_program
 from collections import namedtuple
 from textwrap import dedent
 import PyPDF2 as pypdf
+from subprocess import PIPE, CalledProcessError, \
+    TimeoutExpired, check_output, STDOUT
 
-from subprocess import Popen, PIPE, CalledProcessError, \
-    TimeoutExpired, check_output, STDOUT, DEVNULL
-
+from ..exceptions import MissingDependencyError, TesseractConfigError
+from ..helpers import page_number
+from . import get_program
 
 OrientationConfidence = namedtuple(
     'OrientationConfidence',
@@ -60,7 +59,7 @@ def version():
 
 def v4():
     "Is this Tesseract v4.0?"
-    return (version() >= '4')
+    return version() >= '4'
 
 
 @lru_cache(maxsize=1)
@@ -74,6 +73,7 @@ def has_textonly_pdf():
         get_program('tesseract'),
         '--print-parameters'
     ]
+    params = ''
     try:
         params = check_output(
                 args_tess, close_fds=True, universal_newlines=True,
@@ -113,12 +113,12 @@ def languages():
     return set(lang.strip() for lang in langs.splitlines()[1:])
 
 
-def tess_base_args(languages, engine_mode):
+def tess_base_args(langs, engine_mode):
     args = [
         get_program('tesseract'),
     ]
-    if languages:
-        args.extend(['-l', '+'.join(languages)])
+    if langs:
+        args.extend(['-l', '+'.join(langs)])
     if engine_mode is not None and v4():
         args.extend(['--oem', str(engine_mode)])
     return args
diff --git a/ocrmypdf/exec/unpaper.py b/ocrmypdf/exec/unpaper.py
index 57f9eb08..e1ceade8 100644
--- a/ocrmypdf/exec/unpaper.py
+++ b/ocrmypdf/exec/unpaper.py
@@ -3,7 +3,7 @@
 # unpaper documentation:
 # https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
 
-from subprocess import CalledProcessError, STDOUT, check_output, check_call
+from subprocess import CalledProcessError, STDOUT, check_output
 from tempfile import NamedTemporaryFile
 import sys
 import os
@@ -18,10 +18,10 @@ def version():
         get_program('unpaper'),
         '--version'
     ]
-    version = check_output(
+    ver = check_output(
         args_unpaper, close_fds=True, universal_newlines=True,
         stderr=STDOUT, timeout=5)
-    return version.strip()
+    return ver.strip()
 
 
 try:
diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py
index ff58da2f..487640c2 100644
--- a/ocrmypdf/pipeline.py
+++ b/ocrmypdf/pipeline.py
@@ -2,19 +2,10 @@
 # © 2016 James R. Barlow: github.com/jbarlow83
 
 from contextlib import suppress
-from tempfile import mkdtemp
-from functools import partial
 import sys
 import os
-import re
 import shutil
-import warnings
-import multiprocessing
-import atexit
-import textwrap
 import img2pdf
-import logging
-import argparse
 
 import PyPDF2 as pypdf
 from PIL import Image
@@ -23,7 +14,7 @@ from ruffus import formatter, regex, Pipeline, suffix
 
 from .hocrtransform import HocrTransform
 from .pdfinfo import PdfInfo, Encoding, Colorspace
-from .pdfa import generate_pdfa_ps, file_claims_pdfa
+from .pdfa import generate_pdfa_ps
 from .helpers import re_symlink, is_iterable_notstr, page_number
 from .exec import ghostscript, tesseract, qpdf
 from .exceptions import *
@@ -52,6 +43,7 @@ class JobContext:
     def __init__(self):
         self.pdfinfo = None
         self.options = None
+        self.work_folder = None
 
     def generate_pdfinfo(self, infile):
         self.pdfinfo = PdfInfo(infile)
@@ -190,7 +182,8 @@ def repair_pdf(
     pdfinfo = PdfInfo(output_file)
 
     if pdfinfo.has_userunit and options.output_type == 'pdfa':
-        log.error("This input file uses a PDF feature that is not supported "
+        log.error(
+            "This input file uses a PDF feature that is not supported "
             "by Ghostscript, so you cannot use --output-type=pdfa for this "
             "file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
             "support very large or small page sizes, and Ghostscript cannot "
@@ -827,9 +820,9 @@ def get_pdfmark(base_pdf, options):
         renderer_tag = 'OCR'
 
     pdfmark['/Creator'] = '{0} {1} / Tesseract {2} {3}'.format(
-            PROGRAM_NAME, VERSION,
-            renderer_tag,
-            tesseract.version())
+        PROGRAM_NAME, VERSION,
+        renderer_tag,
+        tesseract.version())
     return pdfmark
 
 
@@ -937,7 +930,6 @@ def merge_sidecars(
         output_file,
         log,
         context):
-    options = context.get_options()
     pdfinfo = context.get_pdfinfo()
 
     txt_files = [None] * len(pdfinfo)
@@ -948,8 +940,8 @@ def merge_sidecars(
             txt_files[idx] = infile
 
     def write_pages(stream):
-        for page_number, txt_file in enumerate(txt_files):
-            if page_number != 0:
+        for page_num, txt_file in enumerate(txt_files):
+            if page_num != 0:
                 stream.write('\f')  # Form feed between pages
             if txt_file:
                 with open(txt_file, 'r') as in_:
@@ -965,7 +957,7 @@ def merge_sidecars(
                         stream.write(in_.read())
             else:
                 stream.write('[OCR skipped on page {}]'.format(
-                        page_number + 1))
+                        page_num + 1))
 
     if output_file == '-':
         write_pages(sys.stdout)
@@ -1149,7 +1141,8 @@ def build_pipeline(options, work_folder, log, context):
         output=os.path.join(work_folder, r'\1.rendered.pdf'),
         extras=[log, context])
     task_combine_layers.graphviz(fillcolor='"#00cc66"')
-    task_combine_layers.active_if(options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich')
+    task_combine_layers.active_if(options.pdf_renderer == 'hocr' or
+                                  options.pdf_renderer == 'sandwich')
 
     # Tesseract OCR+PDF
     task_ocr_tesseract_and_render_pdf = main_pipeline.collate(