Import cleanup and some pylint fixes

This commit is contained in:
James R. Barlow
2017-10-11 12:58:35 -07:00
parent 7d73098d6e
commit 51defa6d66
8 changed files with 64 additions and 67 deletions

View File

@@ -1,29 +1,25 @@
#!/usr/bin/env python3
# © 2015-17 James R. Barlow: github.com/jbarlow83
from contextlib import suppress
from tempfile import mkdtemp
from collections.abc import Sequence
import sys
import os
import re
import shutil
import warnings
import multiprocessing
import atexit
import textwrap
import img2pdf
import logging
import argparse
import PyPDF2 as pypdf
from PIL import Image
import ruffus.ruffus_exceptions as ruffus_exceptions
import ruffus.cmdline as cmdline
import ruffus.proxy_logger as proxy_logger
from .pipeline import JobContext, JobContextManager, re_symlink, \
from .pipeline import JobContext, JobContextManager, \
cleanup_working_files, build_pipeline
from .pdfa import file_claims_pdfa
from .helpers import is_iterable_notstr, re_symlink, is_file_writable
@@ -445,9 +441,9 @@ def check_options_advanced(options, log):
def check_options_metadata(options, log):
import unicodedata
metadata = [options.title, options.author, options.keywords,
options.subject]
for s in (m for m in metadata if m):
docinfo = [options.title, options.author, options.keywords,
options.subject]
for s in (m for m in docinfo if m):
for c in s:
if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000:
raise ValueError(
@@ -529,31 +525,31 @@ def cleanup_ruffus_error_message(msg):
def do_ruffus_exception(ruffus_five_tuple, options, log):
"""Replace the elaborate ruffus stack trace with a user friendly
description of the error message that occurred."""
exit_code = None
task_name, job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple
if exc_name == 'builtins.SystemExit':
match = re.search(r"\.(.+?)\)", exc_value)
exit_code_name = match.groups()[0]
exit_code = getattr(ExitCode, exit_code_name, 'other_error')
return exit_code
exit_code = getattr(ExitCode, exit_code_name, 'other_error')
elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
log.error(cleanup_ruffus_error_message(exc_value))
return ExitCode.input_file
exit_code = ExitCode.input_file
elif exc_name == 'builtins.TypeError':
# Even though repair_pdf will fail, ruffus will still try
# to call split_pages with no input files, likely due to a bug
if task_name == 'split_pages':
log.error("Input file '{0}' is not a valid PDF".format(
options.input_file))
return ExitCode.input_file
exit_code = ExitCode.input_file
elif exc_name == 'builtins.KeyboardInterrupt':
log.error("Interrupted by user")
return ExitCode.ctrl_c
exit_code = ExitCode.ctrl_c
elif exc_name == 'subprocess.CalledProcessError':
# It's up to the subprocess handler to report something useful
msg = "Error occurred while running this command:"
log.error(msg + '\n' + exc_value)
return ExitCode.child_process_error
exit_code = ExitCode.child_process_error
elif exc_name == 'ocrmypdf.exceptions.PdfMergeFailedError':
log.error(textwrap.dedent("""\
Failed to merge PDF image layer with OCR layer
@@ -564,11 +560,11 @@ def do_ruffus_exception(ruffus_five_tuple, options, log):
Try using
ocrmypdf --pdf-renderer tesseract [..other args..]
"""))
return ExitCode.input_file
exit_code = ExitCode.input_file
elif exc_name.startswith('ocrmypdf.exceptions.'):
base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
return exc_class.exit_code
exit_code = exc_class.exit_code
elif exc_name == 'PyPDF2.utils.PdfReadError' and \
'not been decrypted' in exc_value:
log.error(textwrap.dedent("""\
@@ -581,7 +577,10 @@ def do_ruffus_exception(ruffus_five_tuple, options, log):
(Only algorithms "R = 1" and "R = 2" are supported.)
"""))
return ExitCode.encrypted_pdf
exit_code = ExitCode.encrypted_pdf
if exit_code is not None:
return exit_code
if not options.verbose:
log.error(exc_stack)
@@ -647,6 +646,20 @@ def check_closed_streams(options):
return True
def log_page_orientations(pdfinfo, _log):
    """Log which pages have a non-zero rotation.

    Each rotated page is reported as its 1-based page number followed by a
    compass letter for the angle (90 -> 'e', 180 -> 's', 270 -> 'w'). An
    angle that is not in the table is reported as the bare page number.
    A falsy ``rotation`` (``None`` or 0) is treated as unrotated, and
    nothing is logged when no page is rotated.

    Args:
        pdfinfo: iterable of per-page records exposing a ``rotation``
            attribute.
        _log: logger-like object; only its ``info()`` method is called.
    """
    direction = {0: 'n', 90: 'e',
                 180: 's', 270: 'w'}
    orientations = []
    # Iterate the page records directly; page numbers are reported 1-based.
    for pageno, page in enumerate(pdfinfo, start=1):
        angle = page.rotation or 0
        if angle != 0:
            orientations.append('{0}{1}'.format(
                pageno,
                direction.get(angle, '')))
    if orientations:
        _log.info('Page orientations detected: ' + ' '.join(orientations))
def run_pipeline():
options = parser.parse_args()
options.verbose_abbreviated_path = 1
@@ -736,8 +749,7 @@ def run_pipeline():
_log.error("Unexpected ruffus exception: " + str(e))
_log.error(repr(e))
return ExitCode.other_error
else:
return exitcode
return exitcode
except ExitCodeException as e:
return e.exit_code
except Exception as e:
@@ -767,17 +779,8 @@ def run_pipeline():
if options.verbose:
from pprint import pformat
_log.debug(pformat(pdfinfo))
direction = {0: 'n', 90: 'e',
180: 's', 270: 'w'}
orientations = []
for n, page in enumerate(pdfinfo):
angle = pdfinfo[n].rotation or 0
if angle != 0:
orientations.append('{0}{1}'.format(
n + 1,
direction.get(angle, '')))
if orientations:
_log.info('Page orientations detected: ' + ' '.join(orientations))
log_page_orientations(pdfinfo, _log)
return ExitCode.ok

View File

@@ -20,7 +20,7 @@ class ExitCode(IntEnum):
class ExitCodeException(Exception):
pass
exit_code = ExitCode.other_error
class PdfMergeFailedError(ExitCodeException):

View File

@@ -7,5 +7,6 @@ import os
def get_program(name):
    """Return the program to run for *name*, honoring an environment override.

    The environment variable ``OCRMYPDF_<NAME>`` (with *name* upper-cased)
    is consulted first; when it is not set, *name* itself is returned
    unchanged.
    """
    override_key = 'OCRMYPDF_{}'.format(name.upper())
    try:
        return os.environ[override_key]
    except KeyError:
        return name

View File

@@ -7,9 +7,9 @@ from shutil import copy
from functools import lru_cache
import re
import sys
from PIL import Image
from . import get_program
from ..exceptions import SubprocessOutputError, MissingDependencyError
from PIL import Image
from ..helpers import fspath

View File

@@ -1,5 +1,5 @@
#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83
# © 2017 James R. Barlow: github.com/jbarlow83
from subprocess import CalledProcessError, STDOUT, PIPE, run, check_output
from functools import lru_cache
@@ -134,4 +134,4 @@ def merge(input_files, output_file, min_version=None):
input_files[0], '--pages'
] + input_files + ['--', output_file]
run(args_qpdf, check=True)

View File

@@ -1,21 +1,20 @@
#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83
# © 2017 James R. Barlow: github.com/jbarlow83
import sys
import os
import re
import shutil
from functools import lru_cache
from ..exceptions import MissingDependencyError, TesseractConfigError
from ..helpers import page_number
from . import get_program
from collections import namedtuple
from textwrap import dedent
import PyPDF2 as pypdf
from subprocess import PIPE, CalledProcessError, \
TimeoutExpired, check_output, STDOUT
from subprocess import Popen, PIPE, CalledProcessError, \
TimeoutExpired, check_output, STDOUT, DEVNULL
from ..exceptions import MissingDependencyError, TesseractConfigError
from ..helpers import page_number
from . import get_program
OrientationConfidence = namedtuple(
'OrientationConfidence',
@@ -60,7 +59,7 @@ def version():
def v4():
"Is this Tesseract v4.0?"
return (version() >= '4')
return version() >= '4'
@lru_cache(maxsize=1)
@@ -74,6 +73,7 @@ def has_textonly_pdf():
get_program('tesseract'),
'--print-parameters'
]
params = ''
try:
params = check_output(
args_tess, close_fds=True, universal_newlines=True,
@@ -113,12 +113,12 @@ def languages():
return set(lang.strip() for lang in langs.splitlines()[1:])
def tess_base_args(languages, engine_mode):
def tess_base_args(langs, engine_mode):
args = [
get_program('tesseract'),
]
if languages:
args.extend(['-l', '+'.join(languages)])
if langs:
args.extend(['-l', '+'.join(langs)])
if engine_mode is not None and v4():
args.extend(['--oem', str(engine_mode)])
return args

View File

@@ -3,7 +3,7 @@
# unpaper documentation:
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
from subprocess import CalledProcessError, STDOUT, check_output, check_call
from subprocess import CalledProcessError, STDOUT, check_output
from tempfile import NamedTemporaryFile
import sys
import os
@@ -18,10 +18,10 @@ def version():
get_program('unpaper'),
'--version'
]
version = check_output(
ver = check_output(
args_unpaper, close_fds=True, universal_newlines=True,
stderr=STDOUT, timeout=5)
return version.strip()
return ver.strip()
try:

View File

@@ -2,19 +2,10 @@
# © 2016 James R. Barlow: github.com/jbarlow83
from contextlib import suppress
from tempfile import mkdtemp
from functools import partial
import sys
import os
import re
import shutil
import warnings
import multiprocessing
import atexit
import textwrap
import img2pdf
import logging
import argparse
import PyPDF2 as pypdf
from PIL import Image
@@ -23,7 +14,7 @@ from ruffus import formatter, regex, Pipeline, suffix
from .hocrtransform import HocrTransform
from .pdfinfo import PdfInfo, Encoding, Colorspace
from .pdfa import generate_pdfa_ps, file_claims_pdfa
from .pdfa import generate_pdfa_ps
from .helpers import re_symlink, is_iterable_notstr, page_number
from .exec import ghostscript, tesseract, qpdf
from .exceptions import *
@@ -52,6 +43,7 @@ class JobContext:
def __init__(self):
self.pdfinfo = None
self.options = None
self.work_folder = None
def generate_pdfinfo(self, infile):
self.pdfinfo = PdfInfo(infile)
@@ -190,7 +182,8 @@ def repair_pdf(
pdfinfo = PdfInfo(output_file)
if pdfinfo.has_userunit and options.output_type == 'pdfa':
log.error("This input file uses a PDF feature that is not supported "
log.error(
"This input file uses a PDF feature that is not supported "
"by Ghostscript, so you cannot use --output-type=pdfa for this "
"file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
"support very large or small page sizes, and Ghostscript cannot "
@@ -827,9 +820,9 @@ def get_pdfmark(base_pdf, options):
renderer_tag = 'OCR'
pdfmark['/Creator'] = '{0} {1} / Tesseract {2} {3}'.format(
PROGRAM_NAME, VERSION,
renderer_tag,
tesseract.version())
PROGRAM_NAME, VERSION,
renderer_tag,
tesseract.version())
return pdfmark
@@ -937,7 +930,6 @@ def merge_sidecars(
output_file,
log,
context):
options = context.get_options()
pdfinfo = context.get_pdfinfo()
txt_files = [None] * len(pdfinfo)
@@ -948,8 +940,8 @@ def merge_sidecars(
txt_files[idx] = infile
def write_pages(stream):
for page_number, txt_file in enumerate(txt_files):
if page_number != 0:
for page_num, txt_file in enumerate(txt_files):
if page_num != 0:
stream.write('\f') # Form feed between pages
if txt_file:
with open(txt_file, 'r') as in_:
@@ -965,7 +957,7 @@ def merge_sidecars(
stream.write(in_.read())
else:
stream.write('[OCR skipped on page {}]'.format(
page_number + 1))
page_num + 1))
if output_file == '-':
write_pages(sys.stdout)
@@ -1149,7 +1141,8 @@ def build_pipeline(options, work_folder, log, context):
output=os.path.join(work_folder, r'\1.rendered.pdf'),
extras=[log, context])
task_combine_layers.graphviz(fillcolor='"#00cc66"')
task_combine_layers.active_if(options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich')
task_combine_layers.active_if(options.pdf_renderer == 'hocr' or
options.pdf_renderer == 'sandwich')
# Tesseract OCR+PDF
task_ocr_tesseract_and_render_pdf = main_pipeline.collate(