mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-19 20:14:53 -04:00
Import cleanup and some pylint fixes
This commit is contained in:
@@ -1,29 +1,25 @@
|
||||
#!/usr/bin/env python3
|
||||
# © 2015-17 James R. Barlow: github.com/jbarlow83
|
||||
|
||||
from contextlib import suppress
|
||||
from tempfile import mkdtemp
|
||||
from collections.abc import Sequence
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import warnings
|
||||
import multiprocessing
|
||||
import atexit
|
||||
import textwrap
|
||||
import img2pdf
|
||||
import logging
|
||||
import argparse
|
||||
|
||||
import PyPDF2 as pypdf
|
||||
from PIL import Image
|
||||
|
||||
import ruffus.ruffus_exceptions as ruffus_exceptions
|
||||
import ruffus.cmdline as cmdline
|
||||
import ruffus.proxy_logger as proxy_logger
|
||||
|
||||
from .pipeline import JobContext, JobContextManager, re_symlink, \
|
||||
from .pipeline import JobContext, JobContextManager, \
|
||||
cleanup_working_files, build_pipeline
|
||||
from .pdfa import file_claims_pdfa
|
||||
from .helpers import is_iterable_notstr, re_symlink, is_file_writable
|
||||
@@ -445,9 +441,9 @@ def check_options_advanced(options, log):
|
||||
|
||||
def check_options_metadata(options, log):
|
||||
import unicodedata
|
||||
metadata = [options.title, options.author, options.keywords,
|
||||
options.subject]
|
||||
for s in (m for m in metadata if m):
|
||||
docinfo = [options.title, options.author, options.keywords,
|
||||
options.subject]
|
||||
for s in (m for m in docinfo if m):
|
||||
for c in s:
|
||||
if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000:
|
||||
raise ValueError(
|
||||
@@ -529,31 +525,31 @@ def cleanup_ruffus_error_message(msg):
|
||||
def do_ruffus_exception(ruffus_five_tuple, options, log):
|
||||
"""Replace the elaborate ruffus stack trace with a user friendly
|
||||
description of the error message that occurred."""
|
||||
exit_code = None
|
||||
|
||||
task_name, job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple
|
||||
if exc_name == 'builtins.SystemExit':
|
||||
match = re.search(r"\.(.+?)\)", exc_value)
|
||||
exit_code_name = match.groups()[0]
|
||||
exit_code = getattr(ExitCode, exit_code_name, 'other_error')
|
||||
return exit_code
|
||||
exit_code = getattr(ExitCode, exit_code_name, 'other_error')
|
||||
elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
|
||||
log.error(cleanup_ruffus_error_message(exc_value))
|
||||
return ExitCode.input_file
|
||||
exit_code = ExitCode.input_file
|
||||
elif exc_name == 'builtins.TypeError':
|
||||
# Even though repair_pdf will fail, ruffus will still try
|
||||
# to call split_pages with no input files, likely due to a bug
|
||||
if task_name == 'split_pages':
|
||||
log.error("Input file '{0}' is not a valid PDF".format(
|
||||
options.input_file))
|
||||
return ExitCode.input_file
|
||||
exit_code = ExitCode.input_file
|
||||
elif exc_name == 'builtins.KeyboardInterrupt':
|
||||
log.error("Interrupted by user")
|
||||
return ExitCode.ctrl_c
|
||||
exit_code = ExitCode.ctrl_c
|
||||
elif exc_name == 'subprocess.CalledProcessError':
|
||||
# It's up to the subprocess handler to report something useful
|
||||
msg = "Error occurred while running this command:"
|
||||
log.error(msg + '\n' + exc_value)
|
||||
return ExitCode.child_process_error
|
||||
exit_code = ExitCode.child_process_error
|
||||
elif exc_name == 'ocrmypdf.exceptions.PdfMergeFailedError':
|
||||
log.error(textwrap.dedent("""\
|
||||
Failed to merge PDF image layer with OCR layer
|
||||
@@ -564,11 +560,11 @@ def do_ruffus_exception(ruffus_five_tuple, options, log):
|
||||
Try using
|
||||
ocrmypdf --pdf-renderer tesseract [..other args..]
|
||||
"""))
|
||||
return ExitCode.input_file
|
||||
exit_code = ExitCode.input_file
|
||||
elif exc_name.startswith('ocrmypdf.exceptions.'):
|
||||
base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
|
||||
exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
|
||||
return exc_class.exit_code
|
||||
exit_code = exc_class.exit_code
|
||||
elif exc_name == 'PyPDF2.utils.PdfReadError' and \
|
||||
'not been decrypted' in exc_value:
|
||||
log.error(textwrap.dedent("""\
|
||||
@@ -581,7 +577,10 @@ def do_ruffus_exception(ruffus_five_tuple, options, log):
|
||||
(Only algorithms "R = 1" and "R = 2" are supported.)
|
||||
|
||||
"""))
|
||||
return ExitCode.encrypted_pdf
|
||||
exit_code = ExitCode.encrypted_pdf
|
||||
|
||||
if exit_code is not None:
|
||||
return exit_code
|
||||
|
||||
if not options.verbose:
|
||||
log.error(exc_stack)
|
||||
@@ -647,6 +646,20 @@ def check_closed_streams(options):
|
||||
return True
|
||||
|
||||
|
||||
def log_page_orientations(pdfinfo, _log):
    """Log a one-line summary of every page that has a non-zero rotation.

    Each rotated page is reported as its 1-based page number followed by a
    compass letter for its rotation angle (90 -> 'e', 180 -> 's',
    270 -> 'w').  An angle that is not in that table contributes the page
    number alone.  Nothing is logged when no page is rotated.

    pdfinfo -- iterable of page records exposing a ``rotation`` attribute;
               a falsy rotation (``None`` or 0) counts as "not rotated"
    _log    -- logger-like object; only its ``info()`` method is called
    """
    direction = {0: 'n', 90: 'e',
                 180: 's', 270: 'w'}
    orientations = []
    # Use the page object yielded by the iteration instead of indexing back
    # into pdfinfo; start=1 gives the human-facing page number directly.
    for pageno, page in enumerate(pdfinfo, start=1):
        angle = page.rotation or 0
        if angle != 0:
            orientations.append('{0}{1}'.format(
                pageno,
                direction.get(angle, '')))
    if orientations:
        _log.info('Page orientations detected: ' + ' '.join(orientations))
|
||||
|
||||
|
||||
def run_pipeline():
|
||||
options = parser.parse_args()
|
||||
options.verbose_abbreviated_path = 1
|
||||
@@ -736,8 +749,7 @@ def run_pipeline():
|
||||
_log.error("Unexpected ruffus exception: " + str(e))
|
||||
_log.error(repr(e))
|
||||
return ExitCode.other_error
|
||||
else:
|
||||
return exitcode
|
||||
return exitcode
|
||||
except ExitCodeException as e:
|
||||
return e.exit_code
|
||||
except Exception as e:
|
||||
@@ -767,17 +779,8 @@ def run_pipeline():
|
||||
if options.verbose:
|
||||
from pprint import pformat
|
||||
_log.debug(pformat(pdfinfo))
|
||||
direction = {0: 'n', 90: 'e',
|
||||
180: 's', 270: 'w'}
|
||||
orientations = []
|
||||
for n, page in enumerate(pdfinfo):
|
||||
angle = pdfinfo[n].rotation or 0
|
||||
if angle != 0:
|
||||
orientations.append('{0}{1}'.format(
|
||||
n + 1,
|
||||
direction.get(angle, '')))
|
||||
if orientations:
|
||||
_log.info('Page orientations detected: ' + ' '.join(orientations))
|
||||
|
||||
log_page_orientations(pdfinfo, _log)
|
||||
|
||||
return ExitCode.ok
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ class ExitCode(IntEnum):
|
||||
|
||||
|
||||
class ExitCodeException(Exception):
|
||||
pass
|
||||
exit_code = ExitCode.other_error
|
||||
|
||||
|
||||
class PdfMergeFailedError(ExitCodeException):
|
||||
|
||||
@@ -7,5 +7,6 @@ import os
|
||||
|
||||
|
||||
def get_program(name):
    """Return the executable to run for the external program *name*.

    The environment variable ``OCRMYPDF_<NAME>`` (program name upper-cased)
    overrides the default; when it is not set, *name* itself is returned.
    """
    override_key = 'OCRMYPDF_{}'.format(name.upper())
    return os.environ.get(override_key, name)
|
||||
|
||||
@@ -7,9 +7,9 @@ from shutil import copy
|
||||
from functools import lru_cache
|
||||
import re
|
||||
import sys
|
||||
from PIL import Image
|
||||
from . import get_program
|
||||
from ..exceptions import SubprocessOutputError, MissingDependencyError
|
||||
from PIL import Image
|
||||
from ..helpers import fspath
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python3
|
||||
# © 2015 James R. Barlow: github.com/jbarlow83
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
|
||||
from subprocess import CalledProcessError, STDOUT, PIPE, run, check_output
|
||||
from functools import lru_cache
|
||||
@@ -134,4 +134,4 @@ def merge(input_files, output_file, min_version=None):
|
||||
input_files[0], '--pages'
|
||||
] + input_files + ['--', output_file]
|
||||
run(args_qpdf, check=True)
|
||||
|
||||
|
||||
@@ -1,21 +1,20 @@
|
||||
#!/usr/bin/env python3
|
||||
# © 2015 James R. Barlow: github.com/jbarlow83
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from functools import lru_cache
|
||||
from ..exceptions import MissingDependencyError, TesseractConfigError
|
||||
from ..helpers import page_number
|
||||
from . import get_program
|
||||
from collections import namedtuple
|
||||
from textwrap import dedent
|
||||
import PyPDF2 as pypdf
|
||||
from subprocess import PIPE, CalledProcessError, \
|
||||
TimeoutExpired, check_output, STDOUT
|
||||
|
||||
from subprocess import Popen, PIPE, CalledProcessError, \
|
||||
TimeoutExpired, check_output, STDOUT, DEVNULL
|
||||
|
||||
from ..exceptions import MissingDependencyError, TesseractConfigError
|
||||
from ..helpers import page_number
|
||||
from . import get_program
|
||||
|
||||
OrientationConfidence = namedtuple(
|
||||
'OrientationConfidence',
|
||||
@@ -60,7 +59,7 @@ def version():
|
||||
|
||||
def v4():
|
||||
"Is this Tesseract v4.0?"
|
||||
return (version() >= '4')
|
||||
return version() >= '4'
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
@@ -74,6 +73,7 @@ def has_textonly_pdf():
|
||||
get_program('tesseract'),
|
||||
'--print-parameters'
|
||||
]
|
||||
params = ''
|
||||
try:
|
||||
params = check_output(
|
||||
args_tess, close_fds=True, universal_newlines=True,
|
||||
@@ -113,12 +113,12 @@ def languages():
|
||||
return set(lang.strip() for lang in langs.splitlines()[1:])
|
||||
|
||||
|
||||
def tess_base_args(languages, engine_mode):
|
||||
def tess_base_args(langs, engine_mode):
|
||||
args = [
|
||||
get_program('tesseract'),
|
||||
]
|
||||
if languages:
|
||||
args.extend(['-l', '+'.join(languages)])
|
||||
if langs:
|
||||
args.extend(['-l', '+'.join(langs)])
|
||||
if engine_mode is not None and v4():
|
||||
args.extend(['--oem', str(engine_mode)])
|
||||
return args
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
# unpaper documentation:
|
||||
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
|
||||
|
||||
from subprocess import CalledProcessError, STDOUT, check_output, check_call
|
||||
from subprocess import CalledProcessError, STDOUT, check_output
|
||||
from tempfile import NamedTemporaryFile
|
||||
import sys
|
||||
import os
|
||||
@@ -18,10 +18,10 @@ def version():
|
||||
get_program('unpaper'),
|
||||
'--version'
|
||||
]
|
||||
version = check_output(
|
||||
ver = check_output(
|
||||
args_unpaper, close_fds=True, universal_newlines=True,
|
||||
stderr=STDOUT, timeout=5)
|
||||
return version.strip()
|
||||
return ver.strip()
|
||||
|
||||
|
||||
try:
|
||||
|
||||
@@ -2,19 +2,10 @@
|
||||
# © 2016 James R. Barlow: github.com/jbarlow83
|
||||
|
||||
from contextlib import suppress
|
||||
from tempfile import mkdtemp
|
||||
from functools import partial
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import warnings
|
||||
import multiprocessing
|
||||
import atexit
|
||||
import textwrap
|
||||
import img2pdf
|
||||
import logging
|
||||
import argparse
|
||||
|
||||
import PyPDF2 as pypdf
|
||||
from PIL import Image
|
||||
@@ -23,7 +14,7 @@ from ruffus import formatter, regex, Pipeline, suffix
|
||||
|
||||
from .hocrtransform import HocrTransform
|
||||
from .pdfinfo import PdfInfo, Encoding, Colorspace
|
||||
from .pdfa import generate_pdfa_ps, file_claims_pdfa
|
||||
from .pdfa import generate_pdfa_ps
|
||||
from .helpers import re_symlink, is_iterable_notstr, page_number
|
||||
from .exec import ghostscript, tesseract, qpdf
|
||||
from .exceptions import *
|
||||
@@ -52,6 +43,7 @@ class JobContext:
|
||||
def __init__(self):
|
||||
self.pdfinfo = None
|
||||
self.options = None
|
||||
self.work_folder = None
|
||||
|
||||
def generate_pdfinfo(self, infile):
|
||||
self.pdfinfo = PdfInfo(infile)
|
||||
@@ -190,7 +182,8 @@ def repair_pdf(
|
||||
pdfinfo = PdfInfo(output_file)
|
||||
|
||||
if pdfinfo.has_userunit and options.output_type == 'pdfa':
|
||||
log.error("This input file uses a PDF feature that is not supported "
|
||||
log.error(
|
||||
"This input file uses a PDF feature that is not supported "
|
||||
"by Ghostscript, so you cannot use --output-type=pdfa for this "
|
||||
"file. (Specifically, it uses the PDF-1.6 /UserUnit feature to "
|
||||
"support very large or small page sizes, and Ghostscript cannot "
|
||||
@@ -827,9 +820,9 @@ def get_pdfmark(base_pdf, options):
|
||||
renderer_tag = 'OCR'
|
||||
|
||||
pdfmark['/Creator'] = '{0} {1} / Tesseract {2} {3}'.format(
|
||||
PROGRAM_NAME, VERSION,
|
||||
renderer_tag,
|
||||
tesseract.version())
|
||||
PROGRAM_NAME, VERSION,
|
||||
renderer_tag,
|
||||
tesseract.version())
|
||||
return pdfmark
|
||||
|
||||
|
||||
@@ -937,7 +930,6 @@ def merge_sidecars(
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
pdfinfo = context.get_pdfinfo()
|
||||
|
||||
txt_files = [None] * len(pdfinfo)
|
||||
@@ -948,8 +940,8 @@ def merge_sidecars(
|
||||
txt_files[idx] = infile
|
||||
|
||||
def write_pages(stream):
|
||||
for page_number, txt_file in enumerate(txt_files):
|
||||
if page_number != 0:
|
||||
for page_num, txt_file in enumerate(txt_files):
|
||||
if page_num != 0:
|
||||
stream.write('\f') # Form feed between pages
|
||||
if txt_file:
|
||||
with open(txt_file, 'r') as in_:
|
||||
@@ -965,7 +957,7 @@ def merge_sidecars(
|
||||
stream.write(in_.read())
|
||||
else:
|
||||
stream.write('[OCR skipped on page {}]'.format(
|
||||
page_number + 1))
|
||||
page_num + 1))
|
||||
|
||||
if output_file == '-':
|
||||
write_pages(sys.stdout)
|
||||
@@ -1149,7 +1141,8 @@ def build_pipeline(options, work_folder, log, context):
|
||||
output=os.path.join(work_folder, r'\1.rendered.pdf'),
|
||||
extras=[log, context])
|
||||
task_combine_layers.graphviz(fillcolor='"#00cc66"')
|
||||
task_combine_layers.active_if(options.pdf_renderer == 'hocr' or options.pdf_renderer == 'sandwich')
|
||||
task_combine_layers.active_if(options.pdf_renderer == 'hocr' or
|
||||
options.pdf_renderer == 'sandwich')
|
||||
|
||||
# Tesseract OCR+PDF
|
||||
task_ocr_tesseract_and_render_pdf = main_pipeline.collate(
|
||||
|
||||
Reference in New Issue
Block a user