refactor: split argparse and run_pipline

This commit is contained in:
Martin Wind
2019-03-26 08:10:20 +01:00
parent 210f134b5b
commit 2fa43ecf09
4 changed files with 708 additions and 683 deletions

View File

@@ -108,7 +108,7 @@ setup(
],
extras_require={'pdfminer': ['pdfminer.six == 20181108']},
tests_require=tests_require,
entry_points={'console_scripts': ['ocrmypdf = ocrmypdf.__main__:run_pipeline']},
entry_points={'console_scripts': ['ocrmypdf = ocrmypdf.__main__:run']},
package_data={'ocrmypdf': ['data/sRGB.icc']},
include_package_data=True,
zip_safe=False,

View File

@@ -44,3 +44,4 @@ from . import hocrtransform
from . import leptonica
from . import pdfa
from . import pdfinfo
from .run import run_pipeline as run

View File

@@ -17,68 +17,19 @@
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import argparse
import atexit
import logging
import os
import re
import sys
import textwrap
from pathlib import Path
from tempfile import mkdtemp
import PIL
import ruffus.cmdline as cmdline
import ruffus.proxy_logger as proxy_logger
import ruffus.ruffus_exceptions as ruffus_exceptions
from . import PROGRAM_NAME, VERSION
from . import exceptions as ocrmypdf_exceptions
from ._jobcontext import JobContext, JobContextManager, cleanup_working_files
from ._pipeline import build_pipeline
from ._unicodefun import verify_python3_env
from .exceptions import (
BadArgsError,
ExitCode,
ExitCodeException,
InputFileError,
MissingDependencyError,
OutputFileAccessError,
)
from .exec import (
ghostscript,
jbig2enc,
qpdf,
tesseract,
check_external_program,
unpaper,
pngquant,
)
from .helpers import available_cpu_count, is_file_writable, re_symlink
from .pdfa import file_claims_pdfa
# -------------
# External dependencies
HOCR_OK_LANGS = frozenset(['eng', 'deu', 'spa', 'ita', 'por'])
def complain(message):
print(*textwrap.wrap(message), file=sys.stderr)
from .run import run_pipeline
# Hack to help debugger context find /usr/local/bin
if 'IDE_PROJECT_ROOTS' in os.environ:
os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH']
# --------
# Critical environment tests
verify_python3_env()
# -------------
# Parser
def numeric(basetype, min_=None, max_=None):
"""Validator for numeric params"""
min_ = basetype(min_) if min_ is not None else None
@@ -514,638 +465,9 @@ debugging.add_argument(
'--flowchart', type=str, help="Generate the pipeline execution flowchart"
)
def check_options_languages(options, _log):
if not options.language:
options.language = ['eng'] # Enforce English hegemony
# Support v2.x "eng+deu" language syntax
if '+' in options.language[0]:
options.language = options.language[0].split('+')
languages = set(options.language)
if not languages.issubset(tesseract.languages()):
msg = (
"The installed version of tesseract does not have language "
"data for the following requested languages: \n"
)
for lang in languages - tesseract.languages():
msg += lang + '\n'
raise MissingDependencyError(msg)
def check_options_output(options, log):
# We have these constraints to check for.
# 1. Ghostscript < 9.20 mangles multibyte Unicode
# 2. hocr doesn't work on non-Latin languages (so don't select it)
languages = set(options.language)
is_latin = languages.issubset(HOCR_OK_LANGS)
if options.pdf_renderer == 'hocr' and not is_latin:
msg = (
"The 'hocr' PDF renderer is known to cause problems with one "
"or more of the languages in your document. Use "
"--pdf-renderer auto (the default) to avoid this issue."
)
log.warning(msg)
if ghostscript.version() < '9.20' and options.output_type != 'pdf' and not is_latin:
# https://bugs.ghostscript.com/show_bug.cgi?id=696874
# Ghostscript < 9.20 fails to encode multibyte characters properly
msg = (
"The installed version of Ghostscript does not work correctly "
"with the OCR languages you specified. Use --output-type pdf or "
"upgrade to Ghostscript 9.20 or later to avoid this issue."
)
msg += f"Found Ghostscript {ghostscript.version()}"
log.warning(msg)
# Decide on what renderer to use
if options.pdf_renderer == 'auto':
options.pdf_renderer = 'sandwich'
if options.output_type == 'pdfa':
options.output_type = 'pdfa-2'
if options.output_type == 'pdfa-3' and ghostscript.version() < '9.19':
raise MissingDependencyError(
"--output-type pdfa-3 requires Ghostscript 9.19 or later"
)
lossless_reconstruction = False
if not any(
(
options.deskew,
options.clean_final,
options.force_ocr,
options.remove_background,
)
):
lossless_reconstruction = True
options.lossless_reconstruction = lossless_reconstruction
if not options.lossless_reconstruction and options.redo_ocr:
raise argparse.ArgumentError(
None,
"--redo-ocr is not currently compatible with --deskew, "
"--clean-final, and --remove-background",
)
def check_options_sidecar(options, log):
if options.sidecar == '\0':
if options.output_file == '-':
raise argparse.ArgumentError(
None,
"--sidecar filename must be specified when output file is " "stdout.",
)
options.sidecar = options.output_file + '.txt'
def check_options_preprocessing(options, log):
if options.clean_final:
options.clean = True
if options.unpaper_args and not options.clean:
raise argparse.ArgumentError(None, "--clean is required for --unpaper-args")
if options.clean:
check_external_program(
log=log,
program='unpaper',
package='unpaper',
version_checker=unpaper.version,
need_version='6.1',
required_for=['--clean, --clean-final'],
)
try:
if options.unpaper_args:
options.unpaper_args = unpaper.validate_custom_args(
options.unpaper_args
)
except Exception as e:
raise argparse.ArgumentError(None, str(e))
def check_options_ocr_behavior(options, log):
exclusive_options = sum(
[
(1 if opt else 0)
for opt in (options.force_ocr, options.skip_text, options.redo_ocr)
]
)
if exclusive_options >= 2:
raise argparse.ArgumentError(
None, "Error: choose only one of --force-ocr, --skip-text, --redo-ocr."
)
def check_options_optimizing(options, log):
if options.optimize >= 2:
check_external_program(
log=log,
program='pngquant',
package='pngquant',
version_checker=pngquant.version,
need_version='2.0.1',
required_for='--optimize {2,3}',
)
if options.optimize >= 2:
# Although we use JBIG2 for optimize=1, don't nag about it unless the
# user is asking for more optimization
check_external_program(
log=log,
program='jbig2',
package='jbig2enc',
version_checker=jbig2enc.version,
need_version='0.28',
required_for='--optimize {2,3} | --jbig2-lossy',
recommended=True if not options.jbig2_lossy else False,
)
if options.optimize == 0 and any(
[options.jbig2_lossy, options.png_quality, options.jpeg_quality]
):
log.warning(
"The arguments --jbig2-lossy, --png-quality, and --jpeg-quality "
"will be ignored because --optimize=0."
)
def check_options_advanced(options, log):
if options.pdfa_image_compression != 'auto' and options.output_type.startswith(
'pdfa'
):
log.warning(
"--pdfa-image-compression argument has no effect when "
"--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'"
)
if tesseract.v4() and (options.user_words or options.user_patterns):
log.warning('Tesseract 4.x ignores --user-words, so this has no effect')
def check_options_metadata(options, log):
import unicodedata
docinfo = [options.title, options.author, options.keywords, options.subject]
for s in (m for m in docinfo if m):
for c in s:
if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000:
raise ValueError(
"One of the metadata strings contains "
"an unsupported Unicode character: '{}' (U+{})".format(
c, hex(ord(c))[2:].upper()
)
)
def check_options_pillow(options, log):
PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1_000_000)
if PIL.Image.MAX_IMAGE_PIXELS == 0:
PIL.Image.MAX_IMAGE_PIXELS = None
def check_options(options, log):
try:
check_options_languages(options, log)
check_options_metadata(options, log)
check_options_output(options, log)
check_options_sidecar(options, log)
check_options_preprocessing(options, log)
check_options_ocr_behavior(options, log)
check_options_optimizing(options, log)
check_options_advanced(options, log)
check_options_pillow(options, log)
except ValueError as e:
log.error(e)
sys.exit(ExitCode.bad_args)
except argparse.ArgumentError as e:
log.error(e)
sys.exit(ExitCode.bad_args)
except MissingDependencyError as e:
log.error(e)
sys.exit(ExitCode.missing_dependency)
# ----------
# Logging
def logging_factory(logger_name, logger_args):
verbose = logger_args['verbose']
quiet = logger_args['quiet']
root_logger = logging.getLogger(logger_name)
root_logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stderr)
formatter_ = logging.Formatter("%(levelname)7s - %(message)s")
handler.setFormatter(formatter_)
if verbose:
handler.setLevel(logging.DEBUG)
elif quiet:
handler.setLevel(logging.WARNING)
else:
handler.setLevel(logging.INFO)
root_logger.addHandler(handler)
return root_logger
def cleanup_ruffus_error_message(msg):
msg = re.sub(r'\s+', r' ', msg)
msg = re.sub(r"\((.+?)\)", r'\1', msg)
msg = msg.strip()
return msg
def do_ruffus_exception(ruffus_five_tuple, options, log):
"""Replace the elaborate ruffus stack trace with a user friendly
description of the error message that occurred."""
exit_code = None
_task_name, _job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple
if isinstance(exc_name, type):
# ruffus is full of mystery... sometimes (probably when the process
# group leader is killed) exc_name is the class object of the exception,
# rather than a str. So reach into the object and get its name.
exc_name = exc_name.__name__
if exc_name.startswith('ocrmypdf.exceptions.'):
base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
exit_code = getattr(exc_class, 'exit_code', ExitCode.other_error)
try:
if isinstance(exc_value, exc_class):
exc_msg = str(exc_value)
elif isinstance(exc_value, str):
exc_msg = exc_value
else:
exc_msg = str(exc_class())
except Exception:
exc_msg = "Unknown"
if exc_name in ('builtins.SystemExit', 'SystemExit'):
match = re.search(r"\.(.+?)\)", exc_value)
exit_code_name = match.groups()[0]
exit_code = getattr(ExitCode, exit_code_name, 'other_error')
elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
log.error(cleanup_ruffus_error_message(exc_value))
exit_code = ExitCode.input_file
elif exc_name in ('builtins.KeyboardInterrupt', 'KeyboardInterrupt'):
# We have to print in this case because the log daemon might be toast
print("Interrupted by user", file=sys.stderr)
exit_code = ExitCode.ctrl_c
elif exc_name == 'subprocess.CalledProcessError':
# It's up to the subprocess handler to report something useful
msg = "Error occurred while running this command:"
log.error(msg + '\n' + exc_value)
exit_code = ExitCode.child_process_error
elif exc_name.startswith('ocrmypdf.exceptions.'):
if exc_msg:
log.error(exc_msg)
elif exc_name == 'PIL.Image.DecompressionBombError':
msg = cleanup_ruffus_error_message(exc_value)
msg += (
"\nUse the --max-image-mpixels argument to set increase the "
"maximum number of megapixels to accept."
)
log.error(msg)
exit_code = ExitCode.input_file
if exit_code is not None:
return exit_code
if not options.verbose:
log.error(exc_stack)
return ExitCode.other_error
def traverse_ruffus_exception(exceptions, options, log):
"""Traverse a RethrownJobError and output the exceptions
Ruffus presents exceptions as 5 element tuples. The RethrownJobException
has a list of exceptions like
e.job_exceptions = [(5-tuple), (5-tuple), ...]
ruffus < 2.7.0 had a bug with exception marshalling that would give
different output whether the main or child process raised the exception.
We no longer support this.
Attempting to log the exception itself will re-marshall it to the logger
which is normally running in another process. It's better to avoid re-
marshalling.
The exit code will be based on this, even if multiple exceptions occurred
at the same time."""
exit_codes = []
for exc in exceptions:
exit_code = do_ruffus_exception(exc, options, log)
exit_codes.append(exit_code)
return exit_codes[0] # Multiple codes are rare so take the first one
def check_closed_streams(options):
"""Work around Python issue with multiprocessing forking on closed streams
https://bugs.python.org/issue28326
Attempting to a fork/exec a new Python process when any of std{in,out,err}
are closed or not flushable for some reason may raise an exception.
Fix this by opening devnull if the handle seems to be closed. Do this
globally to avoid tracking places all places that fork.
Seems to be specific to multiprocessing.Process not all Python process
forkers.
The error actually occurs when the stream object is not flushable,
but replacing an open stream object that is not flushable with
/dev/null is a bad idea since it will create a silent failure. Replacing
a closed handle with /dev/null seems safe.
"""
if sys.version_info[0:3] >= (3, 6, 4):
return True # Issued fixed in Python 3.6.4+
if sys.stderr is None:
sys.stderr = open(os.devnull, 'w')
if sys.stdin is None:
if options.input_file == '-':
print("Trying to read from stdin but stdin seems closed", file=sys.stderr)
return False
sys.stdin = open(os.devnull, 'r')
if sys.stdout is None:
if options.output_file == '-':
# Can't replace stdout if the user is piping
# If this case can even happen, it must be some kind of weird
# stream.
print(
textwrap.dedent(
"""\
Output was set to stdout '-' but the stream attached to
stdout does not support the flush() system call. This
will fail."""
),
file=sys.stderr,
)
return False
sys.stdout = open(os.devnull, 'w')
return True
def log_page_orientations(pdfinfo, _log):
direction = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
orientations = []
for n, page in enumerate(pdfinfo):
angle = page.rotation or 0
if angle != 0:
orientations.append('{0}{1}'.format(n + 1, direction.get(angle, '')))
if orientations:
_log.info('Page orientations detected: ' + ' '.join(orientations))
def preamble(_log):
_log.debug('ocrmypdf ' + VERSION)
def check_environ(options, _log):
old_envvars = (
'OCRMYPDF_TESSERACT',
'OCRMYPDF_QPDF',
'OCRMYPDF_GS',
'OCRMYPDF_UNPAPER',
)
for k in old_envvars:
if k in os.environ:
_log.warning(
textwrap.dedent(
f"""\
OCRmyPDF no longer uses the environment variable {k}.
Change PATH to select alternate programs."""
)
)
def check_input_file(options, _log, start_input_file):
if options.input_file == '-':
# stdin
_log.info('reading file from standard input')
with open(start_input_file, 'wb') as stream_buffer:
from shutil import copyfileobj
copyfileobj(sys.stdin.buffer, stream_buffer)
else:
try:
re_symlink(options.input_file, start_input_file, _log)
except FileNotFoundError:
_log.error("File not found - " + options.input_file)
raise InputFileError()
def check_requested_output_file(options, _log):
if options.output_file == '-':
if sys.stdout.isatty():
_log.error(
textwrap.dedent(
"""\
Output was set to stdout '-' but it looks like stdout
is connected to a terminal. Please redirect stdout to a
file."""
)
)
raise BadArgsError()
elif not is_file_writable(options.output_file):
_log.error(
"Output file location ("
+ options.output_file
+ ") "
+ "is not a writable file."
)
raise OutputFileAccessError()
def report_output_file_size(options, _log, input_file, output_file):
try:
output_size = Path(output_file).stat().st_size
input_size = Path(input_file).stat().st_size
except FileNotFoundError:
return # Outputting to stream or something
ratio = output_size / input_size
if ratio < 1.35 or input_size < 25000:
return # Seems fine
reasons = []
image_preproc = {
'deskew',
'clean_final',
'remove_background',
'oversample',
'force_ocr',
}
for arg in image_preproc:
attr = getattr(options, arg, None)
if not attr:
continue
reasons.append(
f"The argument --{arg.replace('_', '-')} was issued, causing transcoding."
)
if reasons:
explanation = "Possible reasons for this include:\n" + '\n'.join(reasons) + "\n"
else:
explanation = "No reason for this increase is known. Please report this issue."
_log.warning(
textwrap.dedent(
f"""\
The output file size is {ratio:.2f}× larger than the input file.
{explanation}
"""
)
)
def check_dependency_versions(options, log):
check_external_program(
log=log,
program='tesseract',
package={'darwin': 'tesseract', 'linux': 'tesseract-ocr'},
version_checker=tesseract.version,
need_version='4.0.0', # using backport for Travis CI
)
check_external_program(
log=log,
program='gs',
package='ghostscript',
version_checker=ghostscript.version,
need_version='9.15', # limited by Travis CI / Ubuntu 14.04 backports
)
if ghostscript.version() == '9.24':
complain(
"Ghostscript 9.24 contains serious regressions and is not "
"supported. Please upgrade to Ghostscript 9.25 or use an older "
"version."
)
return ExitCode.missing_dependency
check_external_program(
log=log,
program='qpdf',
package='qpdf',
version_checker=qpdf.version,
need_version='8.0.2',
)
def run_pipeline(args=None):
def run(args=None):
options = parser.parse_args(args=args)
options.verbose_abbreviated_path = 1
if os.environ.get('_OCRMYPDF_THREADS'):
options.use_threads = True
if not check_closed_streams(options):
return ExitCode.bad_args
logger_args = {'verbose': options.verbose, 'quiet': options.quiet}
_log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
logging_factory, __name__, logger_args
)
preamble(_log)
check_options(options, _log)
check_dependency_versions(options, _log)
# Any changes to options will not take effect for options that are already
# bound to function parameters in the pipeline. (For example
# options.input_file, options.pdf_renderer are already bound.)
if not options.jobs:
options.jobs = available_cpu_count()
# Performance is improved by setting Tesseract to single threaded. In tests
# this gives better throughput than letting a smaller number of Tesseract
# jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
# variable, but harmless to set if ignored.
os.environ.setdefault('OMP_THREAD_LIMIT', '1')
check_environ(options, _log)
if os.environ.get('PYTEST_CURRENT_TEST'):
os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file
try:
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
start_input_file = os.path.join(work_folder, 'origin')
check_input_file(options, _log, start_input_file)
check_requested_output_file(options, _log)
manager = JobContextManager()
manager.register('JobContext', JobContext) # pylint: disable=no-member
manager.start()
context = manager.JobContext() # pylint: disable=no-member
context.set_options(options)
context.set_work_folder(work_folder)
build_pipeline(options, work_folder, _log, context)
atexit.register(cleanup_working_files, work_folder, options)
if hasattr(os, 'nice'):
os.nice(5)
cmdline.run(options)
except ruffus_exceptions.RethrownJobError as e:
if options.verbose:
_log.debug(str(e)) # stringify exception so logger doesn't have to
exceptions = e.job_exceptions
exitcode = traverse_ruffus_exception(exceptions, options, _log)
if exitcode is None:
_log.error("Unexpected ruffus exception: " + str(e))
_log.error(repr(e))
return ExitCode.other_error
return exitcode
except ExitCodeException as e:
return e.exit_code
except Exception as e:
_log.error(str(e))
return ExitCode.other_error
if options.flowchart:
_log.info(f"Flowchart saved to {options.flowchart}")
return ExitCode.ok
elif options.output_file == '-':
_log.info("Output sent to stdout")
elif os.path.samefile(options.output_file, os.devnull):
pass # Say nothing when sending to dev null
else:
if options.output_type.startswith('pdfa'):
pdfa_info = file_claims_pdfa(options.output_file)
if pdfa_info['pass']:
msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
_log.info(msg)
else:
msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
_log.warning(msg)
return ExitCode.pdfa_conversion_failed
if not qpdf.check(options.output_file, _log):
_log.warning('Output file: The generated PDF is INVALID')
return ExitCode.invalid_output_pdf
report_output_file_size(options, _log, start_input_file, options.output_file)
pdfinfo = context.get_pdfinfo()
if options.verbose:
from pprint import pformat
_log.debug(pformat(pdfinfo))
log_page_orientations(pdfinfo, _log)
return ExitCode.ok
return run_pipeline(options)
if __name__ == '__main__':
sys.exit(run_pipeline())
sys.exit(run())

702
src/ocrmypdf/run.py Normal file
View File

@@ -0,0 +1,702 @@
#!/usr/bin/env python3
# © 2015-17 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import atexit
import logging
import os
import re
import sys
import textwrap
from pathlib import Path
from tempfile import mkdtemp
import PIL
import ruffus.cmdline as cmdline
import ruffus.proxy_logger as proxy_logger
import ruffus.ruffus_exceptions as ruffus_exceptions
from . import VERSION
from . import exceptions as ocrmypdf_exceptions
from ._jobcontext import JobContext, JobContextManager, cleanup_working_files
from ._pipeline import build_pipeline
from ._unicodefun import verify_python3_env
from .exceptions import (
BadArgsError,
ExitCode,
ExitCodeException,
InputFileError,
MissingDependencyError,
OutputFileAccessError,
)
from .exec import (
ghostscript,
jbig2enc,
qpdf,
tesseract,
check_external_program,
unpaper,
pngquant,
)
from .helpers import available_cpu_count, is_file_writable, re_symlink
from .pdfa import file_claims_pdfa
# -------------
# External dependencies
HOCR_OK_LANGS = frozenset(['eng', 'deu', 'spa', 'ita', 'por'])
def complain(message):
print(*textwrap.wrap(message), file=sys.stderr)
# --------
# Critical environment tests
verify_python3_env()
def check_options_languages(options, _log):
if not options.language:
options.language = ['eng'] # Enforce English hegemony
# Support v2.x "eng+deu" language syntax
if '+' in options.language[0]:
options.language = options.language[0].split('+')
languages = set(options.language)
if not languages.issubset(tesseract.languages()):
msg = (
"The installed version of tesseract does not have language "
"data for the following requested languages: \n"
)
for lang in languages - tesseract.languages():
msg += lang + '\n'
raise MissingDependencyError(msg)
def check_options_output(options, log):
# We have these constraints to check for.
# 1. Ghostscript < 9.20 mangles multibyte Unicode
# 2. hocr doesn't work on non-Latin languages (so don't select it)
languages = set(options.language)
is_latin = languages.issubset(HOCR_OK_LANGS)
if options.pdf_renderer == 'hocr' and not is_latin:
msg = (
"The 'hocr' PDF renderer is known to cause problems with one "
"or more of the languages in your document. Use "
"--pdf-renderer auto (the default) to avoid this issue."
)
log.warning(msg)
if ghostscript.version() < '9.20' and options.output_type != 'pdf' and not is_latin:
# https://bugs.ghostscript.com/show_bug.cgi?id=696874
# Ghostscript < 9.20 fails to encode multibyte characters properly
msg = (
"The installed version of Ghostscript does not work correctly "
"with the OCR languages you specified. Use --output-type pdf or "
"upgrade to Ghostscript 9.20 or later to avoid this issue."
)
msg += f"Found Ghostscript {ghostscript.version()}"
log.warning(msg)
# Decide on what renderer to use
if options.pdf_renderer == 'auto':
options.pdf_renderer = 'sandwich'
if options.output_type == 'pdfa':
options.output_type = 'pdfa-2'
if options.output_type == 'pdfa-3' and ghostscript.version() < '9.19':
raise MissingDependencyError(
"--output-type pdfa-3 requires Ghostscript 9.19 or later"
)
lossless_reconstruction = False
if not any(
(
options.deskew,
options.clean_final,
options.force_ocr,
options.remove_background,
)
):
lossless_reconstruction = True
options.lossless_reconstruction = lossless_reconstruction
if not options.lossless_reconstruction and options.redo_ocr:
raise BadArgsError(
"--redo-ocr is not currently compatible with --deskew, "
"--clean-final, and --remove-background",
)
def check_options_sidecar(options, log):
if options.sidecar == '\0':
if options.output_file == '-':
raise BadArgsError(
"--sidecar filename must be specified when output file is " "stdout.",
)
options.sidecar = options.output_file + '.txt'
def check_options_preprocessing(options, log):
if options.clean_final:
options.clean = True
if options.unpaper_args and not options.clean:
raise BadArgsError("--clean is required for --unpaper-args")
if options.clean:
check_external_program(
log=log,
program='unpaper',
package='unpaper',
version_checker=unpaper.version,
need_version='6.1',
required_for=['--clean, --clean-final'],
)
try:
if options.unpaper_args:
options.unpaper_args = unpaper.validate_custom_args(
options.unpaper_args
)
except Exception as e:
raise BadArgsError(str(e))
def check_options_ocr_behavior(options, log):
exclusive_options = sum(
[
(1 if opt else 0)
for opt in (options.force_ocr, options.skip_text, options.redo_ocr)
]
)
if exclusive_options >= 2:
raise BadArgsError(
"Error: choose only one of --force-ocr, --skip-text, --redo-ocr."
)
def check_options_optimizing(options, log):
if options.optimize >= 2:
check_external_program(
log=log,
program='pngquant',
package='pngquant',
version_checker=pngquant.version,
need_version='2.0.1',
required_for='--optimize {2,3}',
)
if options.optimize >= 2:
# Although we use JBIG2 for optimize=1, don't nag about it unless the
# user is asking for more optimization
check_external_program(
log=log,
program='jbig2',
package='jbig2enc',
version_checker=jbig2enc.version,
need_version='0.28',
required_for='--optimize {2,3} | --jbig2-lossy',
recommended=True if not options.jbig2_lossy else False,
)
if options.optimize == 0 and any(
[options.jbig2_lossy, options.png_quality, options.jpeg_quality]
):
log.warning(
"The arguments --jbig2-lossy, --png-quality, and --jpeg-quality "
"will be ignored because --optimize=0."
)
def check_options_advanced(options, log):
if options.pdfa_image_compression != 'auto' and options.output_type.startswith(
'pdfa'
):
log.warning(
"--pdfa-image-compression argument has no effect when "
"--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'"
)
if tesseract.v4() and (options.user_words or options.user_patterns):
log.warning('Tesseract 4.x ignores --user-words, so this has no effect')
def check_options_metadata(options, log):
import unicodedata
docinfo = [options.title, options.author, options.keywords, options.subject]
for s in (m for m in docinfo if m):
for c in s:
if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000:
raise ValueError(
"One of the metadata strings contains "
"an unsupported Unicode character: '{}' (U+{})".format(
c, hex(ord(c))[2:].upper()
)
)
def check_options_pillow(options, log):
PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1_000_000)
if PIL.Image.MAX_IMAGE_PIXELS == 0:
PIL.Image.MAX_IMAGE_PIXELS = None
def check_options(options, log):
try:
check_options_languages(options, log)
check_options_metadata(options, log)
check_options_output(options, log)
check_options_sidecar(options, log)
check_options_preprocessing(options, log)
check_options_ocr_behavior(options, log)
check_options_optimizing(options, log)
check_options_advanced(options, log)
check_options_pillow(options, log)
return ExitCode.ok
except ValueError as e:
log.error(e)
return ExitCode.bad_args
except BadArgsError as e:
log.error(e)
return e.exit_code
except MissingDependencyError as e:
log.error(e)
return ExitCode.missing_dependency
# ----------
# Logging
def logging_factory(logger_name, logger_args):
verbose = logger_args['verbose']
quiet = logger_args['quiet']
root_logger = logging.getLogger(logger_name)
root_logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stderr)
formatter_ = logging.Formatter("%(levelname)7s - %(message)s")
handler.setFormatter(formatter_)
if verbose:
handler.setLevel(logging.DEBUG)
elif quiet:
handler.setLevel(logging.WARNING)
else:
handler.setLevel(logging.INFO)
root_logger.addHandler(handler)
return root_logger
def cleanup_ruffus_error_message(msg):
msg = re.sub(r'\s+', r' ', msg)
msg = re.sub(r"\((.+?)\)", r'\1', msg)
msg = msg.strip()
return msg
def do_ruffus_exception(ruffus_five_tuple, options, log):
"""Replace the elaborate ruffus stack trace with a user friendly
description of the error message that occurred."""
exit_code = None
_task_name, _job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple
if isinstance(exc_name, type):
# ruffus is full of mystery... sometimes (probably when the process
# group leader is killed) exc_name is the class object of the exception,
# rather than a str. So reach into the object and get its name.
exc_name = exc_name.__name__
if exc_name.startswith('ocrmypdf.exceptions.'):
base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
exit_code = getattr(exc_class, 'exit_code', ExitCode.other_error)
try:
if isinstance(exc_value, exc_class):
exc_msg = str(exc_value)
elif isinstance(exc_value, str):
exc_msg = exc_value
else:
exc_msg = str(exc_class())
except Exception:
exc_msg = "Unknown"
if exc_name in ('builtins.SystemExit', 'SystemExit'):
match = re.search(r"\.(.+?)\)", exc_value)
exit_code_name = match.groups()[0]
exit_code = getattr(ExitCode, exit_code_name, 'other_error')
elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
log.error(cleanup_ruffus_error_message(exc_value))
exit_code = ExitCode.input_file
elif exc_name in ('builtins.KeyboardInterrupt', 'KeyboardInterrupt'):
# We have to print in this case because the log daemon might be toast
print("Interrupted by user", file=sys.stderr)
exit_code = ExitCode.ctrl_c
elif exc_name == 'subprocess.CalledProcessError':
# It's up to the subprocess handler to report something useful
msg = "Error occurred while running this command:"
log.error(msg + '\n' + exc_value)
exit_code = ExitCode.child_process_error
elif exc_name.startswith('ocrmypdf.exceptions.'):
if exc_msg:
log.error(exc_msg)
elif exc_name == 'PIL.Image.DecompressionBombError':
msg = cleanup_ruffus_error_message(exc_value)
msg += (
"\nUse the --max-image-mpixels argument to set increase the "
"maximum number of megapixels to accept."
)
log.error(msg)
exit_code = ExitCode.input_file
if exit_code is not None:
return exit_code
if not options.verbose:
log.error(exc_stack)
return ExitCode.other_error
def traverse_ruffus_exception(exceptions, options, log):
"""Traverse a RethrownJobError and output the exceptions
Ruffus presents exceptions as 5 element tuples. The RethrownJobException
has a list of exceptions like
e.job_exceptions = [(5-tuple), (5-tuple), ...]
ruffus < 2.7.0 had a bug with exception marshalling that would give
different output whether the main or child process raised the exception.
We no longer support this.
Attempting to log the exception itself will re-marshall it to the logger
which is normally running in another process. It's better to avoid re-
marshalling.
The exit code will be based on this, even if multiple exceptions occurred
at the same time."""
exit_codes = []
for exc in exceptions:
exit_code = do_ruffus_exception(exc, options, log)
exit_codes.append(exit_code)
return exit_codes[0] # Multiple codes are rare so take the first one
def check_closed_streams(options):
"""Work around Python issue with multiprocessing forking on closed streams
https://bugs.python.org/issue28326
Attempting to a fork/exec a new Python process when any of std{in,out,err}
are closed or not flushable for some reason may raise an exception.
Fix this by opening devnull if the handle seems to be closed. Do this
globally to avoid tracking places all places that fork.
Seems to be specific to multiprocessing.Process not all Python process
forkers.
The error actually occurs when the stream object is not flushable,
but replacing an open stream object that is not flushable with
/dev/null is a bad idea since it will create a silent failure. Replacing
a closed handle with /dev/null seems safe.
"""
if sys.version_info[0:3] >= (3, 6, 4):
return True # Issued fixed in Python 3.6.4+
if sys.stderr is None:
sys.stderr = open(os.devnull, 'w')
if sys.stdin is None:
if options.input_file == '-':
print("Trying to read from stdin but stdin seems closed", file=sys.stderr)
return False
sys.stdin = open(os.devnull, 'r')
if sys.stdout is None:
if options.output_file == '-':
# Can't replace stdout if the user is piping
# If this case can even happen, it must be some kind of weird
# stream.
print(
textwrap.dedent(
"""\
Output was set to stdout '-' but the stream attached to
stdout does not support the flush() system call. This
will fail."""
),
file=sys.stderr,
)
return False
sys.stdout = open(os.devnull, 'w')
return True
def log_page_orientations(pdfinfo, _log):
direction = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
orientations = []
for n, page in enumerate(pdfinfo):
angle = page.rotation or 0
if angle != 0:
orientations.append('{0}{1}'.format(n + 1, direction.get(angle, '')))
if orientations:
_log.info('Page orientations detected: ' + ' '.join(orientations))
def preamble(_log):
_log.debug('ocrmypdf ' + VERSION)
def check_environ(options, _log):
old_envvars = (
'OCRMYPDF_TESSERACT',
'OCRMYPDF_QPDF',
'OCRMYPDF_GS',
'OCRMYPDF_UNPAPER',
)
for k in old_envvars:
if k in os.environ:
_log.warning(
textwrap.dedent(
f"""\
OCRmyPDF no longer uses the environment variable {k}.
Change PATH to select alternate programs."""
)
)
def check_input_file(options, _log, start_input_file):
if options.input_file == '-':
# stdin
_log.info('reading file from standard input')
with open(start_input_file, 'wb') as stream_buffer:
from shutil import copyfileobj
copyfileobj(sys.stdin.buffer, stream_buffer)
else:
try:
re_symlink(options.input_file, start_input_file, _log)
except FileNotFoundError:
_log.error("File not found - " + options.input_file)
raise InputFileError()
def check_requested_output_file(options, _log):
if options.output_file == '-':
if sys.stdout.isatty():
_log.error(
textwrap.dedent(
"""\
Output was set to stdout '-' but it looks like stdout
is connected to a terminal. Please redirect stdout to a
file."""
)
)
raise BadArgsError()
elif not is_file_writable(options.output_file):
_log.error(
"Output file location ("
+ options.output_file
+ ") "
+ "is not a writable file."
)
raise OutputFileAccessError()
def report_output_file_size(options, _log, input_file, output_file):
try:
output_size = Path(output_file).stat().st_size
input_size = Path(input_file).stat().st_size
except FileNotFoundError:
return # Outputting to stream or something
ratio = output_size / input_size
if ratio < 1.35 or input_size < 25000:
return # Seems fine
reasons = []
image_preproc = {
'deskew',
'clean_final',
'remove_background',
'oversample',
'force_ocr',
}
for arg in image_preproc:
attr = getattr(options, arg, None)
if not attr:
continue
reasons.append(
f"The argument --{arg.replace('_', '-')} was issued, causing transcoding."
)
if reasons:
explanation = "Possible reasons for this include:\n" + '\n'.join(reasons) + "\n"
else:
explanation = "No reason for this increase is known. Please report this issue."
_log.warning(
textwrap.dedent(
f"""\
The output file size is {ratio:.2f}× larger than the input file.
{explanation}
"""
)
)
def check_dependency_versions(options, log):
check_external_program(
log=log,
program='tesseract',
package={'darwin': 'tesseract', 'linux': 'tesseract-ocr'},
version_checker=tesseract.version,
need_version='4.0.0', # using backport for Travis CI
)
check_external_program(
log=log,
program='gs',
package='ghostscript',
version_checker=ghostscript.version,
need_version='9.15', # limited by Travis CI / Ubuntu 14.04 backports
)
if ghostscript.version() == '9.24':
complain(
"Ghostscript 9.24 contains serious regressions and is not "
"supported. Please upgrade to Ghostscript 9.25 or use an older "
"version."
)
return ExitCode.missing_dependency
check_external_program(
log=log,
program='qpdf',
package='qpdf',
version_checker=qpdf.version,
need_version='8.0.2',
)
def run_pipeline(options):
options.verbose_abbreviated_path = 1
if os.environ.get('_OCRMYPDF_THREADS'):
options.use_threads = True
if not check_closed_streams(options):
return ExitCode.bad_args
logger_args = {'verbose': options.verbose, 'quiet': options.quiet}
_log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
logging_factory, __name__, logger_args
)
preamble(_log)
check_code = check_options(options, _log)
if check_code != ExitCode.ok:
return check_code
check_dependency_versions(options, _log)
# Any changes to options will not take effect for options that are already
# bound to function parameters in the pipeline. (For example
# options.input_file, options.pdf_renderer are already bound.)
if not options.jobs:
options.jobs = available_cpu_count()
# Performance is improved by setting Tesseract to single threaded. In tests
# this gives better throughput than letting a smaller number of Tesseract
# jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
# variable, but harmless to set if ignored.
os.environ.setdefault('OMP_THREAD_LIMIT', '1')
check_environ(options, _log)
if os.environ.get('PYTEST_CURRENT_TEST'):
os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file
try:
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
start_input_file = os.path.join(work_folder, 'origin')
check_input_file(options, _log, start_input_file)
check_requested_output_file(options, _log)
manager = JobContextManager()
manager.register('JobContext', JobContext) # pylint: disable=no-member
manager.start()
context = manager.JobContext() # pylint: disable=no-member
context.set_options(options)
context.set_work_folder(work_folder)
build_pipeline(options, work_folder, _log, context)
atexit.register(cleanup_working_files, work_folder, options)
if hasattr(os, 'nice'):
os.nice(5)
cmdline.run(options)
except ruffus_exceptions.RethrownJobError as e:
if options.verbose:
_log.debug(str(e)) # stringify exception so logger doesn't have to
exceptions = e.job_exceptions
exitcode = traverse_ruffus_exception(exceptions, options, _log)
if exitcode is None:
_log.error("Unexpected ruffus exception: " + str(e))
_log.error(repr(e))
return ExitCode.other_error
return exitcode
except ExitCodeException as e:
return e.exit_code
except Exception as e:
_log.error(str(e))
return ExitCode.other_error
if options.flowchart:
_log.info(f"Flowchart saved to {options.flowchart}")
return ExitCode.ok
elif options.output_file == '-':
_log.info("Output sent to stdout")
elif os.path.samefile(options.output_file, os.devnull):
pass # Say nothing when sending to dev null
else:
if options.output_type.startswith('pdfa'):
pdfa_info = file_claims_pdfa(options.output_file)
if pdfa_info['pass']:
msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
_log.info(msg)
else:
msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
_log.warning(msg)
return ExitCode.pdfa_conversion_failed
if not qpdf.check(options.output_file, _log):
_log.warning('Output file: The generated PDF is INVALID')
return ExitCode.invalid_output_pdf
report_output_file_size(options, _log, start_input_file, options.output_file)
pdfinfo = context.get_pdfinfo()
if options.verbose:
from pprint import pformat
_log.debug(pformat(pdfinfo))
log_page_orientations(pdfinfo, _log)
return ExitCode.ok