diff --git a/setup.py b/setup.py
index 560434f8..74aa1039 100644
--- a/setup.py
+++ b/setup.py
@@ -108,7 +108,7 @@ setup(
],
extras_require={'pdfminer': ['pdfminer.six == 20181108']},
tests_require=tests_require,
- entry_points={'console_scripts': ['ocrmypdf = ocrmypdf.__main__:run_pipeline']},
+ entry_points={'console_scripts': ['ocrmypdf = ocrmypdf.__main__:run']},
package_data={'ocrmypdf': ['data/sRGB.icc']},
include_package_data=True,
zip_safe=False,
diff --git a/src/ocrmypdf/__init__.py b/src/ocrmypdf/__init__.py
index a37d2658..bc2e6791 100644
--- a/src/ocrmypdf/__init__.py
+++ b/src/ocrmypdf/__init__.py
@@ -44,3 +44,4 @@ from . import hocrtransform
from . import leptonica
from . import pdfa
from . import pdfinfo
+from .run import run_pipeline as run
diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py
index 3f2042df..9a7b1b66 100755
--- a/src/ocrmypdf/__main__.py
+++ b/src/ocrmypdf/__main__.py
@@ -17,68 +17,19 @@
# along with OCRmyPDF. If not, see .
import argparse
-import atexit
-import logging
import os
-import re
import sys
-import textwrap
-from pathlib import Path
-from tempfile import mkdtemp
-
-import PIL
-import ruffus.cmdline as cmdline
-import ruffus.proxy_logger as proxy_logger
-import ruffus.ruffus_exceptions as ruffus_exceptions
from . import PROGRAM_NAME, VERSION
-from . import exceptions as ocrmypdf_exceptions
-from ._jobcontext import JobContext, JobContextManager, cleanup_working_files
-from ._pipeline import build_pipeline
-from ._unicodefun import verify_python3_env
-from .exceptions import (
- BadArgsError,
- ExitCode,
- ExitCodeException,
- InputFileError,
- MissingDependencyError,
- OutputFileAccessError,
-)
-from .exec import (
- ghostscript,
- jbig2enc,
- qpdf,
- tesseract,
- check_external_program,
- unpaper,
- pngquant,
-)
-from .helpers import available_cpu_count, is_file_writable, re_symlink
-from .pdfa import file_claims_pdfa
-
-# -------------
-# External dependencies
-
-HOCR_OK_LANGS = frozenset(['eng', 'deu', 'spa', 'ita', 'por'])
-
-
-def complain(message):
- print(*textwrap.wrap(message), file=sys.stderr)
-
+from .run import run_pipeline
# Hack to help debugger context find /usr/local/bin
if 'IDE_PROJECT_ROOTS' in os.environ:
os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH']
-# --------
-# Critical environment tests
-
-verify_python3_env()
-
# -------------
# Parser
-
def numeric(basetype, min_=None, max_=None):
"""Validator for numeric params"""
min_ = basetype(min_) if min_ is not None else None
@@ -514,638 +465,9 @@ debugging.add_argument(
'--flowchart', type=str, help="Generate the pipeline execution flowchart"
)
-
-def check_options_languages(options, _log):
- if not options.language:
- options.language = ['eng'] # Enforce English hegemony
-
- # Support v2.x "eng+deu" language syntax
- if '+' in options.language[0]:
- options.language = options.language[0].split('+')
-
- languages = set(options.language)
- if not languages.issubset(tesseract.languages()):
- msg = (
- "The installed version of tesseract does not have language "
- "data for the following requested languages: \n"
- )
- for lang in languages - tesseract.languages():
- msg += lang + '\n'
- raise MissingDependencyError(msg)
-
-
-def check_options_output(options, log):
- # We have these constraints to check for.
- # 1. Ghostscript < 9.20 mangles multibyte Unicode
- # 2. hocr doesn't work on non-Latin languages (so don't select it)
-
- languages = set(options.language)
- is_latin = languages.issubset(HOCR_OK_LANGS)
-
- if options.pdf_renderer == 'hocr' and not is_latin:
- msg = (
- "The 'hocr' PDF renderer is known to cause problems with one "
- "or more of the languages in your document. Use "
- "--pdf-renderer auto (the default) to avoid this issue."
- )
- log.warning(msg)
-
- if ghostscript.version() < '9.20' and options.output_type != 'pdf' and not is_latin:
- # https://bugs.ghostscript.com/show_bug.cgi?id=696874
- # Ghostscript < 9.20 fails to encode multibyte characters properly
- msg = (
- "The installed version of Ghostscript does not work correctly "
- "with the OCR languages you specified. Use --output-type pdf or "
- "upgrade to Ghostscript 9.20 or later to avoid this issue."
- )
- msg += f"Found Ghostscript {ghostscript.version()}"
- log.warning(msg)
-
- # Decide on what renderer to use
- if options.pdf_renderer == 'auto':
- options.pdf_renderer = 'sandwich'
-
- if options.output_type == 'pdfa':
- options.output_type = 'pdfa-2'
-
- if options.output_type == 'pdfa-3' and ghostscript.version() < '9.19':
- raise MissingDependencyError(
- "--output-type pdfa-3 requires Ghostscript 9.19 or later"
- )
-
- lossless_reconstruction = False
- if not any(
- (
- options.deskew,
- options.clean_final,
- options.force_ocr,
- options.remove_background,
- )
- ):
- lossless_reconstruction = True
- options.lossless_reconstruction = lossless_reconstruction
-
- if not options.lossless_reconstruction and options.redo_ocr:
- raise argparse.ArgumentError(
- None,
- "--redo-ocr is not currently compatible with --deskew, "
- "--clean-final, and --remove-background",
- )
-
-
-def check_options_sidecar(options, log):
- if options.sidecar == '\0':
- if options.output_file == '-':
- raise argparse.ArgumentError(
- None,
- "--sidecar filename must be specified when output file is " "stdout.",
- )
- options.sidecar = options.output_file + '.txt'
-
-
-def check_options_preprocessing(options, log):
- if options.clean_final:
- options.clean = True
- if options.unpaper_args and not options.clean:
- raise argparse.ArgumentError(None, "--clean is required for --unpaper-args")
- if options.clean:
- check_external_program(
- log=log,
- program='unpaper',
- package='unpaper',
- version_checker=unpaper.version,
- need_version='6.1',
- required_for=['--clean, --clean-final'],
- )
- try:
- if options.unpaper_args:
- options.unpaper_args = unpaper.validate_custom_args(
- options.unpaper_args
- )
- except Exception as e:
- raise argparse.ArgumentError(None, str(e))
-
-
-def check_options_ocr_behavior(options, log):
- exclusive_options = sum(
- [
- (1 if opt else 0)
- for opt in (options.force_ocr, options.skip_text, options.redo_ocr)
- ]
- )
- if exclusive_options >= 2:
- raise argparse.ArgumentError(
- None, "Error: choose only one of --force-ocr, --skip-text, --redo-ocr."
- )
-
-
-def check_options_optimizing(options, log):
- if options.optimize >= 2:
- check_external_program(
- log=log,
- program='pngquant',
- package='pngquant',
- version_checker=pngquant.version,
- need_version='2.0.1',
- required_for='--optimize {2,3}',
- )
-
- if options.optimize >= 2:
- # Although we use JBIG2 for optimize=1, don't nag about it unless the
- # user is asking for more optimization
- check_external_program(
- log=log,
- program='jbig2',
- package='jbig2enc',
- version_checker=jbig2enc.version,
- need_version='0.28',
- required_for='--optimize {2,3} | --jbig2-lossy',
- recommended=True if not options.jbig2_lossy else False,
- )
-
- if options.optimize == 0 and any(
- [options.jbig2_lossy, options.png_quality, options.jpeg_quality]
- ):
- log.warning(
- "The arguments --jbig2-lossy, --png-quality, and --jpeg-quality "
- "will be ignored because --optimize=0."
- )
-
-
-def check_options_advanced(options, log):
- if options.pdfa_image_compression != 'auto' and options.output_type.startswith(
- 'pdfa'
- ):
- log.warning(
- "--pdfa-image-compression argument has no effect when "
- "--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'"
- )
- if tesseract.v4() and (options.user_words or options.user_patterns):
- log.warning('Tesseract 4.x ignores --user-words, so this has no effect')
-
-
-def check_options_metadata(options, log):
- import unicodedata
-
- docinfo = [options.title, options.author, options.keywords, options.subject]
- for s in (m for m in docinfo if m):
- for c in s:
- if unicodedata.category(c) == 'Co' or ord(c) >= 0x10000:
- raise ValueError(
- "One of the metadata strings contains "
- "an unsupported Unicode character: '{}' (U+{})".format(
- c, hex(ord(c))[2:].upper()
- )
- )
-
-
-def check_options_pillow(options, log):
- PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1_000_000)
- if PIL.Image.MAX_IMAGE_PIXELS == 0:
- PIL.Image.MAX_IMAGE_PIXELS = None
-
-
-def check_options(options, log):
- try:
- check_options_languages(options, log)
- check_options_metadata(options, log)
- check_options_output(options, log)
- check_options_sidecar(options, log)
- check_options_preprocessing(options, log)
- check_options_ocr_behavior(options, log)
- check_options_optimizing(options, log)
- check_options_advanced(options, log)
- check_options_pillow(options, log)
- except ValueError as e:
- log.error(e)
- sys.exit(ExitCode.bad_args)
- except argparse.ArgumentError as e:
- log.error(e)
- sys.exit(ExitCode.bad_args)
- except MissingDependencyError as e:
- log.error(e)
- sys.exit(ExitCode.missing_dependency)
-
-
-# ----------
-# Logging
-
-
-def logging_factory(logger_name, logger_args):
- verbose = logger_args['verbose']
- quiet = logger_args['quiet']
-
- root_logger = logging.getLogger(logger_name)
- root_logger.setLevel(logging.DEBUG)
-
- handler = logging.StreamHandler(sys.stderr)
- formatter_ = logging.Formatter("%(levelname)7s - %(message)s")
- handler.setFormatter(formatter_)
- if verbose:
- handler.setLevel(logging.DEBUG)
- elif quiet:
- handler.setLevel(logging.WARNING)
- else:
- handler.setLevel(logging.INFO)
- root_logger.addHandler(handler)
- return root_logger
-
-
-def cleanup_ruffus_error_message(msg):
- msg = re.sub(r'\s+', r' ', msg)
- msg = re.sub(r"\((.+?)\)", r'\1', msg)
- msg = msg.strip()
- return msg
-
-
-def do_ruffus_exception(ruffus_five_tuple, options, log):
- """Replace the elaborate ruffus stack trace with a user friendly
- description of the error message that occurred."""
- exit_code = None
-
- _task_name, _job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple
-
- if isinstance(exc_name, type):
- # ruffus is full of mystery... sometimes (probably when the process
- # group leader is killed) exc_name is the class object of the exception,
- # rather than a str. So reach into the object and get its name.
- exc_name = exc_name.__name__
-
- if exc_name.startswith('ocrmypdf.exceptions.'):
- base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
- exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
- exit_code = getattr(exc_class, 'exit_code', ExitCode.other_error)
- try:
- if isinstance(exc_value, exc_class):
- exc_msg = str(exc_value)
- elif isinstance(exc_value, str):
- exc_msg = exc_value
- else:
- exc_msg = str(exc_class())
- except Exception:
- exc_msg = "Unknown"
-
- if exc_name in ('builtins.SystemExit', 'SystemExit'):
- match = re.search(r"\.(.+?)\)", exc_value)
- exit_code_name = match.groups()[0]
- exit_code = getattr(ExitCode, exit_code_name, 'other_error')
- elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
- log.error(cleanup_ruffus_error_message(exc_value))
- exit_code = ExitCode.input_file
- elif exc_name in ('builtins.KeyboardInterrupt', 'KeyboardInterrupt'):
- # We have to print in this case because the log daemon might be toast
- print("Interrupted by user", file=sys.stderr)
- exit_code = ExitCode.ctrl_c
- elif exc_name == 'subprocess.CalledProcessError':
- # It's up to the subprocess handler to report something useful
- msg = "Error occurred while running this command:"
- log.error(msg + '\n' + exc_value)
- exit_code = ExitCode.child_process_error
- elif exc_name.startswith('ocrmypdf.exceptions.'):
- if exc_msg:
- log.error(exc_msg)
- elif exc_name == 'PIL.Image.DecompressionBombError':
- msg = cleanup_ruffus_error_message(exc_value)
- msg += (
- "\nUse the --max-image-mpixels argument to set increase the "
- "maximum number of megapixels to accept."
- )
- log.error(msg)
- exit_code = ExitCode.input_file
-
- if exit_code is not None:
- return exit_code
-
- if not options.verbose:
- log.error(exc_stack)
- return ExitCode.other_error
-
-
-def traverse_ruffus_exception(exceptions, options, log):
- """Traverse a RethrownJobError and output the exceptions
-
- Ruffus presents exceptions as 5 element tuples. The RethrownJobException
- has a list of exceptions like
- e.job_exceptions = [(5-tuple), (5-tuple), ...]
-
- ruffus < 2.7.0 had a bug with exception marshalling that would give
- different output whether the main or child process raised the exception.
- We no longer support this.
-
- Attempting to log the exception itself will re-marshall it to the logger
- which is normally running in another process. It's better to avoid re-
- marshalling.
-
- The exit code will be based on this, even if multiple exceptions occurred
- at the same time."""
-
- exit_codes = []
- for exc in exceptions:
- exit_code = do_ruffus_exception(exc, options, log)
- exit_codes.append(exit_code)
-
- return exit_codes[0] # Multiple codes are rare so take the first one
-
-
-def check_closed_streams(options):
- """Work around Python issue with multiprocessing forking on closed streams
-
- https://bugs.python.org/issue28326
-
- Attempting to a fork/exec a new Python process when any of std{in,out,err}
- are closed or not flushable for some reason may raise an exception.
- Fix this by opening devnull if the handle seems to be closed. Do this
- globally to avoid tracking places all places that fork.
-
- Seems to be specific to multiprocessing.Process not all Python process
- forkers.
-
- The error actually occurs when the stream object is not flushable,
- but replacing an open stream object that is not flushable with
- /dev/null is a bad idea since it will create a silent failure. Replacing
- a closed handle with /dev/null seems safe.
-
- """
-
- if sys.version_info[0:3] >= (3, 6, 4):
- return True # Issued fixed in Python 3.6.4+
-
- if sys.stderr is None:
- sys.stderr = open(os.devnull, 'w')
-
- if sys.stdin is None:
- if options.input_file == '-':
- print("Trying to read from stdin but stdin seems closed", file=sys.stderr)
- return False
- sys.stdin = open(os.devnull, 'r')
-
- if sys.stdout is None:
- if options.output_file == '-':
- # Can't replace stdout if the user is piping
- # If this case can even happen, it must be some kind of weird
- # stream.
- print(
- textwrap.dedent(
- """\
- Output was set to stdout '-' but the stream attached to
- stdout does not support the flush() system call. This
- will fail."""
- ),
- file=sys.stderr,
- )
- return False
- sys.stdout = open(os.devnull, 'w')
-
- return True
-
-
-def log_page_orientations(pdfinfo, _log):
- direction = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
- orientations = []
- for n, page in enumerate(pdfinfo):
- angle = page.rotation or 0
- if angle != 0:
- orientations.append('{0}{1}'.format(n + 1, direction.get(angle, '')))
- if orientations:
- _log.info('Page orientations detected: ' + ' '.join(orientations))
-
-
-def preamble(_log):
- _log.debug('ocrmypdf ' + VERSION)
-
-
-def check_environ(options, _log):
- old_envvars = (
- 'OCRMYPDF_TESSERACT',
- 'OCRMYPDF_QPDF',
- 'OCRMYPDF_GS',
- 'OCRMYPDF_UNPAPER',
- )
- for k in old_envvars:
- if k in os.environ:
- _log.warning(
- textwrap.dedent(
- f"""\
- OCRmyPDF no longer uses the environment variable {k}.
- Change PATH to select alternate programs."""
- )
- )
-
-
-def check_input_file(options, _log, start_input_file):
- if options.input_file == '-':
- # stdin
- _log.info('reading file from standard input')
- with open(start_input_file, 'wb') as stream_buffer:
- from shutil import copyfileobj
-
- copyfileobj(sys.stdin.buffer, stream_buffer)
- else:
- try:
- re_symlink(options.input_file, start_input_file, _log)
- except FileNotFoundError:
- _log.error("File not found - " + options.input_file)
- raise InputFileError()
-
-
-def check_requested_output_file(options, _log):
- if options.output_file == '-':
- if sys.stdout.isatty():
- _log.error(
- textwrap.dedent(
- """\
- Output was set to stdout '-' but it looks like stdout
- is connected to a terminal. Please redirect stdout to a
- file."""
- )
- )
- raise BadArgsError()
- elif not is_file_writable(options.output_file):
- _log.error(
- "Output file location ("
- + options.output_file
- + ") "
- + "is not a writable file."
- )
- raise OutputFileAccessError()
-
-
-def report_output_file_size(options, _log, input_file, output_file):
- try:
- output_size = Path(output_file).stat().st_size
- input_size = Path(input_file).stat().st_size
- except FileNotFoundError:
- return # Outputting to stream or something
- ratio = output_size / input_size
- if ratio < 1.35 or input_size < 25000:
- return # Seems fine
-
- reasons = []
- image_preproc = {
- 'deskew',
- 'clean_final',
- 'remove_background',
- 'oversample',
- 'force_ocr',
- }
- for arg in image_preproc:
- attr = getattr(options, arg, None)
- if not attr:
- continue
- reasons.append(
- f"The argument --{arg.replace('_', '-')} was issued, causing transcoding."
- )
-
- if reasons:
- explanation = "Possible reasons for this include:\n" + '\n'.join(reasons) + "\n"
- else:
- explanation = "No reason for this increase is known. Please report this issue."
-
- _log.warning(
- textwrap.dedent(
- f"""\
- The output file size is {ratio:.2f}× larger than the input file.
- {explanation}
- """
- )
- )
-
-
-def check_dependency_versions(options, log):
- check_external_program(
- log=log,
- program='tesseract',
- package={'darwin': 'tesseract', 'linux': 'tesseract-ocr'},
- version_checker=tesseract.version,
- need_version='4.0.0', # using backport for Travis CI
- )
- check_external_program(
- log=log,
- program='gs',
- package='ghostscript',
- version_checker=ghostscript.version,
- need_version='9.15', # limited by Travis CI / Ubuntu 14.04 backports
- )
- if ghostscript.version() == '9.24':
- complain(
- "Ghostscript 9.24 contains serious regressions and is not "
- "supported. Please upgrade to Ghostscript 9.25 or use an older "
- "version."
- )
- return ExitCode.missing_dependency
- check_external_program(
- log=log,
- program='qpdf',
- package='qpdf',
- version_checker=qpdf.version,
- need_version='8.0.2',
- )
-
-
-def run_pipeline(args=None):
+def run(args=None):
options = parser.parse_args(args=args)
- options.verbose_abbreviated_path = 1
- if os.environ.get('_OCRMYPDF_THREADS'):
- options.use_threads = True
-
- if not check_closed_streams(options):
- return ExitCode.bad_args
-
- logger_args = {'verbose': options.verbose, 'quiet': options.quiet}
-
- _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
- logging_factory, __name__, logger_args
- )
- preamble(_log)
- check_options(options, _log)
- check_dependency_versions(options, _log)
-
- # Any changes to options will not take effect for options that are already
- # bound to function parameters in the pipeline. (For example
- # options.input_file, options.pdf_renderer are already bound.)
- if not options.jobs:
- options.jobs = available_cpu_count()
-
- # Performance is improved by setting Tesseract to single threaded. In tests
- # this gives better throughput than letting a smaller number of Tesseract
- # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
- # variable, but harmless to set if ignored.
- os.environ.setdefault('OMP_THREAD_LIMIT', '1')
-
- check_environ(options, _log)
- if os.environ.get('PYTEST_CURRENT_TEST'):
- os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file
-
- try:
- work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
- options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
- start_input_file = os.path.join(work_folder, 'origin')
-
- check_input_file(options, _log, start_input_file)
- check_requested_output_file(options, _log)
-
- manager = JobContextManager()
- manager.register('JobContext', JobContext) # pylint: disable=no-member
- manager.start()
-
- context = manager.JobContext() # pylint: disable=no-member
- context.set_options(options)
- context.set_work_folder(work_folder)
-
- build_pipeline(options, work_folder, _log, context)
- atexit.register(cleanup_working_files, work_folder, options)
- if hasattr(os, 'nice'):
- os.nice(5)
- cmdline.run(options)
- except ruffus_exceptions.RethrownJobError as e:
- if options.verbose:
- _log.debug(str(e)) # stringify exception so logger doesn't have to
- exceptions = e.job_exceptions
- exitcode = traverse_ruffus_exception(exceptions, options, _log)
- if exitcode is None:
- _log.error("Unexpected ruffus exception: " + str(e))
- _log.error(repr(e))
- return ExitCode.other_error
- return exitcode
- except ExitCodeException as e:
- return e.exit_code
- except Exception as e:
- _log.error(str(e))
- return ExitCode.other_error
-
- if options.flowchart:
- _log.info(f"Flowchart saved to {options.flowchart}")
- return ExitCode.ok
- elif options.output_file == '-':
- _log.info("Output sent to stdout")
- elif os.path.samefile(options.output_file, os.devnull):
- pass # Say nothing when sending to dev null
- else:
- if options.output_type.startswith('pdfa'):
- pdfa_info = file_claims_pdfa(options.output_file)
- if pdfa_info['pass']:
- msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
- _log.info(msg)
- else:
- msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
- _log.warning(msg)
- return ExitCode.pdfa_conversion_failed
- if not qpdf.check(options.output_file, _log):
- _log.warning('Output file: The generated PDF is INVALID')
- return ExitCode.invalid_output_pdf
-
- report_output_file_size(options, _log, start_input_file, options.output_file)
-
- pdfinfo = context.get_pdfinfo()
- if options.verbose:
- from pprint import pformat
-
- _log.debug(pformat(pdfinfo))
-
- log_page_orientations(pdfinfo, _log)
-
- return ExitCode.ok
-
+ return run_pipeline(options)
if __name__ == '__main__':
- sys.exit(run_pipeline())
+ sys.exit(run())
diff --git a/src/ocrmypdf/run.py b/src/ocrmypdf/run.py
new file mode 100644
index 00000000..9d24226f
--- /dev/null
+++ b/src/ocrmypdf/run.py
@@ -0,0 +1,702 @@
+#!/usr/bin/env python3
+# © 2015-17 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
+
+import atexit
+import logging
+import os
+import re
+import sys
+import textwrap
+from pathlib import Path
+from tempfile import mkdtemp
+
+import PIL
+import ruffus.cmdline as cmdline
+import ruffus.proxy_logger as proxy_logger
+import ruffus.ruffus_exceptions as ruffus_exceptions
+
+from . import VERSION
+from . import exceptions as ocrmypdf_exceptions
+from ._jobcontext import JobContext, JobContextManager, cleanup_working_files
+from ._pipeline import build_pipeline
+from ._unicodefun import verify_python3_env
+from .exceptions import (
+ BadArgsError,
+ ExitCode,
+ ExitCodeException,
+ InputFileError,
+ MissingDependencyError,
+ OutputFileAccessError,
+)
+from .exec import (
+ ghostscript,
+ jbig2enc,
+ qpdf,
+ tesseract,
+ check_external_program,
+ unpaper,
+ pngquant,
+)
+from .helpers import available_cpu_count, is_file_writable, re_symlink
+from .pdfa import file_claims_pdfa
+
+# -------------
+# External dependencies
+
+# Language codes for which check_options_output() stays quiet: any requested
+# language outside this set triggers its 'hocr' renderer warning and its
+# old-Ghostscript warning.
+HOCR_OK_LANGS = frozenset(['eng', 'deu', 'spa', 'ita', 'por'])
+
+
+def complain(message):
+    """Write *message* to stderr after running it through textwrap.wrap().
+
+    The wrapped pieces are passed to print() as separate arguments, so they
+    are emitted separated by single spaces and followed by one newline.
+    """
+    pieces = textwrap.wrap(message)
+    print(*pieces, file=sys.stderr)
+
+# --------
+# Critical environment tests
+
+# NOTE: this call is at module level, so it runs whenever this module is
+# imported, not only when the command line entry point is used.
+verify_python3_env()
+
+
+def check_options_languages(options, _log):
+    """Normalize ``options.language`` and verify Tesseract has the data.
+
+    ``options.language`` is updated in place: an empty value becomes
+    ``['eng']``, and a v2.x style ``"eng+deu"`` first entry is split on ``+``.
+
+    Args:
+        options: parsed options; ``language`` is read and may be rewritten.
+        _log: unused; accepted so all ``check_options_*`` helpers share a
+            signature.
+
+    Raises:
+        MissingDependencyError: if any requested language is not reported by
+            ``tesseract.languages()``. The message lists the missing
+            languages one per line, in sorted order.
+    """
+    if not options.language:
+        options.language = ['eng']  # Enforce English hegemony
+
+    # Support v2.x "eng+deu" language syntax
+    if '+' in options.language[0]:
+        options.language = options.language[0].split('+')
+
+    # Ask Tesseract once and reuse the answer for the error message.
+    missing = set(options.language) - set(tesseract.languages())
+    if missing:
+        msg = (
+            "The installed version of tesseract does not have language "
+            "data for the following requested languages: \n"
+        )
+        # Sorted so the message is identical from run to run.
+        msg += ''.join(lang + '\n' for lang in sorted(missing))
+        raise MissingDependencyError(msg)
+
+
+def _version_tuple(version):
+    """Return the numeric fields of a version string as a tuple of ints."""
+    return tuple(int(field) for field in re.findall(r'\d+', str(version)))
+
+
+def check_options_output(options, log):
+    """Validate output-related options and fill in the derived settings.
+
+    Side effects on ``options``:
+      * ``pdf_renderer`` of ``'auto'`` becomes ``'sandwich'``.
+      * ``output_type`` of ``'pdfa'`` becomes ``'pdfa-2'``.
+      * ``lossless_reconstruction`` is set to True only when none of
+        ``deskew``, ``clean_final``, ``force_ocr`` or ``remove_background``
+        is set.
+
+    Warnings are logged (not raised) when the 'hocr' renderer or a
+    Ghostscript older than 9.20 is combined with a language outside
+    HOCR_OK_LANGS.
+
+    Raises:
+        MissingDependencyError: ``--output-type pdfa-3`` with Ghostscript
+            older than 9.19.
+        BadArgsError: ``--redo-ocr`` together with an option that disables
+            lossless reconstruction.
+    """
+    # We have these constraints to check for.
+    # 1. Ghostscript < 9.20 mangles multibyte Unicode
+    # 2. hocr doesn't work on non-Latin languages (so don't select it)
+
+    languages = set(options.language)
+    is_latin = languages.issubset(HOCR_OK_LANGS)
+
+    if options.pdf_renderer == 'hocr' and not is_latin:
+        msg = (
+            "The 'hocr' PDF renderer is known to cause problems with one "
+            "or more of the languages in your document.  Use "
+            "--pdf-renderer auto (the default) to avoid this issue."
+        )
+        log.warning(msg)
+
+    # Compare versions numerically: as plain strings '10.0' sorts before
+    # '9.20', which would flag every Ghostscript 10.x as too old.
+    gs_version_text = ghostscript.version()
+    gs_version = _version_tuple(gs_version_text)
+
+    if gs_version < (9, 20) and options.output_type != 'pdf' and not is_latin:
+        # https://bugs.ghostscript.com/show_bug.cgi?id=696874
+        # Ghostscript < 9.20 fails to encode multibyte characters properly
+        msg = (
+            "The installed version of Ghostscript does not work correctly "
+            "with the OCR languages you specified. Use --output-type pdf or "
+            "upgrade to Ghostscript 9.20 or later to avoid this issue."
+        )
+        msg += f" Found Ghostscript {gs_version_text}"
+        log.warning(msg)
+
+    # Decide on what renderer to use
+    if options.pdf_renderer == 'auto':
+        options.pdf_renderer = 'sandwich'
+
+    if options.output_type == 'pdfa':
+        options.output_type = 'pdfa-2'
+
+    if options.output_type == 'pdfa-3' and gs_version < (9, 19):
+        raise MissingDependencyError(
+            "--output-type pdfa-3 requires Ghostscript 9.19 or later"
+        )
+
+    options.lossless_reconstruction = not any(
+        (
+            options.deskew,
+            options.clean_final,
+            options.force_ocr,
+            options.remove_background,
+        )
+    )
+
+    if not options.lossless_reconstruction and options.redo_ocr:
+        raise BadArgsError(
+            "--redo-ocr is not currently compatible with --deskew, "
+            "--clean-final, and --remove-background",
+        )
+
+
+def check_options_sidecar(options, log):
+    """Fill in the sidecar filename when ``options.sidecar`` is a lone NUL.
+
+    When ``options.sidecar`` equals the single NUL character, it is replaced
+    by ``options.output_file + '.txt'``. Any other value is left alone.
+
+    Raises:
+        BadArgsError: the NUL placeholder is present and the output file is
+            ``'-'`` (stdout), so no filename can be derived.
+    """
+    if options.sidecar != '\0':
+        return
+    if options.output_file == '-':
+        raise BadArgsError(
+            "--sidecar filename must be specified when output file is stdout."
+        )
+    options.sidecar = options.output_file + '.txt'
+
+
+def check_options_preprocessing(options, log):
+    """Validate the unpaper options (--clean, --clean-final, --unpaper-args).
+
+    Side effects on ``options``:
+      * ``clean`` is forced to True when ``clean_final`` is set.
+      * ``unpaper_args`` is replaced by the result of
+        ``unpaper.validate_custom_args``.
+
+    When cleaning is requested, ``check_external_program`` is asked to
+    verify that unpaper 6.1 or later is available.
+
+    Raises:
+        BadArgsError: ``--unpaper-args`` given without ``--clean``, or the
+            custom arguments are rejected by unpaper's validator. In the
+            latter case the validator's exception is chained as the cause.
+    """
+    if options.clean_final:
+        options.clean = True
+    if options.unpaper_args and not options.clean:
+        raise BadArgsError("--clean is required for --unpaper-args")
+    if not options.clean:
+        return
+
+    check_external_program(
+        log=log,
+        program='unpaper',
+        package='unpaper',
+        version_checker=unpaper.version,
+        need_version='6.1',
+        # A plain string, like every other required_for in this module.
+        required_for='--clean, --clean-final',
+    )
+    if options.unpaper_args:
+        try:
+            options.unpaper_args = unpaper.validate_custom_args(
+                options.unpaper_args
+            )
+        except Exception as e:
+            raise BadArgsError(str(e)) from e
+
+
+def check_options_ocr_behavior(options, log):
+    """Reject combinations of the mutually exclusive OCR mode options.
+
+    Raises:
+        BadArgsError: two or more of ``force_ocr``, ``skip_text`` and
+            ``redo_ocr`` are set.
+    """
+    modes = (options.force_ocr, options.skip_text, options.redo_ocr)
+    selected = [mode for mode in modes if mode]
+    if len(selected) >= 2:
+        raise BadArgsError(
+            "Error: choose only one of --force-ocr, --skip-text, --redo-ocr."
+        )
+
+
+def check_options_optimizing(options, log):
+    """Check the external tools and option combinations used by --optimize.
+
+    For ``--optimize`` 2 or higher, ``check_external_program`` is called for
+    pngquant (2.0.1+) and for jbig2enc (0.28+). jbig2enc is passed as
+    ``recommended`` unless ``--jbig2-lossy`` was requested.
+
+    For ``--optimize`` 0, a warning is logged if any of the quality options
+    were given, because they will be ignored.
+    """
+    if options.optimize >= 2:
+        check_external_program(
+            log=log,
+            program='pngquant',
+            package='pngquant',
+            version_checker=pngquant.version,
+            need_version='2.0.1',
+            required_for='--optimize {2,3}',
+        )
+        # Although we use JBIG2 for optimize=1, don't nag about it unless the
+        # user is asking for more optimization
+        check_external_program(
+            log=log,
+            program='jbig2',
+            package='jbig2enc',
+            version_checker=jbig2enc.version,
+            need_version='0.28',
+            required_for='--optimize {2,3} | --jbig2-lossy',
+            recommended=not options.jbig2_lossy,
+        )
+
+    if options.optimize == 0:
+        ignored = (options.jbig2_lossy, options.png_quality, options.jpeg_quality)
+        if any(ignored):
+            log.warning(
+                "The arguments --jbig2-lossy, --png-quality, and --jpeg-quality "
+                "will be ignored because --optimize=0."
+            )
+
+
+def check_options_advanced(options, log):
+    """Warn about advanced options that will have no effect.
+
+    Logs a warning when:
+      * ``--pdfa-image-compression`` is set to something other than 'auto'
+        but the output type does not start with 'pdfa';
+      * ``tesseract.v4()`` is true and user words or user patterns were
+        supplied.
+
+    Nothing is raised and ``options`` is not modified.
+    """
+    # The warning text says "has no effect when --output-type is not PDF/A",
+    # so it must fire only for non-PDF/A output.
+    is_pdfa = options.output_type.startswith('pdfa')
+    if options.pdfa_image_compression != 'auto' and not is_pdfa:
+        log.warning(
+            "--pdfa-image-compression argument has no effect when "
+            "--output-type is not 'pdfa', 'pdfa-1', or 'pdfa-2'"
+        )
+    if tesseract.v4() and (options.user_words or options.user_patterns):
+        log.warning('Tesseract 4.x ignores --user-words, so this has no effect')
+
+
+def check_options_metadata(options, log):
+    """Reject metadata strings containing unsupported Unicode characters.
+
+    The title, author, keywords and subject options are scanned. Empty or
+    unset values are skipped.
+
+    Raises:
+        ValueError: a character is in Unicode category 'Co' (private use) or
+            has a code point of 0x10000 or above. The message names the
+            character and its code point in upper-case hex.
+    """
+    import unicodedata
+
+    fields = (options.title, options.author, options.keywords, options.subject)
+    for text in fields:
+        if not text:
+            continue
+        for ch in text:
+            codepoint = ord(ch)
+            if unicodedata.category(ch) == 'Co' or codepoint >= 0x10000:
+                raise ValueError(
+                    "One of the metadata strings contains "
+                    "an unsupported Unicode character: '{}' (U+{})".format(
+                        ch, hex(codepoint)[2:].upper()
+                    )
+                )
+
+
+def check_options_pillow(options, log):
+    """Set Pillow's global image size limit from ``--max-image-mpixels``.
+
+    The option is given in megapixels and converted to whole pixels. A
+    result of 0 is stored as None instead.
+    """
+    pixel_limit = int(options.max_image_mpixels * 1_000_000)
+    PIL.Image.MAX_IMAGE_PIXELS = pixel_limit if pixel_limit != 0 else None
+
+
+def check_options(options, log):
+    """Run every ``check_options_*`` validator and report the outcome.
+
+    The validators run in a fixed order and may modify ``options``. The
+    first one to raise stops the sequence, and its error is logged.
+
+    Returns:
+        ``ExitCode.ok`` when every check passes; ``ExitCode.bad_args`` for a
+        ValueError; the exception's own ``exit_code`` for a BadArgsError;
+        ``ExitCode.missing_dependency`` for a MissingDependencyError.
+    """
+    validators = (
+        check_options_languages,
+        check_options_metadata,
+        check_options_output,
+        check_options_sidecar,
+        check_options_preprocessing,
+        check_options_ocr_behavior,
+        check_options_optimizing,
+        check_options_advanced,
+        check_options_pillow,
+    )
+    try:
+        for validate in validators:
+            validate(options, log)
+    except ValueError as e:
+        log.error(e)
+        return ExitCode.bad_args
+    except BadArgsError as e:
+        log.error(e)
+        return e.exit_code
+    except MissingDependencyError as e:
+        log.error(e)
+        return ExitCode.missing_dependency
+    return ExitCode.ok
+
+
+# ----------
+# Logging
+
+
+def logging_factory(logger_name, logger_args):
+    """Create the logger named *logger_name* with a stderr handler attached.
+
+    Args:
+        logger_name: name passed to ``logging.getLogger``.
+        logger_args: mapping with ``'verbose'`` and ``'quiet'`` entries.
+            Verbose selects DEBUG output, otherwise quiet selects WARNING,
+            otherwise INFO.
+
+    Returns:
+        The logger, set to DEBUG so that the handler's level decides what is
+        emitted. A new handler is added on every call.
+    """
+    verbose, quiet = logger_args['verbose'], logger_args['quiet']
+
+    if verbose:
+        threshold = logging.DEBUG
+    elif quiet:
+        threshold = logging.WARNING
+    else:
+        threshold = logging.INFO
+
+    logger = logging.getLogger(logger_name)
+    logger.setLevel(logging.DEBUG)
+
+    stderr_handler = logging.StreamHandler(sys.stderr)
+    stderr_handler.setFormatter(logging.Formatter("%(levelname)7s - %(message)s"))
+    stderr_handler.setLevel(threshold)
+    logger.addHandler(stderr_handler)
+    return logger
+
+
+def cleanup_ruffus_error_message(msg):
+    """Flatten a ruffus error message into a single tidy line.
+
+    Every run of whitespace becomes one space, each parenthesized group
+    loses its surrounding parentheses, and the result is stripped.
+    """
+    single_spaced = re.sub(r'\s+', r' ', msg)
+    return re.sub(r"\((.+?)\)", r'\1', single_spaced).strip()
+
+
+def do_ruffus_exception(ruffus_five_tuple, options, log):
+    """Replace the elaborate ruffus stack trace with a user friendly
+    description of the error message that occurred.
+
+    Args:
+        ruffus_five_tuple: ``(task_name, job_name, exc_name, exc_value,
+            exc_stack)``; only the last three are used.
+        options: only ``options.verbose`` is read.
+        log: logger used to report the error.
+
+    Returns:
+        The exit code selected for this exception. Exceptions that are not
+        recognized yield ``ExitCode.other_error``, and their stack is logged
+        unless ``options.verbose`` is set.
+    """
+    exit_code = None
+
+    _task_name, _job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple
+
+    if isinstance(exc_name, type):
+        # ruffus is full of mystery... sometimes (probably when the process
+        # group leader is killed) exc_name is the class object of the exception,
+        # rather than a str. So reach into the object and get its name.
+        exc_name = exc_name.__name__
+
+    if exc_name.startswith('ocrmypdf.exceptions.'):
+        base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
+        exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
+        exit_code = getattr(exc_class, 'exit_code', ExitCode.other_error)
+        try:
+            if isinstance(exc_value, exc_class):
+                exc_msg = str(exc_value)
+            elif isinstance(exc_value, str):
+                exc_msg = exc_value
+            else:
+                exc_msg = str(exc_class())
+        except Exception:
+            exc_msg = "Unknown"
+
+    if exc_name in ('builtins.SystemExit', 'SystemExit'):
+        # The exit code name is the text between the '.' and the ')'.
+        match = re.search(r"\.(.+?)\)", str(exc_value))
+        if match:
+            # Fall back to the enum member, not its name as a string, so
+            # callers always receive a real exit code.
+            exit_code = getattr(ExitCode, match.group(1), ExitCode.other_error)
+        # With no recognizable name, exit_code stays None and the generic
+        # handling at the bottom reports the stack instead of crashing here.
+    elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
+        log.error(cleanup_ruffus_error_message(exc_value))
+        exit_code = ExitCode.input_file
+    elif exc_name in ('builtins.KeyboardInterrupt', 'KeyboardInterrupt'):
+        # We have to print in this case because the log daemon might be toast
+        print("Interrupted by user", file=sys.stderr)
+        exit_code = ExitCode.ctrl_c
+    elif exc_name == 'subprocess.CalledProcessError':
+        # It's up to the subprocess handler to report something useful
+        msg = "Error occurred while running this command:"
+        log.error(msg + '\n' + exc_value)
+        exit_code = ExitCode.child_process_error
+    elif exc_name.startswith('ocrmypdf.exceptions.'):
+        # exit_code and exc_msg were already derived from the class above.
+        if exc_msg:
+            log.error(exc_msg)
+    elif exc_name == 'PIL.Image.DecompressionBombError':
+        msg = cleanup_ruffus_error_message(exc_value)
+        msg += (
+            "\nUse the --max-image-mpixels argument to set increase the "
+            "maximum number of megapixels to accept."
+        )
+        log.error(msg)
+        exit_code = ExitCode.input_file
+
+    if exit_code is not None:
+        return exit_code
+
+    # Unrecognized exception: show the stack unless verbose mode is on.
+    if not options.verbose:
+        log.error(exc_stack)
+    return ExitCode.other_error
+
+
def traverse_ruffus_exception(exceptions, options, log):
    """Traverse a RethrownJobError and output the exceptions

    Ruffus presents exceptions as 5 element tuples. The RethrownJobException
    has a list of exceptions like
        e.job_exceptions = [(5-tuple), (5-tuple), ...]

    ruffus < 2.7.0 had a bug with exception marshalling that would give
    different output whether the main or child process raised the exception.
    We no longer support this.

    Attempting to log the exception itself will re-marshal it to the logger
    which is normally running in another process. It's better to avoid
    re-marshalling.

    Every exception is reported, but the returned exit code is that of the
    first one, even if multiple exceptions occurred at the same time.
    Returns None when ``exceptions`` is empty so the caller can detect that
    nothing could be interpreted.
    """
    exit_codes = [do_ruffus_exception(exc, options, log) for exc in exceptions]

    # Multiple codes are rare so take the first one
    return exit_codes[0] if exit_codes else None
+
+
def check_closed_streams(options):
    """Work around a multiprocessing failure when standard streams are closed.

    See https://bugs.python.org/issue28326

    On affected interpreters, forking a new Python process while any of
    std{in,out,err} is closed (or not flushable) may raise an exception.
    The remedy applied here is global: a stream that is missing (None) is
    replaced by a handle on the null device, so individual fork sites need
    no special handling. The problem appears specific to
    multiprocessing.Process rather than to all process forkers.

    Strictly, the failure is triggered by a stream that cannot be flushed,
    but swapping an open-yet-unflushable stream for /dev/null would hide
    output silently, so only missing streams are replaced.

    Returns True when it is safe to continue, and False when the user asked
    to read from stdin or write to stdout ('-') but that stream is missing.
    """
    affected_interpreter = sys.version_info[:3] < (3, 6, 4)
    if not affected_interpreter:
        # Issue fixed in Python 3.6.4+
        return True

    if sys.stderr is None:
        sys.stderr = open(os.devnull, 'w')

    if sys.stdin is None:
        wants_stdin = options.input_file == '-'
        if wants_stdin:
            print("Trying to read from stdin but stdin seems closed", file=sys.stderr)
            return False
        sys.stdin = open(os.devnull, 'r')

    if sys.stdout is None:
        wants_stdout = options.output_file == '-'
        if wants_stdout:
            # Can't substitute stdout while the user is piping through it.
            # If this can happen at all, stdout must be some unusual stream.
            complaint = textwrap.dedent(
                """\
                Output was set to stdout '-' but the stream attached to
                stdout does not support the flush() system call. This
                will fail."""
            )
            print(complaint, file=sys.stderr)
            return False
        sys.stdout = open(os.devnull, 'w')

    return True
+
+
def log_page_orientations(pdfinfo, _log):
    """Log, at info level, which pages have a non-zero rotation.

    Each rotated page is reported as its 1-based page number followed by a
    compass letter for 90/180/270 degrees (no letter for any other angle).
    Nothing is logged when no page is rotated.
    """
    compass = {0: 'n', 90: 'e', 180: 's', 270: 'w'}
    numbered_angles = (
        (page_number, page.rotation or 0)
        for page_number, page in enumerate(pdfinfo, start=1)
    )
    rotated = [
        f"{page_number}{compass.get(angle, '')}"
        for page_number, angle in numbered_angles
        if angle != 0
    ]
    if not rotated:
        return
    _log.info('Page orientations detected: ' + ' '.join(rotated))
+
+
def preamble(_log):
    """Write the program name and version to the log at debug level."""
    banner = 'ocrmypdf ' + VERSION
    _log.debug(banner)
+
+
def check_environ(options, _log):
    """Warn about obsolete OCRMYPDF_* environment variables that are set.

    One warning is logged per variable found in the environment; the
    ``options`` argument is not consulted.
    """
    obsolete_names = (
        'OCRMYPDF_TESSERACT',
        'OCRMYPDF_QPDF',
        'OCRMYPDF_GS',
        'OCRMYPDF_UNPAPER',
    )
    still_set = (name for name in obsolete_names if name in os.environ)
    for name in still_set:
        notice = textwrap.dedent(
            f"""\
            OCRmyPDF no longer uses the environment variable {name}.
            Change PATH to select alternate programs."""
        )
        _log.warning(notice)
+
+
def check_input_file(options, _log, start_input_file):
    """Make the job's input available at ``start_input_file``.

    When the input is '-', standard input is copied byte-for-byte into
    ``start_input_file``. Otherwise ``re_symlink`` is called with the named
    input file and ``start_input_file``.

    Raises:
        InputFileError: if ``re_symlink`` reports that the file is missing.
    """
    reading_stdin = options.input_file == '-'
    if not reading_stdin:
        try:
            re_symlink(options.input_file, start_input_file, _log)
        except FileNotFoundError:
            _log.error("File not found - " + options.input_file)
            raise InputFileError()
        return

    # stdin
    _log.info('reading file from standard input')
    with open(start_input_file, 'wb') as sink:
        from shutil import copyfileobj

        copyfileobj(sys.stdin.buffer, sink)
+
+
def check_requested_output_file(options, _log):
    """Reject an output destination that cannot be written.

    Raises:
        BadArgsError: output is '-' (stdout) and stdout is a terminal.
        OutputFileAccessError: output is a path that ``is_file_writable``
            rejects.
    """
    destination = options.output_file

    if destination == '-':
        if not sys.stdout.isatty():
            return
        _log.error(
            textwrap.dedent(
                """\
                Output was set to stdout '-' but it looks like stdout
                is connected to a terminal. Please redirect stdout to a
                file."""
            )
        )
        raise BadArgsError()

    if is_file_writable(destination):
        return

    _log.error(
        "Output file location (" + destination + ") is not a writable file."
    )
    raise OutputFileAccessError()
+
+
def report_output_file_size(options, _log, input_file, output_file):
    """Warn when the output file is much larger than the input file.

    Nothing is reported when either file cannot be stat'ed, when the input
    is smaller than 25000 bytes (which also covers an empty input), or when
    the output is less than 1.35 times the size of the input.

    Args:
        options: parsed options; the image preprocessing flags are read to
            suggest possible causes for the growth.
        _log: logger that receives the warning.
        input_file: path of the original input.
        output_file: path of the generated output.
    """
    try:
        output_size = Path(output_file).stat().st_size
        input_size = Path(input_file).stat().st_size
    except FileNotFoundError:
        return  # Outputting to stream or something

    # Check the input size first: it guards the division below against an
    # empty input file.
    if input_size < 25000:
        return  # Seems fine
    ratio = output_size / input_size
    if ratio < 1.35:
        return  # Seems fine

    # A tuple (not a set) keeps the reported reasons in a stable order.
    image_preproc = (
        'deskew',
        'clean_final',
        'remove_background',
        'oversample',
        'force_ocr',
    )
    reasons = [
        f"The argument --{arg.replace('_', '-')} was issued, causing transcoding."
        for arg in image_preproc
        if getattr(options, arg, None)
    ]

    if reasons:
        explanation = "Possible reasons for this include:\n" + '\n'.join(reasons) + "\n"
    else:
        explanation = "No reason for this increase is known. Please report this issue."

    # Built without textwrap.dedent: the multi-line explanation carries no
    # indentation, which would stop dedent from stripping the first line.
    _log.warning(
        f"The output file size is {ratio:.2f}× larger than the input file.\n"
        f"{explanation}\n"
    )
+
+
def check_dependency_versions(options, log):
    """Check the external programs the pipeline relies on.

    tesseract, gs and qpdf are each passed to ``check_external_program``
    together with a minimum version. Ghostscript 9.24 is singled out: it is
    reported via ``complain`` and ``ExitCode.missing_dependency`` is
    returned before qpdf is checked. Otherwise the function returns None.
    ``options`` is not consulted.
    """

    def require(**spec):
        # Every check reports through the same logger.
        check_external_program(log=log, **spec)

    require(
        program='tesseract',
        package={'darwin': 'tesseract', 'linux': 'tesseract-ocr'},
        version_checker=tesseract.version,
        need_version='4.0.0',  # using backport for Travis CI
    )
    require(
        program='gs',
        package='ghostscript',
        version_checker=ghostscript.version,
        need_version='9.15',  # limited by Travis CI / Ubuntu 14.04 backports
    )

    gs_is_broken_release = ghostscript.version() == '9.24'
    if gs_is_broken_release:
        complain(
            "Ghostscript 9.24 contains serious regressions and is not "
            "supported. Please upgrade to Ghostscript 9.25 or use an older "
            "version."
        )
        return ExitCode.missing_dependency

    require(
        program='qpdf',
        package='qpdf',
        version_checker=qpdf.version,
        need_version='8.0.2',
    )
+
+
def run_pipeline(options):
    """Validate the options, run the ruffus pipeline and check the output.

    Args:
        options: parsed options namespace. It is modified in place
            (``verbose_abbreviated_path``, ``use_threads``, ``jobs`` and
            ``history_file`` are set here).

    Returns:
        An ``ExitCode``: ``ExitCode.ok`` on success, otherwise the code of
        the first failed check or of the first pipeline exception.
    """
    options.verbose_abbreviated_path = 1
    if os.environ.get('_OCRMYPDF_THREADS'):
        options.use_threads = True

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args
    )
    preamble(_log)
    check_code = check_options(options, _log)
    if check_code != ExitCode.ok:
        return check_code

    # The dependency check returns an exit code when it finds an unsupported
    # program (and None otherwise); honor it instead of carrying on.
    dependency_code = check_dependency_versions(options, _log)
    if dependency_code is not None and dependency_code != ExitCode.ok:
        return dependency_code

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    # Performance is improved by setting Tesseract to single threaded. In tests
    # this gives better throughput than letting a smaller number of Tesseract
    # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
    # variable, but harmless to set if ignored.
    os.environ.setdefault('OMP_THREAD_LIMIT', '1')

    check_environ(options, _log)
    if os.environ.get('PYTEST_CURRENT_TEST'):
        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file

    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(work_folder, 'origin')

        check_input_file(options, _log, start_input_file)
        check_requested_output_file(options, _log)

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        atexit.register(cleanup_working_files, work_folder, options)
        if hasattr(os, 'nice'):
            os.nice(5)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to
        exceptions = e.job_exceptions
        exitcode = traverse_ruffus_exception(exceptions, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        _log.info(f"Flowchart saved to {options.flowchart}")
        return ExitCode.ok
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
                _log.info(msg)
            else:
                msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
                _log.warning(msg)
                return ExitCode.pdfa_conversion_failed
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

        report_output_file_size(options, _log, start_input_file, options.output_file)

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat

        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok