diff --git a/src/ocrmypdf/__init__.py b/src/ocrmypdf/__init__.py
index fd06f2be..a37d2658 100644
--- a/src/ocrmypdf/__init__.py
+++ b/src/ocrmypdf/__init__.py
@@ -44,4 +44,3 @@ from . import hocrtransform
from . import leptonica
from . import pdfa
from . import pdfinfo
-from . import run
diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py
index 9a7b1b66..759fffe4 100755
--- a/src/ocrmypdf/__main__.py
+++ b/src/ocrmypdf/__main__.py
@@ -21,7 +21,7 @@ import os
import sys
from . import PROGRAM_NAME, VERSION
-from .run import run_pipeline
+from ._ruffus import run_pipeline
# Hack to help debugger context find /usr/local/bin
if 'IDE_PROJECT_ROOTS' in os.environ:
@@ -30,6 +30,7 @@ if 'IDE_PROJECT_ROOTS' in os.environ:
# -------------
# Parser
+
def numeric(basetype, min_=None, max_=None):
"""Validator for numeric params"""
min_ = basetype(min_) if min_ is not None else None
@@ -465,9 +466,11 @@ debugging.add_argument(
'--flowchart', type=str, help="Generate the pipeline execution flowchart"
)
+
def run(args=None):
options = parser.parse_args(args=args)
return run_pipeline(options)
+
if __name__ == '__main__':
sys.exit(run())
diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
index b283a988..d6e0c491 100644
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -25,13 +25,11 @@ from shutil import copyfile, copyfileobj
import img2pdf
from PIL import Image
-from ruffus import Pipeline, formatter, regex, suffix
import pikepdf
from pikepdf.models.metadata import encode_pdf_date
from . import PROGRAM_NAME, VERSION, leptonica
-from ._weave import weave_layers
from .exceptions import (
DpiError,
EncryptedPdfError,
@@ -40,7 +38,12 @@ from .exceptions import (
UnsupportedImageFormatError,
)
from .exec import ghostscript, tesseract
-from .helpers import flatten_groups, is_iterable_notstr, page_number, re_symlink
+from .helpers import (
+ flatten_groups,
+ is_iterable_notstr,
+ page_number,
+ re_symlink
+)
from .hocrtransform import HocrTransform
from .optimize import optimize
from .pdfa import generate_pdfa_ps
@@ -115,7 +118,10 @@ def triage_image_file(input_file, output_file, log, options):
)
with open(output_file, 'wb') as outf:
img2pdf.convert(
- input_file, layout_fun=layout_fun, with_pdfrw=False, outputstream=outf
+ input_file,
+ layout_fun=layout_fun,
+ with_pdfrw=False,
+ outputstream=outf
)
log.info("Successfully converted to PDF, processing...")
except img2pdf.ImageOpenError as e:
@@ -170,7 +176,7 @@ def repair_and_parse_pdf(input_file, output_file, log, context):
pdfinfo = PdfInfo(
output_file, detailed_page_analysis=detailed_page_analysis, log=log
)
- except pikepdf.PasswordError as e:
+ except pikepdf.PasswordError:
raise EncryptedPdfError()
except pikepdf.PdfError as e:
log.error(e)
@@ -202,7 +208,8 @@ def repair_and_parse_pdf(input_file, output_file, log, context):
raise PriorOcrFoundError()
else:
log.warning(
- "This PDF has a fillable form. Chances are it is a pure digital "
+ "This PDF has a fillable form. "
+ "Chances are it is a pure digital "
"document that does not need OCR."
)
if not options.force_ocr:
@@ -281,8 +288,7 @@ def is_ocr_required(pageinfo, log, options):
elif options.redo_ocr:
if pageinfo.has_corrupt_text:
log.warning(
- prefix
- + (
+ prefix + (
"some text on this page cannot be mapped to characters: "
"consider using --force-ocr instead",
)
@@ -936,232 +942,3 @@ def copy_final(input_files, output_file, log, context):
# get the appropriate umask, ownership, etc.
with open(output_file, 'wb') as output_stream:
copyfileobj(input_stream, output_stream)
-
-
-def build_pipeline(options, work_folder, log, context):
- main_pipeline = Pipeline.pipelines['main']
-
- # Triage
- task_triage = main_pipeline.transform(
- task_func=triage,
- input=os.path.join(work_folder, 'origin'),
- filter=formatter('(?i)'),
- output=os.path.join(work_folder, 'origin.pdf'),
- extras=[log, context],
- )
-
- task_repair_and_parse_pdf = main_pipeline.transform(
- task_func=repair_and_parse_pdf,
- input=task_triage,
- filter=suffix('.pdf'),
- output='.repaired.pdf',
- output_dir=work_folder,
- extras=[log, context],
- )
-
- # Split (kwargs for split seems to be broken, so pass plain args)
- task_marker_pages = main_pipeline.split(
- marker_pages,
- task_repair_and_parse_pdf,
- os.path.join(work_folder, '*.marker.pdf'),
- extras=[log, context],
- )
-
- task_ocr_or_skip = main_pipeline.split(
- ocr_or_skip,
- task_marker_pages,
- [
- os.path.join(work_folder, '*.ocr.page.pdf'),
- os.path.join(work_folder, '*.skip.page.pdf'),
- ],
- extras=[log, context],
- )
-
- # Rasterize preview
- task_rasterize_preview = main_pipeline.transform(
- task_func=rasterize_preview,
- input=task_ocr_or_skip,
- filter=suffix('.page.pdf'),
- output='.preview.jpg',
- output_dir=work_folder,
- extras=[log, context],
- )
- task_rasterize_preview.active_if(options.rotate_pages)
-
- # Orient
- task_orient_page = main_pipeline.collate(
- task_func=orient_page,
- input=[task_ocr_or_skip, task_rasterize_preview],
- filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
- output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
- extras=[log, context],
- )
-
- # Rasterize actual
- task_rasterize_with_ghostscript = main_pipeline.transform(
- task_func=rasterize_with_ghostscript,
- input=task_orient_page,
- filter=suffix('.ocr.oriented.pdf'),
- output='.page.png',
- output_dir=work_folder,
- extras=[log, context],
- )
-
- # Preprocessing subpipeline
- task_preprocess_remove_background = main_pipeline.transform(
- task_func=preprocess_remove_background,
- input=task_rasterize_with_ghostscript,
- filter=suffix(".page.png"),
- output=".pp-background.png",
- extras=[log, context],
- )
-
- task_preprocess_deskew = main_pipeline.transform(
- task_func=preprocess_deskew,
- input=task_preprocess_remove_background,
- filter=suffix(".pp-background.png"),
- output=".pp-deskew.png",
- extras=[log, context],
- )
-
- task_preprocess_clean = main_pipeline.transform(
- task_func=preprocess_clean,
- input=task_preprocess_deskew,
- filter=suffix(".pp-deskew.png"),
- output=".pp-clean.png",
- extras=[log, context],
- )
-
- task_select_ocr_image = main_pipeline.collate(
- task_func=select_ocr_image,
- input=[task_preprocess_clean],
- filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
- output=os.path.join(work_folder, r"\1.ocr.png"),
- extras=[log, context],
- )
-
- # HOCR OCR
- task_ocr_tesseract_hocr = main_pipeline.transform(
- task_func=ocr_tesseract_hocr,
- input=task_select_ocr_image,
- filter=suffix(".ocr.png"),
- output=[".hocr", ".txt"],
- extras=[log, context],
- )
- task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
- task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
-
- task_select_visible_page_image = main_pipeline.collate(
- task_func=select_visible_page_image,
- input=[
- task_rasterize_with_ghostscript,
- task_preprocess_remove_background,
- task_preprocess_deskew,
- task_preprocess_clean,
- ],
- filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
- output=os.path.join(work_folder, r'\1.image'),
- extras=[log, context],
- )
- task_select_visible_page_image.graphviz(shape='diamond')
-
- task_select_image_layer = main_pipeline.collate(
- task_func=select_image_layer,
- input=[task_select_visible_page_image, task_orient_page],
- filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
- output=os.path.join(work_folder, r'\1.image-layer.pdf'),
- extras=[log, context],
- )
- task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond')
-
- task_render_hocr_page = main_pipeline.transform(
- task_func=render_hocr_page,
- input=task_ocr_tesseract_hocr,
- filter=regex(r".*/(\d{6})(?:\.hocr)"),
- output=os.path.join(work_folder, r'\1.text.pdf'),
- extras=[log, context],
- )
- task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
- task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')
-
- # Tesseract OCR + text only PDF
- task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
- task_func=ocr_tesseract_textonly_pdf,
- input=[task_select_ocr_image],
- filter=regex(r".*/(\d{6})(?:\.ocr.png)"),
- output=[
- os.path.join(work_folder, r'\1.text.pdf'),
- os.path.join(work_folder, r'\1.text.txt'),
- ],
- extras=[log, context],
- )
- task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
- task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')
-
- task_weave_layers = main_pipeline.collate(
- task_func=weave_layers,
- input=[
- task_repair_and_parse_pdf,
- task_render_hocr_page,
- task_ocr_tesseract_textonly_pdf,
- task_select_image_layer,
- ],
- filter=regex(
- r".*/((?:\d{6}(?:\.text\.pdf|\.image-layer\.pdf))|(?:origin\.repaired\.pdf))"
- ),
- output=os.path.join(work_folder, r'layers.rendered.pdf'),
- extras=[log, context],
- )
- task_weave_layers.graphviz(fillcolor='"#00cc66"')
-
- # PDF/A pdfmark
- task_generate_postscript_stub = main_pipeline.transform(
- task_func=generate_postscript_stub,
- input=task_repair_and_parse_pdf,
- filter=formatter(r'\.repaired\.pdf'),
- output=os.path.join(work_folder, 'pdfa.ps'),
- extras=[log, context],
- )
- task_generate_postscript_stub.active_if(options.output_type.startswith('pdfa'))
-
- # PDF/A conversion
- task_convert_to_pdfa = main_pipeline.merge(
- task_func=convert_to_pdfa,
- input=[task_generate_postscript_stub, task_weave_layers],
- output=os.path.join(work_folder, 'pdfa.pdf'),
- extras=[log, context],
- )
- task_convert_to_pdfa.active_if(options.output_type.startswith('pdfa'))
-
- task_metadata_fixup = main_pipeline.merge(
- task_func=metadata_fixup,
- input=[task_repair_and_parse_pdf, task_weave_layers, task_convert_to_pdfa],
- output=os.path.join(work_folder, 'metafix.pdf'),
- extras=[log, context],
- )
-
- task_merge_sidecars = main_pipeline.merge(
- task_func=merge_sidecars,
- input=[task_ocr_tesseract_hocr, task_ocr_tesseract_textonly_pdf],
- output=options.sidecar,
- extras=[log, context],
- )
- task_merge_sidecars.active_if(options.sidecar)
-
- # Optimize
- task_optimize_pdf = main_pipeline.transform(
- task_func=optimize_pdf,
- input=task_metadata_fixup,
- filter=suffix('.pdf'),
- output='.optimized.pdf',
- output_dir=work_folder,
- extras=[log, context],
- )
-
- # Finalize
- main_pipeline.merge(
- task_func=copy_final,
- input=[task_optimize_pdf],
- output=options.output_file,
- extras=[log, context],
- )
diff --git a/src/ocrmypdf/_ruffus.py b/src/ocrmypdf/_ruffus.py
new file mode 100644
index 00000000..4cc69043
--- /dev/null
+++ b/src/ocrmypdf/_ruffus.py
@@ -0,0 +1,508 @@
+# © 2016 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF. If not, see .
+
+import os
+import re
+import sys
+import atexit
+from tempfile import mkdtemp
+from ruffus import (
+ Pipeline,
+ formatter,
+ regex,
+ suffix,
+ cmdline,
+ proxy_logger,
+ ruffus_exceptions
+)
+from .exec import qpdf
+from ._jobcontext import JobContext, JobContextManager, cleanup_working_files
+from ._weave import weave_layers
+from ._pipeline import (
+ triage,
+ repair_and_parse_pdf,
+ marker_pages,
+ ocr_or_skip,
+ rasterize_preview,
+ orient_page,
+ rasterize_with_ghostscript,
+ preprocess_remove_background,
+ preprocess_deskew,
+ preprocess_clean,
+ select_ocr_image,
+ ocr_tesseract_hocr,
+ select_visible_page_image,
+ select_image_layer,
+ render_hocr_page,
+ ocr_tesseract_textonly_pdf,
+ generate_postscript_stub,
+ convert_to_pdfa,
+ metadata_fixup,
+ merge_sidecars,
+ optimize_pdf,
+ copy_final
+)
+from . import exceptions as ocrmypdf_exceptions
+from .exceptions import (
+ ExitCode,
+ ExitCodeException,
+)
+from .helpers import available_cpu_count
+from .pdfa import file_claims_pdfa
+from ._validation import (
+ check_closed_streams,
+ preamble,
+ check_options,
+ check_dependency_versions,
+ check_environ,
+ check_input_file,
+ check_requested_output_file,
+ report_output_file_size,
+ log_page_orientations,
+ logging_factory,
+)
+
+
+def cleanup_ruffus_error_message(msg):
+ msg = re.sub(r'\s+', r' ', msg)
+ msg = re.sub(r"\((.+?)\)", r'\1', msg)
+ msg = msg.strip()
+ return msg
+
+
+def do_ruffus_exception(ruffus_five_tuple, options, log):
+ """Replace the elaborate ruffus stack trace with a user friendly
+ description of the error message that occurred."""
+ exit_code = None
+
+ _task_name, _job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple
+
+ if isinstance(exc_name, type):
+ # ruffus is full of mystery... sometimes (probably when the process
+ # group leader is killed) exc_name is the class object of the exception,
+ # rather than a str. So reach into the object and get its name.
+ exc_name = exc_name.__name__
+
+ if exc_name.startswith('ocrmypdf.exceptions.'):
+ base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
+ exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
+ exit_code = getattr(exc_class, 'exit_code', ExitCode.other_error)
+ try:
+ if isinstance(exc_value, exc_class):
+ exc_msg = str(exc_value)
+ elif isinstance(exc_value, str):
+ exc_msg = exc_value
+ else:
+ exc_msg = str(exc_class())
+ except Exception:
+ exc_msg = "Unknown"
+
+ if exc_name in ('builtins.SystemExit', 'SystemExit'):
+ match = re.search(r"\.(.+?)\)", exc_value)
+ exit_code_name = match.groups()[0]
+ exit_code = getattr(ExitCode, exit_code_name, 'other_error')
+ elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
+ log.error(cleanup_ruffus_error_message(exc_value))
+ exit_code = ExitCode.input_file
+ elif exc_name in ('builtins.KeyboardInterrupt', 'KeyboardInterrupt'):
+ # We have to print in this case because the log daemon might be toast
+ print("Interrupted by user", file=sys.stderr)
+ exit_code = ExitCode.ctrl_c
+ elif exc_name == 'subprocess.CalledProcessError':
+ # It's up to the subprocess handler to report something useful
+ msg = "Error occurred while running this command:"
+ log.error(msg + '\n' + exc_value)
+ exit_code = ExitCode.child_process_error
+ elif exc_name.startswith('ocrmypdf.exceptions.'):
+ if exc_msg:
+ log.error(exc_msg)
+ elif exc_name == 'PIL.Image.DecompressionBombError':
+ msg = cleanup_ruffus_error_message(exc_value)
+ msg += (
+ "\nUse the --max-image-mpixels argument to set increase the "
+ "maximum number of megapixels to accept."
+ )
+ log.error(msg)
+ exit_code = ExitCode.input_file
+
+ if exit_code is not None:
+ return exit_code
+
+ if not options.verbose:
+ log.error(exc_stack)
+ return ExitCode.other_error
+
+
+def traverse_ruffus_exception(exceptions, options, log):
+ """Traverse a RethrownJobError and output the exceptions
+
+ Ruffus presents exceptions as 5 element tuples. The RethrownJobException
+ has a list of exceptions like
+ e.job_exceptions = [(5-tuple), (5-tuple), ...]
+
+ ruffus < 2.7.0 had a bug with exception marshalling that would give
+ different output whether the main or child process raised the exception.
+ We no longer support this.
+
+ Attempting to log the exception itself will re-marshall it to the logger
+ which is normally running in another process. It's better to avoid re-
+ marshalling.
+
+ The exit code will be based on this, even if multiple exceptions occurred
+ at the same time."""
+
+ exit_codes = []
+ for exc in exceptions:
+ exit_code = do_ruffus_exception(exc, options, log)
+ exit_codes.append(exit_code)
+
+ return exit_codes[0] # Multiple codes are rare so take the first one
+
+
+def build_pipeline(options, work_folder, log, context):
+ main_pipeline = Pipeline.pipelines['main']
+
+ # Triage
+ task_triage = main_pipeline.transform(
+ task_func=triage,
+ input=os.path.join(work_folder, 'origin'),
+ filter=formatter('(?i)'),
+ output=os.path.join(work_folder, 'origin.pdf'),
+ extras=[log, context],
+ )
+
+ task_repair_and_parse_pdf = main_pipeline.transform(
+ task_func=repair_and_parse_pdf,
+ input=task_triage,
+ filter=suffix('.pdf'),
+ output='.repaired.pdf',
+ output_dir=work_folder,
+ extras=[log, context],
+ )
+
+ # Split (kwargs for split seems to be broken, so pass plain args)
+ task_marker_pages = main_pipeline.split(
+ marker_pages,
+ task_repair_and_parse_pdf,
+ os.path.join(work_folder, '*.marker.pdf'),
+ extras=[log, context],
+ )
+
+ task_ocr_or_skip = main_pipeline.split(
+ ocr_or_skip,
+ task_marker_pages,
+ [
+ os.path.join(work_folder, '*.ocr.page.pdf'),
+ os.path.join(work_folder, '*.skip.page.pdf'),
+ ],
+ extras=[log, context],
+ )
+
+ # Rasterize preview
+ task_rasterize_preview = main_pipeline.transform(
+ task_func=rasterize_preview,
+ input=task_ocr_or_skip,
+ filter=suffix('.page.pdf'),
+ output='.preview.jpg',
+ output_dir=work_folder,
+ extras=[log, context],
+ )
+ task_rasterize_preview.active_if(options.rotate_pages)
+
+ # Orient
+ task_orient_page = main_pipeline.collate(
+ task_func=orient_page,
+ input=[task_ocr_or_skip, task_rasterize_preview],
+ filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
+ output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
+ extras=[log, context],
+ )
+
+ # Rasterize actual
+ task_rasterize_with_ghostscript = main_pipeline.transform(
+ task_func=rasterize_with_ghostscript,
+ input=task_orient_page,
+ filter=suffix('.ocr.oriented.pdf'),
+ output='.page.png',
+ output_dir=work_folder,
+ extras=[log, context],
+ )
+
+ # Preprocessing subpipeline
+ task_preprocess_remove_background = main_pipeline.transform(
+ task_func=preprocess_remove_background,
+ input=task_rasterize_with_ghostscript,
+ filter=suffix(".page.png"),
+ output=".pp-background.png",
+ extras=[log, context],
+ )
+
+ task_preprocess_deskew = main_pipeline.transform(
+ task_func=preprocess_deskew,
+ input=task_preprocess_remove_background,
+ filter=suffix(".pp-background.png"),
+ output=".pp-deskew.png",
+ extras=[log, context],
+ )
+
+ task_preprocess_clean = main_pipeline.transform(
+ task_func=preprocess_clean,
+ input=task_preprocess_deskew,
+ filter=suffix(".pp-deskew.png"),
+ output=".pp-clean.png",
+ extras=[log, context],
+ )
+
+ task_select_ocr_image = main_pipeline.collate(
+ task_func=select_ocr_image,
+ input=[task_preprocess_clean],
+ filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
+ output=os.path.join(work_folder, r"\1.ocr.png"),
+ extras=[log, context],
+ )
+
+ # HOCR OCR
+ task_ocr_tesseract_hocr = main_pipeline.transform(
+ task_func=ocr_tesseract_hocr,
+ input=task_select_ocr_image,
+ filter=suffix(".ocr.png"),
+ output=[".hocr", ".txt"],
+ extras=[log, context],
+ )
+ task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
+ task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
+
+ task_select_visible_page_image = main_pipeline.collate(
+ task_func=select_visible_page_image,
+ input=[
+ task_rasterize_with_ghostscript,
+ task_preprocess_remove_background,
+ task_preprocess_deskew,
+ task_preprocess_clean,
+ ],
+ filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
+ output=os.path.join(work_folder, r'\1.image'),
+ extras=[log, context],
+ )
+ task_select_visible_page_image.graphviz(shape='diamond')
+
+ task_select_image_layer = main_pipeline.collate(
+ task_func=select_image_layer,
+ input=[task_select_visible_page_image, task_orient_page],
+ filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
+ output=os.path.join(work_folder, r'\1.image-layer.pdf'),
+ extras=[log, context],
+ )
+ task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond')
+
+ task_render_hocr_page = main_pipeline.transform(
+ task_func=render_hocr_page,
+ input=task_ocr_tesseract_hocr,
+ filter=regex(r".*/(\d{6})(?:\.hocr)"),
+ output=os.path.join(work_folder, r'\1.text.pdf'),
+ extras=[log, context],
+ )
+ task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
+ task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')
+
+ # Tesseract OCR + text only PDF
+ task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
+ task_func=ocr_tesseract_textonly_pdf,
+ input=[task_select_ocr_image],
+ filter=regex(r".*/(\d{6})(?:\.ocr.png)"),
+ output=[
+ os.path.join(work_folder, r'\1.text.pdf'),
+ os.path.join(work_folder, r'\1.text.txt'),
+ ],
+ extras=[log, context],
+ )
+ task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
+ task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')
+
+ task_weave_layers = main_pipeline.collate(
+ task_func=weave_layers,
+ input=[
+ task_repair_and_parse_pdf,
+ task_render_hocr_page,
+ task_ocr_tesseract_textonly_pdf,
+ task_select_image_layer,
+ ],
+ filter=regex(
+ r".*/((?:\d{6}(?:\.text\.pdf|\.image-layer\.pdf))|(?:origin\.repaired\.pdf))"
+ ),
+ output=os.path.join(work_folder, r'layers.rendered.pdf'),
+ extras=[log, context],
+ )
+ task_weave_layers.graphviz(fillcolor='"#00cc66"')
+
+ # PDF/A pdfmark
+ task_generate_postscript_stub = main_pipeline.transform(
+ task_func=generate_postscript_stub,
+ input=task_repair_and_parse_pdf,
+ filter=formatter(r'\.repaired\.pdf'),
+ output=os.path.join(work_folder, 'pdfa.ps'),
+ extras=[log, context],
+ )
+ task_generate_postscript_stub.active_if(options.output_type.startswith('pdfa'))
+
+ # PDF/A conversion
+ task_convert_to_pdfa = main_pipeline.merge(
+ task_func=convert_to_pdfa,
+ input=[task_generate_postscript_stub, task_weave_layers],
+ output=os.path.join(work_folder, 'pdfa.pdf'),
+ extras=[log, context],
+ )
+ task_convert_to_pdfa.active_if(options.output_type.startswith('pdfa'))
+
+ task_metadata_fixup = main_pipeline.merge(
+ task_func=metadata_fixup,
+ input=[task_repair_and_parse_pdf, task_weave_layers, task_convert_to_pdfa],
+ output=os.path.join(work_folder, 'metafix.pdf'),
+ extras=[log, context],
+ )
+
+ task_merge_sidecars = main_pipeline.merge(
+ task_func=merge_sidecars,
+ input=[task_ocr_tesseract_hocr, task_ocr_tesseract_textonly_pdf],
+ output=options.sidecar,
+ extras=[log, context],
+ )
+ task_merge_sidecars.active_if(options.sidecar)
+
+ # Optimize
+ task_optimize_pdf = main_pipeline.transform(
+ task_func=optimize_pdf,
+ input=task_metadata_fixup,
+ filter=suffix('.pdf'),
+ output='.optimized.pdf',
+ output_dir=work_folder,
+ extras=[log, context],
+ )
+
+ # Finalize
+ main_pipeline.merge(
+ task_func=copy_final,
+ input=[task_optimize_pdf],
+ output=options.output_file,
+ extras=[log, context],
+ )
+
+
+def run_pipeline(options):
+ options.verbose_abbreviated_path = 1
+ if os.environ.get('_OCRMYPDF_THREADS'):
+ options.use_threads = True
+
+ if not check_closed_streams(options):
+ return ExitCode.bad_args
+
+ logger_args = {'verbose': options.verbose, 'quiet': options.quiet}
+
+ _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
+ logging_factory, __name__, logger_args
+ )
+ preamble(_log)
+ check_code = check_options(options, _log)
+ if check_code != ExitCode.ok:
+ return check_code
+ check_dependency_versions(options, _log)
+
+ # Any changes to options will not take effect for options that are already
+ # bound to function parameters in the pipeline. (For example
+ # options.input_file, options.pdf_renderer are already bound.)
+ if not options.jobs:
+ options.jobs = available_cpu_count()
+
+ # Performance is improved by setting Tesseract to single threaded. In tests
+ # this gives better throughput than letting a smaller number of Tesseract
+ # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
+ # variable, but harmless to set if ignored.
+ os.environ.setdefault('OMP_THREAD_LIMIT', '1')
+
+ check_environ(options, _log)
+ if os.environ.get('PYTEST_CURRENT_TEST'):
+ os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file
+
+ try:
+ work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
+ options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
+ start_input_file = os.path.join(work_folder, 'origin')
+
+ check_input_file(options, _log, start_input_file)
+ check_requested_output_file(options, _log)
+
+ manager = JobContextManager()
+ manager.register('JobContext', JobContext) # pylint: disable=no-member
+ manager.start()
+
+ context = manager.JobContext() # pylint: disable=no-member
+ context.set_options(options)
+ context.set_work_folder(work_folder)
+
+ build_pipeline(options, work_folder, _log, context)
+ atexit.register(cleanup_working_files, work_folder, options)
+ if hasattr(os, 'nice'):
+ os.nice(5)
+ cmdline.run(options)
+ except ruffus_exceptions.RethrownJobError as e:
+ if options.verbose:
+ _log.debug(str(e)) # stringify exception so logger doesn't have to
+ exceptions = e.job_exceptions
+ exitcode = traverse_ruffus_exception(exceptions, options, _log)
+ if exitcode is None:
+ _log.error("Unexpected ruffus exception: " + str(e))
+ _log.error(repr(e))
+ return ExitCode.other_error
+ return exitcode
+ except ExitCodeException as e:
+ return e.exit_code
+ except Exception as e:
+ _log.error(str(e))
+ return ExitCode.other_error
+
+ if options.flowchart:
+ _log.info(f"Flowchart saved to {options.flowchart}")
+ return ExitCode.ok
+ elif options.output_file == '-':
+ _log.info("Output sent to stdout")
+ elif os.path.samefile(options.output_file, os.devnull):
+ pass # Say nothing when sending to dev null
+ else:
+ if options.output_type.startswith('pdfa'):
+ pdfa_info = file_claims_pdfa(options.output_file)
+ if pdfa_info['pass']:
+ msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
+ _log.info(msg)
+ else:
+ msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
+ _log.warning(msg)
+ return ExitCode.pdfa_conversion_failed
+ if not qpdf.check(options.output_file, _log):
+ _log.warning('Output file: The generated PDF is INVALID')
+ return ExitCode.invalid_output_pdf
+
+ report_output_file_size(options, _log, start_input_file, options.output_file)
+
+ pdfinfo = context.get_pdfinfo()
+ if options.verbose:
+ from pprint import pformat
+
+ _log.debug(pformat(pdfinfo))
+
+ log_page_orientations(pdfinfo, _log)
+
+ return ExitCode.ok
diff --git a/src/ocrmypdf/run.py b/src/ocrmypdf/_validation.py
similarity index 64%
rename from src/ocrmypdf/run.py
rename to src/ocrmypdf/_validation.py
index 9d24226f..871c412d 100644
--- a/src/ocrmypdf/run.py
+++ b/src/ocrmypdf/_validation.py
@@ -16,33 +16,20 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see .
-import atexit
+
import logging
import os
-import re
+
import sys
import textwrap
from pathlib import Path
-from tempfile import mkdtemp
import PIL
-import ruffus.cmdline as cmdline
-import ruffus.proxy_logger as proxy_logger
-import ruffus.ruffus_exceptions as ruffus_exceptions
from . import VERSION
-from . import exceptions as ocrmypdf_exceptions
-from ._jobcontext import JobContext, JobContextManager, cleanup_working_files
-from ._pipeline import build_pipeline
+
from ._unicodefun import verify_python3_env
-from .exceptions import (
- BadArgsError,
- ExitCode,
- ExitCodeException,
- InputFileError,
- MissingDependencyError,
- OutputFileAccessError,
-)
+
from .exec import (
ghostscript,
jbig2enc,
@@ -52,8 +39,14 @@ from .exec import (
unpaper,
pngquant,
)
-from .helpers import available_cpu_count, is_file_writable, re_symlink
-from .pdfa import file_claims_pdfa
+from .helpers import is_file_writable, re_symlink
+from .exceptions import (
+ BadArgsError,
+ ExitCode,
+ InputFileError,
+ MissingDependencyError,
+ OutputFileAccessError,
+)
# -------------
# External dependencies
@@ -64,9 +57,9 @@ HOCR_OK_LANGS = frozenset(['eng', 'deu', 'spa', 'ita', 'por'])
def complain(message):
print(*textwrap.wrap(message), file=sys.stderr)
+
# --------
# Critical environment tests
-
verify_python3_env()
@@ -305,102 +298,6 @@ def logging_factory(logger_name, logger_args):
return root_logger
-def cleanup_ruffus_error_message(msg):
- msg = re.sub(r'\s+', r' ', msg)
- msg = re.sub(r"\((.+?)\)", r'\1', msg)
- msg = msg.strip()
- return msg
-
-
-def do_ruffus_exception(ruffus_five_tuple, options, log):
- """Replace the elaborate ruffus stack trace with a user friendly
- description of the error message that occurred."""
- exit_code = None
-
- _task_name, _job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple
-
- if isinstance(exc_name, type):
- # ruffus is full of mystery... sometimes (probably when the process
- # group leader is killed) exc_name is the class object of the exception,
- # rather than a str. So reach into the object and get its name.
- exc_name = exc_name.__name__
-
- if exc_name.startswith('ocrmypdf.exceptions.'):
- base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
- exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
- exit_code = getattr(exc_class, 'exit_code', ExitCode.other_error)
- try:
- if isinstance(exc_value, exc_class):
- exc_msg = str(exc_value)
- elif isinstance(exc_value, str):
- exc_msg = exc_value
- else:
- exc_msg = str(exc_class())
- except Exception:
- exc_msg = "Unknown"
-
- if exc_name in ('builtins.SystemExit', 'SystemExit'):
- match = re.search(r"\.(.+?)\)", exc_value)
- exit_code_name = match.groups()[0]
- exit_code = getattr(ExitCode, exit_code_name, 'other_error')
- elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
- log.error(cleanup_ruffus_error_message(exc_value))
- exit_code = ExitCode.input_file
- elif exc_name in ('builtins.KeyboardInterrupt', 'KeyboardInterrupt'):
- # We have to print in this case because the log daemon might be toast
- print("Interrupted by user", file=sys.stderr)
- exit_code = ExitCode.ctrl_c
- elif exc_name == 'subprocess.CalledProcessError':
- # It's up to the subprocess handler to report something useful
- msg = "Error occurred while running this command:"
- log.error(msg + '\n' + exc_value)
- exit_code = ExitCode.child_process_error
- elif exc_name.startswith('ocrmypdf.exceptions.'):
- if exc_msg:
- log.error(exc_msg)
- elif exc_name == 'PIL.Image.DecompressionBombError':
- msg = cleanup_ruffus_error_message(exc_value)
- msg += (
- "\nUse the --max-image-mpixels argument to set increase the "
- "maximum number of megapixels to accept."
- )
- log.error(msg)
- exit_code = ExitCode.input_file
-
- if exit_code is not None:
- return exit_code
-
- if not options.verbose:
- log.error(exc_stack)
- return ExitCode.other_error
-
-
-def traverse_ruffus_exception(exceptions, options, log):
- """Traverse a RethrownJobError and output the exceptions
-
- Ruffus presents exceptions as 5 element tuples. The RethrownJobException
- has a list of exceptions like
- e.job_exceptions = [(5-tuple), (5-tuple), ...]
-
- ruffus < 2.7.0 had a bug with exception marshalling that would give
- different output whether the main or child process raised the exception.
- We no longer support this.
-
- Attempting to log the exception itself will re-marshall it to the logger
- which is normally running in another process. It's better to avoid re-
- marshalling.
-
- The exit code will be based on this, even if multiple exceptions occurred
- at the same time."""
-
- exit_codes = []
- for exc in exceptions:
- exit_code = do_ruffus_exception(exc, options, log)
- exit_codes.append(exit_code)
-
- return exit_codes[0] # Multiple codes are rare so take the first one
-
-
def check_closed_streams(options):
"""Work around Python issue with multiprocessing forking on closed streams
@@ -516,10 +413,7 @@ def check_requested_output_file(options, _log):
raise BadArgsError()
elif not is_file_writable(options.output_file):
_log.error(
- "Output file location ("
- + options.output_file
- + ") "
- + "is not a writable file."
+ "Output file location (" + options.output_file + ") is not a writable file."
)
raise OutputFileAccessError()
@@ -594,109 +488,3 @@ def check_dependency_versions(options, log):
version_checker=qpdf.version,
need_version='8.0.2',
)
-
-
-def run_pipeline(options):
- options.verbose_abbreviated_path = 1
- if os.environ.get('_OCRMYPDF_THREADS'):
- options.use_threads = True
-
- if not check_closed_streams(options):
- return ExitCode.bad_args
-
- logger_args = {'verbose': options.verbose, 'quiet': options.quiet}
-
- _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
- logging_factory, __name__, logger_args
- )
- preamble(_log)
- check_code = check_options(options, _log)
- if check_code != ExitCode.ok:
- return check_code
- check_dependency_versions(options, _log)
-
- # Any changes to options will not take effect for options that are already
- # bound to function parameters in the pipeline. (For example
- # options.input_file, options.pdf_renderer are already bound.)
- if not options.jobs:
- options.jobs = available_cpu_count()
-
- # Performance is improved by setting Tesseract to single threaded. In tests
- # this gives better throughput than letting a smaller number of Tesseract
- # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
- # variable, but harmless to set if ignored.
- os.environ.setdefault('OMP_THREAD_LIMIT', '1')
-
- check_environ(options, _log)
- if os.environ.get('PYTEST_CURRENT_TEST'):
- os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file
-
- try:
- work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
- options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
- start_input_file = os.path.join(work_folder, 'origin')
-
- check_input_file(options, _log, start_input_file)
- check_requested_output_file(options, _log)
-
- manager = JobContextManager()
- manager.register('JobContext', JobContext) # pylint: disable=no-member
- manager.start()
-
- context = manager.JobContext() # pylint: disable=no-member
- context.set_options(options)
- context.set_work_folder(work_folder)
-
- build_pipeline(options, work_folder, _log, context)
- atexit.register(cleanup_working_files, work_folder, options)
- if hasattr(os, 'nice'):
- os.nice(5)
- cmdline.run(options)
- except ruffus_exceptions.RethrownJobError as e:
- if options.verbose:
- _log.debug(str(e)) # stringify exception so logger doesn't have to
- exceptions = e.job_exceptions
- exitcode = traverse_ruffus_exception(exceptions, options, _log)
- if exitcode is None:
- _log.error("Unexpected ruffus exception: " + str(e))
- _log.error(repr(e))
- return ExitCode.other_error
- return exitcode
- except ExitCodeException as e:
- return e.exit_code
- except Exception as e:
- _log.error(str(e))
- return ExitCode.other_error
-
- if options.flowchart:
- _log.info(f"Flowchart saved to {options.flowchart}")
- return ExitCode.ok
- elif options.output_file == '-':
- _log.info("Output sent to stdout")
- elif os.path.samefile(options.output_file, os.devnull):
- pass # Say nothing when sending to dev null
- else:
- if options.output_type.startswith('pdfa'):
- pdfa_info = file_claims_pdfa(options.output_file)
- if pdfa_info['pass']:
- msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
- _log.info(msg)
- else:
- msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
- _log.warning(msg)
- return ExitCode.pdfa_conversion_failed
- if not qpdf.check(options.output_file, _log):
- _log.warning('Output file: The generated PDF is INVALID')
- return ExitCode.invalid_output_pdf
-
- report_output_file_size(options, _log, start_input_file, options.output_file)
-
- pdfinfo = context.get_pdfinfo()
- if options.verbose:
- from pprint import pformat
-
- _log.debug(pformat(pdfinfo))
-
- log_page_orientations(pdfinfo, _log)
-
- return ExitCode.ok
diff --git a/tests/test_unpaper.py b/tests/test_unpaper.py
index 9d5b60df..213ef7d0 100644
--- a/tests/test_unpaper.py
+++ b/tests/test_unpaper.py
@@ -15,15 +15,13 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see .
-import argparse
from os import fspath
-from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from ocrmypdf.__main__ import parser
-from ocrmypdf.run import check_options
+from ocrmypdf._validation import check_options
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.exec import unpaper