mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-06 13:47:41 -04:00
refactor: move ruffus related code to one file
This commit is contained in:
@@ -44,4 +44,3 @@ from . import hocrtransform
|
||||
from . import leptonica
|
||||
from . import pdfa
|
||||
from . import pdfinfo
|
||||
from . import run
|
||||
|
||||
@@ -21,7 +21,7 @@ import os
|
||||
import sys
|
||||
|
||||
from . import PROGRAM_NAME, VERSION
|
||||
from .run import run_pipeline
|
||||
from ._ruffus import run_pipeline
|
||||
|
||||
# Hack to help debugger context find /usr/local/bin
|
||||
if 'IDE_PROJECT_ROOTS' in os.environ:
|
||||
@@ -30,6 +30,7 @@ if 'IDE_PROJECT_ROOTS' in os.environ:
|
||||
# -------------
|
||||
# Parser
|
||||
|
||||
|
||||
def numeric(basetype, min_=None, max_=None):
|
||||
"""Validator for numeric params"""
|
||||
min_ = basetype(min_) if min_ is not None else None
|
||||
@@ -465,9 +466,11 @@ debugging.add_argument(
|
||||
'--flowchart', type=str, help="Generate the pipeline execution flowchart"
|
||||
)
|
||||
|
||||
|
||||
def run(args=None):
    """Parse the command line and hand the resulting options to the pipeline.

    ``args`` is forwarded unchanged to ``parser.parse_args``. Whatever
    ``run_pipeline`` returns for the parsed options is returned as-is.
    """
    parsed_options = parser.parse_args(args=args)
    pipeline_result = run_pipeline(parsed_options)
    return pipeline_result
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Executed as a script: use run()'s return value as the process exit status.
    exit_status = run()
    sys.exit(exit_status)
|
||||
|
||||
@@ -25,13 +25,11 @@ from shutil import copyfile, copyfileobj
|
||||
|
||||
import img2pdf
|
||||
from PIL import Image
|
||||
from ruffus import Pipeline, formatter, regex, suffix
|
||||
|
||||
import pikepdf
|
||||
from pikepdf.models.metadata import encode_pdf_date
|
||||
|
||||
from . import PROGRAM_NAME, VERSION, leptonica
|
||||
from ._weave import weave_layers
|
||||
from .exceptions import (
|
||||
DpiError,
|
||||
EncryptedPdfError,
|
||||
@@ -40,7 +38,12 @@ from .exceptions import (
|
||||
UnsupportedImageFormatError,
|
||||
)
|
||||
from .exec import ghostscript, tesseract
|
||||
from .helpers import flatten_groups, is_iterable_notstr, page_number, re_symlink
|
||||
from .helpers import (
|
||||
flatten_groups,
|
||||
is_iterable_notstr,
|
||||
page_number,
|
||||
re_symlink
|
||||
)
|
||||
from .hocrtransform import HocrTransform
|
||||
from .optimize import optimize
|
||||
from .pdfa import generate_pdfa_ps
|
||||
@@ -115,7 +118,10 @@ def triage_image_file(input_file, output_file, log, options):
|
||||
)
|
||||
with open(output_file, 'wb') as outf:
|
||||
img2pdf.convert(
|
||||
input_file, layout_fun=layout_fun, with_pdfrw=False, outputstream=outf
|
||||
input_file,
|
||||
layout_fun=layout_fun,
|
||||
with_pdfrw=False,
|
||||
outputstream=outf
|
||||
)
|
||||
log.info("Successfully converted to PDF, processing...")
|
||||
except img2pdf.ImageOpenError as e:
|
||||
@@ -170,7 +176,7 @@ def repair_and_parse_pdf(input_file, output_file, log, context):
|
||||
pdfinfo = PdfInfo(
|
||||
output_file, detailed_page_analysis=detailed_page_analysis, log=log
|
||||
)
|
||||
except pikepdf.PasswordError as e:
|
||||
except pikepdf.PasswordError:
|
||||
raise EncryptedPdfError()
|
||||
except pikepdf.PdfError as e:
|
||||
log.error(e)
|
||||
@@ -202,7 +208,8 @@ def repair_and_parse_pdf(input_file, output_file, log, context):
|
||||
raise PriorOcrFoundError()
|
||||
else:
|
||||
log.warning(
|
||||
"This PDF has a fillable form. Chances are it is a pure digital "
|
||||
"This PDF has a fillable form. "
|
||||
"Chances are it is a pure digital "
|
||||
"document that does not need OCR."
|
||||
)
|
||||
if not options.force_ocr:
|
||||
@@ -281,8 +288,7 @@ def is_ocr_required(pageinfo, log, options):
|
||||
elif options.redo_ocr:
|
||||
if pageinfo.has_corrupt_text:
|
||||
log.warning(
|
||||
prefix
|
||||
+ (
|
||||
prefix + (
|
||||
"some text on this page cannot be mapped to characters: "
|
||||
"consider using --force-ocr instead",
|
||||
)
|
||||
@@ -936,232 +942,3 @@ def copy_final(input_files, output_file, log, context):
|
||||
# get the appropriate umask, ownership, etc.
|
||||
with open(output_file, 'wb') as output_stream:
|
||||
copyfileobj(input_stream, output_stream)
|
||||
|
||||
|
||||
def build_pipeline(options, work_folder, log, context):
|
||||
main_pipeline = Pipeline.pipelines['main']
|
||||
|
||||
# Triage
|
||||
task_triage = main_pipeline.transform(
|
||||
task_func=triage,
|
||||
input=os.path.join(work_folder, 'origin'),
|
||||
filter=formatter('(?i)'),
|
||||
output=os.path.join(work_folder, 'origin.pdf'),
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
task_repair_and_parse_pdf = main_pipeline.transform(
|
||||
task_func=repair_and_parse_pdf,
|
||||
input=task_triage,
|
||||
filter=suffix('.pdf'),
|
||||
output='.repaired.pdf',
|
||||
output_dir=work_folder,
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
# Split (kwargs for split seems to be broken, so pass plain args)
|
||||
task_marker_pages = main_pipeline.split(
|
||||
marker_pages,
|
||||
task_repair_and_parse_pdf,
|
||||
os.path.join(work_folder, '*.marker.pdf'),
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
task_ocr_or_skip = main_pipeline.split(
|
||||
ocr_or_skip,
|
||||
task_marker_pages,
|
||||
[
|
||||
os.path.join(work_folder, '*.ocr.page.pdf'),
|
||||
os.path.join(work_folder, '*.skip.page.pdf'),
|
||||
],
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
# Rasterize preview
|
||||
task_rasterize_preview = main_pipeline.transform(
|
||||
task_func=rasterize_preview,
|
||||
input=task_ocr_or_skip,
|
||||
filter=suffix('.page.pdf'),
|
||||
output='.preview.jpg',
|
||||
output_dir=work_folder,
|
||||
extras=[log, context],
|
||||
)
|
||||
task_rasterize_preview.active_if(options.rotate_pages)
|
||||
|
||||
# Orient
|
||||
task_orient_page = main_pipeline.collate(
|
||||
task_func=orient_page,
|
||||
input=[task_ocr_or_skip, task_rasterize_preview],
|
||||
filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
|
||||
output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
# Rasterize actual
|
||||
task_rasterize_with_ghostscript = main_pipeline.transform(
|
||||
task_func=rasterize_with_ghostscript,
|
||||
input=task_orient_page,
|
||||
filter=suffix('.ocr.oriented.pdf'),
|
||||
output='.page.png',
|
||||
output_dir=work_folder,
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
# Preprocessing subpipeline
|
||||
task_preprocess_remove_background = main_pipeline.transform(
|
||||
task_func=preprocess_remove_background,
|
||||
input=task_rasterize_with_ghostscript,
|
||||
filter=suffix(".page.png"),
|
||||
output=".pp-background.png",
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
task_preprocess_deskew = main_pipeline.transform(
|
||||
task_func=preprocess_deskew,
|
||||
input=task_preprocess_remove_background,
|
||||
filter=suffix(".pp-background.png"),
|
||||
output=".pp-deskew.png",
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
task_preprocess_clean = main_pipeline.transform(
|
||||
task_func=preprocess_clean,
|
||||
input=task_preprocess_deskew,
|
||||
filter=suffix(".pp-deskew.png"),
|
||||
output=".pp-clean.png",
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
task_select_ocr_image = main_pipeline.collate(
|
||||
task_func=select_ocr_image,
|
||||
input=[task_preprocess_clean],
|
||||
filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
|
||||
output=os.path.join(work_folder, r"\1.ocr.png"),
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
# HOCR OCR
|
||||
task_ocr_tesseract_hocr = main_pipeline.transform(
|
||||
task_func=ocr_tesseract_hocr,
|
||||
input=task_select_ocr_image,
|
||||
filter=suffix(".ocr.png"),
|
||||
output=[".hocr", ".txt"],
|
||||
extras=[log, context],
|
||||
)
|
||||
task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
|
||||
task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
|
||||
|
||||
task_select_visible_page_image = main_pipeline.collate(
|
||||
task_func=select_visible_page_image,
|
||||
input=[
|
||||
task_rasterize_with_ghostscript,
|
||||
task_preprocess_remove_background,
|
||||
task_preprocess_deskew,
|
||||
task_preprocess_clean,
|
||||
],
|
||||
filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
|
||||
output=os.path.join(work_folder, r'\1.image'),
|
||||
extras=[log, context],
|
||||
)
|
||||
task_select_visible_page_image.graphviz(shape='diamond')
|
||||
|
||||
task_select_image_layer = main_pipeline.collate(
|
||||
task_func=select_image_layer,
|
||||
input=[task_select_visible_page_image, task_orient_page],
|
||||
filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
|
||||
output=os.path.join(work_folder, r'\1.image-layer.pdf'),
|
||||
extras=[log, context],
|
||||
)
|
||||
task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond')
|
||||
|
||||
task_render_hocr_page = main_pipeline.transform(
|
||||
task_func=render_hocr_page,
|
||||
input=task_ocr_tesseract_hocr,
|
||||
filter=regex(r".*/(\d{6})(?:\.hocr)"),
|
||||
output=os.path.join(work_folder, r'\1.text.pdf'),
|
||||
extras=[log, context],
|
||||
)
|
||||
task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
|
||||
task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')
|
||||
|
||||
# Tesseract OCR + text only PDF
|
||||
task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
|
||||
task_func=ocr_tesseract_textonly_pdf,
|
||||
input=[task_select_ocr_image],
|
||||
filter=regex(r".*/(\d{6})(?:\.ocr.png)"),
|
||||
output=[
|
||||
os.path.join(work_folder, r'\1.text.pdf'),
|
||||
os.path.join(work_folder, r'\1.text.txt'),
|
||||
],
|
||||
extras=[log, context],
|
||||
)
|
||||
task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
|
||||
task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')
|
||||
|
||||
task_weave_layers = main_pipeline.collate(
|
||||
task_func=weave_layers,
|
||||
input=[
|
||||
task_repair_and_parse_pdf,
|
||||
task_render_hocr_page,
|
||||
task_ocr_tesseract_textonly_pdf,
|
||||
task_select_image_layer,
|
||||
],
|
||||
filter=regex(
|
||||
r".*/((?:\d{6}(?:\.text\.pdf|\.image-layer\.pdf))|(?:origin\.repaired\.pdf))"
|
||||
),
|
||||
output=os.path.join(work_folder, r'layers.rendered.pdf'),
|
||||
extras=[log, context],
|
||||
)
|
||||
task_weave_layers.graphviz(fillcolor='"#00cc66"')
|
||||
|
||||
# PDF/A pdfmark
|
||||
task_generate_postscript_stub = main_pipeline.transform(
|
||||
task_func=generate_postscript_stub,
|
||||
input=task_repair_and_parse_pdf,
|
||||
filter=formatter(r'\.repaired\.pdf'),
|
||||
output=os.path.join(work_folder, 'pdfa.ps'),
|
||||
extras=[log, context],
|
||||
)
|
||||
task_generate_postscript_stub.active_if(options.output_type.startswith('pdfa'))
|
||||
|
||||
# PDF/A conversion
|
||||
task_convert_to_pdfa = main_pipeline.merge(
|
||||
task_func=convert_to_pdfa,
|
||||
input=[task_generate_postscript_stub, task_weave_layers],
|
||||
output=os.path.join(work_folder, 'pdfa.pdf'),
|
||||
extras=[log, context],
|
||||
)
|
||||
task_convert_to_pdfa.active_if(options.output_type.startswith('pdfa'))
|
||||
|
||||
task_metadata_fixup = main_pipeline.merge(
|
||||
task_func=metadata_fixup,
|
||||
input=[task_repair_and_parse_pdf, task_weave_layers, task_convert_to_pdfa],
|
||||
output=os.path.join(work_folder, 'metafix.pdf'),
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
task_merge_sidecars = main_pipeline.merge(
|
||||
task_func=merge_sidecars,
|
||||
input=[task_ocr_tesseract_hocr, task_ocr_tesseract_textonly_pdf],
|
||||
output=options.sidecar,
|
||||
extras=[log, context],
|
||||
)
|
||||
task_merge_sidecars.active_if(options.sidecar)
|
||||
|
||||
# Optimize
|
||||
task_optimize_pdf = main_pipeline.transform(
|
||||
task_func=optimize_pdf,
|
||||
input=task_metadata_fixup,
|
||||
filter=suffix('.pdf'),
|
||||
output='.optimized.pdf',
|
||||
output_dir=work_folder,
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
# Finalize
|
||||
main_pipeline.merge(
|
||||
task_func=copy_final,
|
||||
input=[task_optimize_pdf],
|
||||
output=options.output_file,
|
||||
extras=[log, context],
|
||||
)
|
||||
|
||||
508
src/ocrmypdf/_ruffus.py
Normal file
508
src/ocrmypdf/_ruffus.py
Normal file
@@ -0,0 +1,508 @@
|
||||
# © 2016 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import atexit
|
||||
from tempfile import mkdtemp
|
||||
from ruffus import (
|
||||
Pipeline,
|
||||
formatter,
|
||||
regex,
|
||||
suffix,
|
||||
cmdline,
|
||||
proxy_logger,
|
||||
ruffus_exceptions
|
||||
)
|
||||
from .exec import qpdf
|
||||
from ._jobcontext import JobContext, JobContextManager, cleanup_working_files
|
||||
from ._weave import weave_layers
|
||||
from ._pipeline import (
|
||||
triage,
|
||||
repair_and_parse_pdf,
|
||||
marker_pages,
|
||||
ocr_or_skip,
|
||||
rasterize_preview,
|
||||
orient_page,
|
||||
rasterize_with_ghostscript,
|
||||
preprocess_remove_background,
|
||||
preprocess_deskew,
|
||||
preprocess_clean,
|
||||
select_ocr_image,
|
||||
ocr_tesseract_hocr,
|
||||
select_visible_page_image,
|
||||
select_image_layer,
|
||||
render_hocr_page,
|
||||
ocr_tesseract_textonly_pdf,
|
||||
generate_postscript_stub,
|
||||
convert_to_pdfa,
|
||||
metadata_fixup,
|
||||
merge_sidecars,
|
||||
optimize_pdf,
|
||||
copy_final
|
||||
)
|
||||
from . import exceptions as ocrmypdf_exceptions
|
||||
from .exceptions import (
|
||||
ExitCode,
|
||||
ExitCodeException,
|
||||
)
|
||||
from .helpers import available_cpu_count
|
||||
from .pdfa import file_claims_pdfa
|
||||
from ._validation import (
|
||||
check_closed_streams,
|
||||
preamble,
|
||||
check_options,
|
||||
check_dependency_versions,
|
||||
check_environ,
|
||||
check_input_file,
|
||||
check_requested_output_file,
|
||||
report_output_file_size,
|
||||
log_page_orientations,
|
||||
logging_factory,
|
||||
)
|
||||
|
||||
|
||||
def cleanup_ruffus_error_message(msg):
    """Condense a ruffus error message into a single tidy line.

    Every run of whitespace becomes one space, each non-empty ``(...)`` group
    is replaced by its contents (non-greedy, so nested parentheses are only
    partially unwrapped), and leading/trailing whitespace is removed.
    """
    single_spaced = re.sub(r'\s+', r' ', msg)
    without_parens = re.sub(r"\((.+?)\)", r'\1', single_spaced)
    return without_parens.strip()
|
||||
|
||||
|
||||
def do_ruffus_exception(ruffus_five_tuple, options, log):
    """Replace the elaborate ruffus stack trace with a user friendly
    description of the error message that occurred.

    Args:
        ruffus_five_tuple: ``(task_name, job_name, exc_name, exc_value,
            exc_stack)`` as reported by ruffus. ``exc_name`` is normally a
            dotted name string but may be the exception class itself.
        options: parsed options; only ``options.verbose`` is read here.
        log: logger used to report the error.

    Returns:
        The ExitCode that best describes the exception, or
        ``ExitCode.other_error`` when the exception is not recognized.
    """
    exit_code = None
    exc_msg = None

    _task_name, _job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple

    if isinstance(exc_name, type):
        # ruffus is full of mystery... sometimes (probably when the process
        # group leader is killed) exc_name is the class object of the exception,
        # rather than a str. So reach into the object and get its name.
        exc_name = exc_name.__name__

    # exc_value is usually a str, but the isinstance check below shows it can
    # also be an exception object. Use a str form wherever text is required so
    # that the regex and concatenation branches cannot raise TypeError.
    exc_text = exc_value if isinstance(exc_value, str) else str(exc_value)

    if exc_name.startswith('ocrmypdf.exceptions.'):
        base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
        # An unrecognized exception name must not crash the error reporter;
        # fall back to None and, through it, to ExitCode.other_error.
        exc_class = getattr(ocrmypdf_exceptions, base_exc_name, None)
        exit_code = getattr(exc_class, 'exit_code', ExitCode.other_error)
        try:
            if exc_class is not None and isinstance(exc_value, exc_class):
                exc_msg = str(exc_value)
            elif isinstance(exc_value, str):
                exc_msg = exc_value
            elif exc_class is not None:
                exc_msg = str(exc_class())
            else:
                exc_msg = "Unknown"
        except Exception:
            exc_msg = "Unknown"

    if exc_name in ('builtins.SystemExit', 'SystemExit'):
        # Pull the ExitCode member name out of text shaped like "....name)".
        match = re.search(r"\.(.+?)\)", exc_text)
        exit_code_name = match.group(1) if match else 'other_error'
        # Fall back to the enum member, not the string 'other_error'.
        exit_code = getattr(ExitCode, exit_code_name, ExitCode.other_error)
    elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
        log.error(cleanup_ruffus_error_message(exc_text))
        exit_code = ExitCode.input_file
    elif exc_name in ('builtins.KeyboardInterrupt', 'KeyboardInterrupt'):
        # We have to print in this case because the log daemon might be toast
        print("Interrupted by user", file=sys.stderr)
        exit_code = ExitCode.ctrl_c
    elif exc_name == 'subprocess.CalledProcessError':
        # It's up to the subprocess handler to report something useful
        msg = "Error occurred while running this command:"
        log.error(msg + '\n' + exc_text)
        exit_code = ExitCode.child_process_error
    elif exc_name.startswith('ocrmypdf.exceptions.'):
        # exit_code was already chosen above; only the message is reported.
        if exc_msg:
            log.error(exc_msg)
    elif exc_name == 'PIL.Image.DecompressionBombError':
        msg = cleanup_ruffus_error_message(exc_text)
        msg += (
            "\nUse the --max-image-mpixels argument to increase the "
            "maximum number of megapixels to accept."
        )
        log.error(msg)
        exit_code = ExitCode.input_file

    if exit_code is not None:
        return exit_code

    # Unrecognized exception: show the stack unless verbose mode is on
    # (presumably the caller has already logged the full error in that case).
    if not options.verbose:
        log.error(exc_stack)
    return ExitCode.other_error
|
||||
|
||||
|
||||
def traverse_ruffus_exception(exceptions, options, log):
    """Traverse a RethrownJobError and output the exceptions

    Ruffus presents exceptions as 5 element tuples. The RethrownJobError
    has a list of exceptions like
        e.job_exceptions = [(5-tuple), (5-tuple), ...]

    ruffus < 2.7.0 had a bug with exception marshalling that would give
    different output whether the main or child process raised the exception.
    We no longer support this.

    Attempting to log the exception itself will re-marshall it to the logger
    which is normally running in another process. It's better to avoid re-
    marshalling.

    Every exception is reported through ``do_ruffus_exception``. The exit
    code returned is that of the first exception, even if multiple exceptions
    occurred at the same time. ``None`` is returned when ``exceptions`` is
    empty, so the caller can detect that no exit code could be determined
    instead of hitting an IndexError.
    """
    # Process (and thereby log) every exception, not just the first one.
    exit_codes = [do_ruffus_exception(exc, options, log) for exc in exceptions]

    if not exit_codes:
        return None
    return exit_codes[0]  # Multiple codes are rare so take the first one
|
||||
|
||||
|
||||
def build_pipeline(options, work_folder, log, context):
    """Add every OCR processing task to the ruffus pipeline named 'main'.

    Args:
        options: parsed options. This function reads ``rotate_pages``,
            ``pdf_renderer``, ``output_type``, ``sidecar`` and
            ``output_file`` to decide which tasks are active and where the
            final outputs go.
        work_folder: directory in which all intermediate files are placed.
        log: passed to every task function as its first extra argument.
        context: passed to every task function as its second extra argument.

    Nothing is returned; the tasks are attached to
    ``Pipeline.pipelines['main']``.
    """
    main_pipeline = Pipeline.pipelines['main']

    # Triage: <work_folder>/origin -> <work_folder>/origin.pdf
    task_triage = main_pipeline.transform(
        task_func=triage,
        input=os.path.join(work_folder, 'origin'),
        filter=formatter('(?i)'),
        output=os.path.join(work_folder, 'origin.pdf'),
        extras=[log, context],
    )

    # origin.pdf -> origin.repaired.pdf
    task_repair_and_parse_pdf = main_pipeline.transform(
        task_func=repair_and_parse_pdf,
        input=task_triage,
        filter=suffix('.pdf'),
        output='.repaired.pdf',
        output_dir=work_folder,
        extras=[log, context],
    )

    # Split (kwargs for split seems to be broken, so pass plain args)
    task_marker_pages = main_pipeline.split(
        marker_pages,
        task_repair_and_parse_pdf,
        os.path.join(work_folder, '*.marker.pdf'),
        extras=[log, context],
    )

    # Each marker becomes either an *.ocr.page.pdf or a *.skip.page.pdf
    task_ocr_or_skip = main_pipeline.split(
        ocr_or_skip,
        task_marker_pages,
        [
            os.path.join(work_folder, '*.ocr.page.pdf'),
            os.path.join(work_folder, '*.skip.page.pdf'),
        ],
        extras=[log, context],
    )

    # Rasterize preview (only active when options.rotate_pages is set)
    task_rasterize_preview = main_pipeline.transform(
        task_func=rasterize_preview,
        input=task_ocr_or_skip,
        filter=suffix('.page.pdf'),
        output='.preview.jpg',
        output_dir=work_folder,
        extras=[log, context],
    )
    task_rasterize_preview.active_if(options.rotate_pages)

    # Orient: the regex captures the six-digit page number (\1) and the
    # .ocr/.skip marker (\2), which together name the .oriented.pdf output
    task_orient_page = main_pipeline.collate(
        task_func=orient_page,
        input=[task_ocr_or_skip, task_rasterize_preview],
        filter=regex(r".*/(\d{6})(\.ocr|\.skip)(?:\.page\.pdf|\.preview\.jpg)"),
        output=os.path.join(work_folder, r'\1\2.oriented.pdf'),
        extras=[log, context],
    )

    # Rasterize actual (the suffix filter selects only .ocr.oriented.pdf files)
    task_rasterize_with_ghostscript = main_pipeline.transform(
        task_func=rasterize_with_ghostscript,
        input=task_orient_page,
        filter=suffix('.ocr.oriented.pdf'),
        output='.page.png',
        output_dir=work_folder,
        extras=[log, context],
    )

    # Preprocessing subpipeline:
    # .page.png -> .pp-background.png -> .pp-deskew.png -> .pp-clean.png
    task_preprocess_remove_background = main_pipeline.transform(
        task_func=preprocess_remove_background,
        input=task_rasterize_with_ghostscript,
        filter=suffix(".page.png"),
        output=".pp-background.png",
        extras=[log, context],
    )

    task_preprocess_deskew = main_pipeline.transform(
        task_func=preprocess_deskew,
        input=task_preprocess_remove_background,
        filter=suffix(".pp-background.png"),
        output=".pp-deskew.png",
        extras=[log, context],
    )

    task_preprocess_clean = main_pipeline.transform(
        task_func=preprocess_clean,
        input=task_preprocess_deskew,
        filter=suffix(".pp-deskew.png"),
        output=".pp-clean.png",
        extras=[log, context],
    )

    # The image handed to OCR is named <page number>.ocr.png
    task_select_ocr_image = main_pipeline.collate(
        task_func=select_ocr_image,
        input=[task_preprocess_clean],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r"\1.ocr.png"),
        extras=[log, context],
    )

    # HOCR OCR (only active when options.pdf_renderer == 'hocr')
    task_ocr_tesseract_hocr = main_pipeline.transform(
        task_func=ocr_tesseract_hocr,
        input=task_select_ocr_image,
        filter=suffix(".ocr.png"),
        output=[".hocr", ".txt"],
        extras=[log, context],
    )
    task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
    task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')

    # Gathers the rasterized page and every preprocessing stage's output for
    # a page number into a single <page number>.image output
    task_select_visible_page_image = main_pipeline.collate(
        task_func=select_visible_page_image,
        input=[
            task_rasterize_with_ghostscript,
            task_preprocess_remove_background,
            task_preprocess_deskew,
            task_preprocess_clean,
        ],
        filter=regex(r".*/(\d{6})(?:\.page|\.pp-.*)\.png"),
        output=os.path.join(work_folder, r'\1.image'),
        extras=[log, context],
    )
    task_select_visible_page_image.graphviz(shape='diamond')

    # <page number>.image and <page number>.ocr.oriented.pdf
    # -> <page number>.image-layer.pdf
    task_select_image_layer = main_pipeline.collate(
        task_func=select_image_layer,
        input=[task_select_visible_page_image, task_orient_page],
        filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
        output=os.path.join(work_folder, r'\1.image-layer.pdf'),
        extras=[log, context],
    )
    task_select_image_layer.graphviz(fillcolor='"#00cc66"', shape='diamond')

    # .hocr -> <page number>.text.pdf (only for the 'hocr' renderer)
    task_render_hocr_page = main_pipeline.transform(
        task_func=render_hocr_page,
        input=task_ocr_tesseract_hocr,
        filter=regex(r".*/(\d{6})(?:\.hocr)"),
        output=os.path.join(work_folder, r'\1.text.pdf'),
        extras=[log, context],
    )
    task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
    task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')

    # Tesseract OCR + text only PDF (only for the 'sandwich' renderer);
    # writes the same <page number>.text.pdf name as the hocr path above
    task_ocr_tesseract_textonly_pdf = main_pipeline.collate(
        task_func=ocr_tesseract_textonly_pdf,
        input=[task_select_ocr_image],
        filter=regex(r".*/(\d{6})(?:\.ocr.png)"),
        output=[
            os.path.join(work_folder, r'\1.text.pdf'),
            os.path.join(work_folder, r'\1.text.txt'),
        ],
        extras=[log, context],
    )
    task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
    task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'sandwich')

    # All text layers, image layers and the repaired original are combined
    # into the single output layers.rendered.pdf
    task_weave_layers = main_pipeline.collate(
        task_func=weave_layers,
        input=[
            task_repair_and_parse_pdf,
            task_render_hocr_page,
            task_ocr_tesseract_textonly_pdf,
            task_select_image_layer,
        ],
        filter=regex(
            r".*/((?:\d{6}(?:\.text\.pdf|\.image-layer\.pdf))|(?:origin\.repaired\.pdf))"
        ),
        output=os.path.join(work_folder, r'layers.rendered.pdf'),
        extras=[log, context],
    )
    task_weave_layers.graphviz(fillcolor='"#00cc66"')

    # PDF/A pdfmark (only when options.output_type starts with 'pdfa')
    task_generate_postscript_stub = main_pipeline.transform(
        task_func=generate_postscript_stub,
        input=task_repair_and_parse_pdf,
        filter=formatter(r'\.repaired\.pdf'),
        output=os.path.join(work_folder, 'pdfa.ps'),
        extras=[log, context],
    )
    task_generate_postscript_stub.active_if(options.output_type.startswith('pdfa'))

    # PDF/A conversion (only when options.output_type starts with 'pdfa')
    task_convert_to_pdfa = main_pipeline.merge(
        task_func=convert_to_pdfa,
        input=[task_generate_postscript_stub, task_weave_layers],
        output=os.path.join(work_folder, 'pdfa.pdf'),
        extras=[log, context],
    )
    task_convert_to_pdfa.active_if(options.output_type.startswith('pdfa'))

    # Always active; takes both the woven layers and the PDF/A task as inputs
    task_metadata_fixup = main_pipeline.merge(
        task_func=metadata_fixup,
        input=[task_repair_and_parse_pdf, task_weave_layers, task_convert_to_pdfa],
        output=os.path.join(work_folder, 'metafix.pdf'),
        extras=[log, context],
    )

    # Sidecar text file (only when options.sidecar is set); fed by whichever
    # of the two OCR tasks is active
    task_merge_sidecars = main_pipeline.merge(
        task_func=merge_sidecars,
        input=[task_ocr_tesseract_hocr, task_ocr_tesseract_textonly_pdf],
        output=options.sidecar,
        extras=[log, context],
    )
    task_merge_sidecars.active_if(options.sidecar)

    # Optimize: metafix.pdf -> metafix.optimized.pdf
    task_optimize_pdf = main_pipeline.transform(
        task_func=optimize_pdf,
        input=task_metadata_fixup,
        filter=suffix('.pdf'),
        output='.optimized.pdf',
        output_dir=work_folder,
        extras=[log, context],
    )

    # Finalize: copy the optimized PDF to the user's requested output file
    main_pipeline.merge(
        task_func=copy_final,
        input=[task_optimize_pdf],
        output=options.output_file,
        extras=[log, context],
    )
|
||||
|
||||
|
||||
def run_pipeline(options):
    """Validate the options, run the ruffus pipeline and report the result.

    Args:
        options: parsed command line options. Several attributes are set on
            it here (``verbose_abbreviated_path``, ``use_threads``, ``jobs``,
            ``history_file``) before it is handed to ``cmdline.run``.

    Returns:
        An ExitCode value: ``ExitCode.ok`` on success, otherwise a code
        describing the validation, pipeline or output check that failed.
    """
    options.verbose_abbreviated_path = 1
    # The _OCRMYPDF_THREADS environment variable forces options.use_threads on
    if os.environ.get('_OCRMYPDF_THREADS'):
        options.use_threads = True

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args
    )
    preamble(_log)
    check_code = check_options(options, _log)
    if check_code != ExitCode.ok:
        return check_code
    check_dependency_versions(options, _log)

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    # Performance is improved by setting Tesseract to single threaded. In tests
    # this gives better throughput than letting a smaller number of Tesseract
    # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
    # variable, but harmless to set if ignored.
    os.environ.setdefault('OMP_THREAD_LIMIT', '1')

    check_environ(options, _log)
    # Under pytest, expose the input file name through the environment
    if os.environ.get('PYTEST_CURRENT_TEST'):
        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file

    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(work_folder, 'origin')

        check_input_file(options, _log, start_input_file)
        check_requested_output_file(options, _log)

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        # NOTE(review): cleanup is registered only after the pipeline is
        # built, so a failure earlier in this try block leaves work_folder
        # without a registered cleanup -- confirm whether that is intended.
        atexit.register(cleanup_working_files, work_folder, options)
        # os.nice is not available on every platform
        if hasattr(os, 'nice'):
            os.nice(5)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to
        exceptions = e.job_exceptions
        exitcode = traverse_ruffus_exception(exceptions, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        # Top-level boundary: report the message and turn it into an exit code
        _log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        _log.info(f"Flowchart saved to {options.flowchart}")
        return ExitCode.ok
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        # A regular output file: verify it before reporting success
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
                _log.info(msg)
            else:
                msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
                _log.warning(msg)
                return ExitCode.pdfa_conversion_failed
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

        report_output_file_size(options, _log, start_input_file, options.output_file)

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat

        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
|
||||
@@ -16,33 +16,20 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import atexit
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
import sys
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
from tempfile import mkdtemp
|
||||
|
||||
import PIL
|
||||
import ruffus.cmdline as cmdline
|
||||
import ruffus.proxy_logger as proxy_logger
|
||||
import ruffus.ruffus_exceptions as ruffus_exceptions
|
||||
|
||||
from . import VERSION
|
||||
from . import exceptions as ocrmypdf_exceptions
|
||||
from ._jobcontext import JobContext, JobContextManager, cleanup_working_files
|
||||
from ._pipeline import build_pipeline
|
||||
|
||||
from ._unicodefun import verify_python3_env
|
||||
from .exceptions import (
|
||||
BadArgsError,
|
||||
ExitCode,
|
||||
ExitCodeException,
|
||||
InputFileError,
|
||||
MissingDependencyError,
|
||||
OutputFileAccessError,
|
||||
)
|
||||
|
||||
from .exec import (
|
||||
ghostscript,
|
||||
jbig2enc,
|
||||
@@ -52,8 +39,14 @@ from .exec import (
|
||||
unpaper,
|
||||
pngquant,
|
||||
)
|
||||
from .helpers import available_cpu_count, is_file_writable, re_symlink
|
||||
from .pdfa import file_claims_pdfa
|
||||
from .helpers import is_file_writable, re_symlink
|
||||
from .exceptions import (
|
||||
BadArgsError,
|
||||
ExitCode,
|
||||
InputFileError,
|
||||
MissingDependencyError,
|
||||
OutputFileAccessError,
|
||||
)
|
||||
|
||||
# -------------
|
||||
# External dependencies
|
||||
@@ -64,9 +57,9 @@ HOCR_OK_LANGS = frozenset(['eng', 'deu', 'spa', 'ita', 'por'])
|
||||
def complain(message):
    """Print *message* to stderr, wrapped to the default textwrap width.

    The wrapped lines are joined with newlines (``textwrap.fill``) so that
    long messages are really broken across lines; unpacking the result of
    ``textwrap.wrap`` into ``print`` would re-join them with spaces.
    """
    print(textwrap.fill(message), file=sys.stderr)
|
||||
|
||||
|
||||
# --------
|
||||
# Critical environment tests
|
||||
|
||||
verify_python3_env()
|
||||
|
||||
|
||||
@@ -305,102 +298,6 @@ def logging_factory(logger_name, logger_args):
|
||||
return root_logger
|
||||
|
||||
|
||||
def cleanup_ruffus_error_message(msg):
    """Return *msg* flattened to one line with parentheses removed.

    Runs of whitespace are collapsed to a single space, each
    ``( ... )`` group is replaced by its contents, and leading/trailing
    whitespace is stripped.
    """
    single_spaced = re.sub(r'\s+', r' ', msg)
    without_parens = re.sub(r"\((.+?)\)", r'\1', single_spaced)
    return without_parens.strip()
|
||||
|
||||
|
||||
def do_ruffus_exception(ruffus_five_tuple, options, log):
    """Replace the elaborate ruffus stack trace with a user friendly
    description of the error message that occurred.

    Args:
        ruffus_five_tuple: ``(task_name, job_name, exc_name, exc_value,
            exc_stack)`` as reported by ruffus for one failed job.
        options: parsed options; only ``options.verbose`` is read here.
        log: logger used to report the error.

    Returns:
        The exit code that corresponds to the exception. Unrecognized
        exceptions yield ``ExitCode.other_error``.
    """
    exit_code = None

    _task_name, _job_name, exc_name, exc_value, exc_stack = ruffus_five_tuple

    if isinstance(exc_name, type):
        # ruffus is full of mystery... sometimes (probably when the process
        # group leader is killed) exc_name is the class object of the exception,
        # rather than a str. So reach into the object and get its name.
        exc_name = exc_name.__name__

    if exc_name.startswith('ocrmypdf.exceptions.'):
        base_exc_name = exc_name.replace('ocrmypdf.exceptions.', '')
        exc_class = getattr(ocrmypdf_exceptions, base_exc_name)
        exit_code = getattr(exc_class, 'exit_code', ExitCode.other_error)
        try:
            if isinstance(exc_value, exc_class):
                exc_msg = str(exc_value)
            elif isinstance(exc_value, str):
                exc_msg = exc_value
            else:
                exc_msg = str(exc_class())
        except Exception:
            exc_msg = "Unknown"

    if exc_name in ('builtins.SystemExit', 'SystemExit'):
        # The exit code name is extracted from the text between the first
        # '.' and the following ')' of exc_value.
        # NOTE(review): presumably exc_value looks like
        # "SystemExit(ExitCode.some_name)" -- confirm against ruffus output.
        match = re.search(r"\.(.+?)\)", exc_value)
        # Fall back to a real ExitCode member (not a bare string) when the
        # text does not match or names an unknown exit code.
        exit_code = ExitCode.other_error
        if match is not None:
            exit_code = getattr(ExitCode, match.group(1), ExitCode.other_error)
    elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError':
        log.error(cleanup_ruffus_error_message(exc_value))
        exit_code = ExitCode.input_file
    elif exc_name in ('builtins.KeyboardInterrupt', 'KeyboardInterrupt'):
        # We have to print in this case because the log daemon might be toast
        print("Interrupted by user", file=sys.stderr)
        exit_code = ExitCode.ctrl_c
    elif exc_name == 'subprocess.CalledProcessError':
        # It's up to the subprocess handler to report something useful
        msg = "Error occurred while running this command:"
        log.error(msg + '\n' + exc_value)
        exit_code = ExitCode.child_process_error
    elif exc_name.startswith('ocrmypdf.exceptions.'):
        # exit_code and exc_msg were already set by the block above
        if exc_msg:
            log.error(exc_msg)
    elif exc_name == 'PIL.Image.DecompressionBombError':
        msg = cleanup_ruffus_error_message(exc_value)
        msg += (
            "\nUse the --max-image-mpixels argument to set increase the "
            "maximum number of megapixels to accept."
        )
        log.error(msg)
        exit_code = ExitCode.input_file

    if exit_code is not None:
        return exit_code

    # Unrecognized exception: show the stack unless verbose mode is on
    if not options.verbose:
        log.error(exc_stack)
    return ExitCode.other_error
|
||||
|
||||
|
||||
def traverse_ruffus_exception(exceptions, options, log):
    """Traverse a RethrownJobError and output the exceptions

    Ruffus presents exceptions as 5 element tuples. The RethrownJobException
    has a list of exceptions like
        e.job_exceptions = [(5-tuple), (5-tuple), ...]

    ruffus < 2.7.0 had a bug with exception marshalling that would give
    different output whether the main or child process raised the exception.
    We no longer support this.

    Attempting to log the exception itself will re-marshall it to the logger
    which is normally running in another process. It's better to avoid re-
    marshalling.

    The exit code will be based on this, even if multiple exceptions occurred
    at the same time.

    Returns:
        The exit code for the first exception, or None when *exceptions*
        is empty.
    """

    # Every exception is processed (and therefore reported), even though
    # only the first exit code is returned.
    exit_codes = [do_ruffus_exception(exc, options, log) for exc in exceptions]

    if not exit_codes:
        return None
    return exit_codes[0]  # Multiple codes are rare so take the first one
|
||||
|
||||
|
||||
def check_closed_streams(options):
|
||||
"""Work around Python issue with multiprocessing forking on closed streams
|
||||
|
||||
@@ -516,10 +413,7 @@ def check_requested_output_file(options, _log):
|
||||
raise BadArgsError()
|
||||
elif not is_file_writable(options.output_file):
|
||||
_log.error(
|
||||
"Output file location ("
|
||||
+ options.output_file
|
||||
+ ") "
|
||||
+ "is not a writable file."
|
||||
"Output file location (" + options.output_file + ") is not a writable file."
|
||||
)
|
||||
raise OutputFileAccessError()
|
||||
|
||||
@@ -594,109 +488,3 @@ def check_dependency_versions(options, log):
|
||||
version_checker=qpdf.version,
|
||||
need_version='8.0.2',
|
||||
)
|
||||
|
||||
|
||||
def run_pipeline(options):
    """Validate *options*, build the ruffus pipeline and run it.

    Args:
        options: parsed command line options. Several attributes are set on
            it here (``verbose_abbreviated_path``, ``use_threads``, ``jobs``,
            ``history_file``).

    Returns:
        An ``ExitCode`` value describing the outcome.
    """
    options.verbose_abbreviated_path = 1
    if os.environ.get('_OCRMYPDF_THREADS'):
        options.use_threads = True

    if not check_closed_streams(options):
        return ExitCode.bad_args

    logger_args = {'verbose': options.verbose, 'quiet': options.quiet}

    _log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
        logging_factory, __name__, logger_args
    )
    preamble(_log)
    check_code = check_options(options, _log)
    if check_code != ExitCode.ok:
        return check_code
    check_dependency_versions(options, _log)

    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()

    # Performance is improved by setting Tesseract to single threaded. In tests
    # this gives better throughput than letting a smaller number of Tesseract
    # jobs run multithreaded. Same story for pngquant. Tess <4 ignores this
    # variable, but harmless to set if ignored.
    os.environ.setdefault('OMP_THREAD_LIMIT', '1')

    check_environ(options, _log)
    if os.environ.get('PYTEST_CURRENT_TEST'):
        os.environ['_OCRMYPDF_TEST_INFILE'] = options.input_file

    try:
        work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
        # Register cleanup as soon as the folder exists, so a failure in any
        # of the checks or in pipeline construction below does not leave it
        # behind.
        atexit.register(cleanup_working_files, work_folder, options)
        options.history_file = os.path.join(work_folder, 'ruffus_history.sqlite')
        start_input_file = os.path.join(work_folder, 'origin')

        check_input_file(options, _log, start_input_file)
        check_requested_output_file(options, _log)

        manager = JobContextManager()
        manager.register('JobContext', JobContext)  # pylint: disable=no-member
        manager.start()

        context = manager.JobContext()  # pylint: disable=no-member
        context.set_options(options)
        context.set_work_folder(work_folder)

        build_pipeline(options, work_folder, _log, context)
        if hasattr(os, 'nice'):
            os.nice(5)
        cmdline.run(options)
    except ruffus_exceptions.RethrownJobError as e:
        if options.verbose:
            _log.debug(str(e))  # stringify exception so logger doesn't have to
        exceptions = e.job_exceptions
        exitcode = traverse_ruffus_exception(exceptions, options, _log)
        if exitcode is None:
            _log.error("Unexpected ruffus exception: " + str(e))
            _log.error(repr(e))
            return ExitCode.other_error
        return exitcode
    except ExitCodeException as e:
        return e.exit_code
    except Exception as e:
        _log.error(str(e))
        return ExitCode.other_error

    if options.flowchart:
        _log.info(f"Flowchart saved to {options.flowchart}")
        return ExitCode.ok
    elif options.output_file == '-':
        _log.info("Output sent to stdout")
    elif os.path.samefile(options.output_file, os.devnull):
        pass  # Say nothing when sending to dev null
    else:
        if options.output_type.startswith('pdfa'):
            pdfa_info = file_claims_pdfa(options.output_file)
            if pdfa_info['pass']:
                msg = f"Output file is a {pdfa_info['conformance']} (as expected)"
                _log.info(msg)
            else:
                msg = f"Output file is okay but is not PDF/A (seems to be {pdfa_info['conformance']})"
                _log.warning(msg)
                return ExitCode.pdfa_conversion_failed
        if not qpdf.check(options.output_file, _log):
            _log.warning('Output file: The generated PDF is INVALID')
            return ExitCode.invalid_output_pdf

        report_output_file_size(options, _log, start_input_file, options.output_file)

    pdfinfo = context.get_pdfinfo()
    if options.verbose:
        from pprint import pformat

        _log.debug(pformat(pdfinfo))

    log_page_orientations(pdfinfo, _log)

    return ExitCode.ok
|
||||
@@ -15,15 +15,13 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import argparse
|
||||
from os import fspath
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from ocrmypdf.__main__ import parser
|
||||
from ocrmypdf.run import check_options
|
||||
from ocrmypdf._validation import check_options
|
||||
from ocrmypdf.exceptions import ExitCode
|
||||
from ocrmypdf.exec import unpaper
|
||||
|
||||
|
||||
Reference in New Issue
Block a user