From 70aa644c108dd79374dff4dc37050dc5ceb01882 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Mon, 9 Jul 2018 13:56:23 -0700 Subject: [PATCH] Backport Python 3.7 fix for ruffus 2.7.0 from ocrmypdf v7.0.0 --- .gitignore | 1 + docs/release_notes.rst | 6 ++++ requirements.txt | 6 ++-- setup.py | 4 +-- src/ocrmypdf/__main__.py | 67 ++++++++++++++++++++-------------------- 5 files changed, 45 insertions(+), 39 deletions(-) diff --git a/.gitignore b/.gitignore index 3a636a86..f06aefdc 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.pyc *.sublime-* venv*/ +.venv/ pyvenv.cfg tasks.py .bash_history diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 082fdec6..c1575260 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -10,6 +10,12 @@ The OCRmyPDF package itself does not contain a public API, although it is fairly replace: `#$1 `_ +v6.2.2 +------ + +- Backport v7.0.0 fix for ruffus 2.7.0 + + v6.2.1 ------ diff --git a/requirements.txt b/requirements.txt index d7dec8b0..73998a86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ # requirements.txt can be used to replicate the developer's build environment # setup.py lists a separate set of requirements that are looser to simplify # installation -ruffus == 2.6.3 -Pillow == 5.1.0 +ruffus == 2.7.0 +Pillow == 5.2.0 reportlab == 3.4.0 PyPDF2 == 1.26.0 img2pdf == 0.2.4 cffi == 1.11.5 -PyMuPDF == 1.12.5 \ No newline at end of file +PyMuPDF == 1.12.5 diff --git a/setup.py b/setup.py index 5420aa6e..badc576e 100644 --- a/setup.py +++ b/setup.py @@ -243,12 +243,12 @@ setup( 'cffi >= 1.9.1', # must be a setup and install requirement 'defusedxml >= 0.5.0', # pure Python, so track HEAD closely 'img2pdf >= 0.2.4', # pure Python, so track HEAD closely - 'Pillow >= 4.0.0, != 5.1.0 ; sys_platform == "darwin"', + 'Pillow >= 4.0.0, != 5.1.0 ; sys_platform == "darwin"', # Pillow < 4 has BytesIO/TIFF bug w/img2pdf 0.2.3 # block 5.1.0, broken wheels 'PyPDF2 >= 1.26', # pure 
Python, so track HEAD closely 'reportlab >= 3.3.0', # oldest released version with sane image handling - 'ruffus == 2.6.3', # pinned - ocrmypdf implements a 2.6.3 workaround + 'ruffus >= 2.7.0', ], extras_require={ 'fitz': ['PyMuPDF >= 1.12.5'] # for table of contents bug diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py index aa4490c4..9290daf2 100755 --- a/src/ocrmypdf/__main__.py +++ b/src/ocrmypdf/__main__.py @@ -70,7 +70,7 @@ def complain(message): if 'IDE_PROJECT_ROOTS' in os.environ: os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH'] -# -------- +# -------- # Critical environment tests verify_python3_env() @@ -156,7 +156,7 @@ parser.add_argument( '--image-dpi', metavar='DPI', type=int, help="For input image instead of PDF, use this DPI instead of file's.") parser.add_argument( - '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'], + '--output-type', choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'], default='pdfa', help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for " "long term archiving (default, recommended) but may not suitable " @@ -574,7 +574,7 @@ def do_ruffus_exception(ruffus_five_tuple, options, log): if exc_name == 'builtins.SystemExit': match = re.search(r"\.(.+?)\)", exc_value) exit_code_name = match.groups()[0] - exit_code = getattr(ExitCode, exit_code_name, 'other_error') + exit_code = getattr(ExitCode, exit_code_name, 'other_error') elif exc_name == 'ruffus.ruffus_exceptions.MissingInputFileError': log.error(cleanup_ruffus_error_message(exc_value)) exit_code = ExitCode.input_file @@ -598,7 +598,7 @@ def do_ruffus_exception(ruffus_five_tuple, options, log): (exc_name == 'ocrmypdf.exceptions.EncryptedPdfError'): log.error(textwrap.dedent("""\ Input PDF is encrypted. The encryption must be removed to - perform OCR. + perform OCR. 
For information about this PDF's security use qpdf --show-encryption infilename @@ -607,7 +607,7 @@ def do_ruffus_exception(ruffus_five_tuple, options, log): qpdf --decrypt [--password=[password]] infilename """)) - exit_code = ExitCode.encrypted_pdf + exit_code = ExitCode.encrypted_pdf elif exc_name == 'ocrmypdf.exceptions.PdfMergeFailedError': log.error(textwrap.dedent("""\ Failed to merge PDF image layer with OCR layer @@ -638,33 +638,31 @@ def do_ruffus_exception(ruffus_five_tuple, options, log): return ExitCode.other_error -def traverse_ruffus_exception(e_args, options, log): - """Walk through a RethrownJobError and find the first exception. +def traverse_ruffus_exception(exceptions, options, log): + """Traverse a RethrownJobError and output the exceptions - Ruffus flattens exception to 5 element tuples. Because of a bug - in <= 2.6.3 it may present either the single: - (task, job, exc, value, stack) - or something like: - [[(task, job, exc, value, stack)]] - - Generally cross-process exception marshalling doesn't work well - and ruffus doesn't support because BaseException has its own - implementation of __reduce__ that attempts to reconstruct the - exception based on e.__init__(e.args). - - Attempting to log the exception directly marshalls it to the logger - which is probably in another process, so it's better to log only - data from the exception at this point. + Ruffus presents exceptions as 5 element tuples. The RethrownJobError + has a list of exceptions like + e.job_exceptions = [(5-tuple), (5-tuple), ...] + + ruffus < 2.7.0 had a bug with exception marshalling that would give + different output whether the main or child process raised the exception. + We no longer support this. + + Attempting to log the exception itself will re-marshall it to the logger + which is normally running in another process. It's better to avoid re- + marshalling. 
The exit code will be based on this, even if multiple exceptions occurred at the same time.""" - if isinstance(e_args, Sequence) and isinstance(e_args[0], str) and \ - len(e_args) == 5: - return do_ruffus_exception(e_args, options, log) - elif is_iterable_notstr(e_args): - for exc in e_args: - return traverse_ruffus_exception(exc, options, log) + exit_codes = [] + for exc in exceptions: + exit_code = do_ruffus_exception(exc, options, log) + exit_codes.append(exit_code) + + return exit_codes[0] # Multiple codes are rare so take the first one + def check_closed_streams(options): @@ -749,7 +747,7 @@ def check_environ(options, _log): for k in old_envvars: if k in os.environ: _log.warning(textwrap.dedent("""\ - OCRmyPDF no longer uses the environment variable {}. + OCRmyPDF no longer uses the environment variable {}. Change PATH to select alternate programs.""".format(k))) @@ -792,14 +790,14 @@ def report_output_file_size(options, _log, input_file, output_file): ratio = output_size / input_size if ratio < 1.35 or input_size < 25000: return # Seems fine - + reasons = [] if not fitz: reasons.append("The optional dependency PyMuPDF is not installed.") image_preproc = { - 'deskew', - 'clean_final', - 'remove_background', + 'deskew', + 'clean_final', + 'remove_background', 'oversample', 'force_ocr' } @@ -886,7 +884,8 @@ def run_pipeline(): except ruffus_exceptions.RethrownJobError as e: if options.verbose: _log.debug(str(e)) # stringify exception so logger doesn't have to - exitcode = traverse_ruffus_exception(e.args, options, _log) + exceptions = e.job_exceptions + exitcode = traverse_ruffus_exception(exceptions, options, _log) if exitcode is None: _log.error("Unexpected ruffus exception: " + str(e)) _log.error(repr(e)) @@ -919,7 +918,7 @@ def run_pipeline(): _log.warning('Output file: The generated PDF is INVALID') return ExitCode.invalid_output_pdf - report_output_file_size(options, _log, start_input_file, + report_output_file_size(options, _log, start_input_file, 
options.output_file) pdfinfo = context.get_pdfinfo()