From c8a4cbcf17389a0a10e261c6d9e66b8dc3aeeb48 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 11 May 2017 00:17:24 -0700 Subject: [PATCH] Fix test suite breakage after sidecar feature added Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files. --- ocrmypdf/exec/tesseract.py | 35 ++++++++++++++++++++-------------- ocrmypdf/pipeline.py | 20 +++++++++---------- tests/spoof/tesseract_cache.py | 18 +++++++++++++++-- tests/spoof/tesseract_noop.py | 12 ++++++++---- 4 files changed, 55 insertions(+), 30 deletions(-) diff --git a/ocrmypdf/exec/tesseract.py b/ocrmypdf/exec/tesseract.py index c2835a68..34756faf 100644 --- a/ocrmypdf/exec/tesseract.py +++ b/ocrmypdf/exec/tesseract.py @@ -220,18 +220,20 @@ def generate_hocr(input_file, output_files, language: list, engine_mode, output_hocr = next(o for o in output_files if o.endswith('.hocr')) output_sidecar = next(o for o in output_files if o.endswith('.txt')) - badxml = os.path.splitext(output_hocr)[0] + '.badxml' + prefix = os.path.splitext(output_hocr)[0] args_tesseract = tess_base_args(language, engine_mode) if pagesegmode is not None: args_tesseract.extend([psm(), str(pagesegmode)]) + # Reminder: test suite tesseract spoofers will break after any changes + # to the number or order of parameters here args_tesseract.extend([ input_file, - badxml, - 'hocr', - 'txt' + prefix, + 'txt', + 'hocr' ] + tessconfig) try: log.debug(args_tesseract) @@ -256,15 +258,17 @@ def generate_hocr(input_file, output_files, language: list, engine_mode, else: tesseract_log_output(log, stdout, input_file) - if os.path.exists(badxml + '.html'): - # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html) - shutil.move(badxml + '.html', badxml) - elif os.path.exists(badxml + '.hocr'): - # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr) - 
shutil.move(badxml + '.hocr', badxml) + # Tesseract 3.02 appends suffix ".html" instead of ".hocr". For + # consistency rename its output to .hocr + if os.path.exists(prefix + '.html'): + shutil.move(prefix + '.html', prefix + '.tmp') + elif os.path.exists(prefix + '.hocr'): + shutil.move(prefix + '.hocr', prefix + '.tmp') - if os.path.exists(badxml + '.txt'): - shutil.move(badxml + '.txt', output_sidecar) + # The sidecar text file will get the suffix .txt; rename it to + # whatever caller wants it named + if os.path.exists(prefix + '.txt'): + shutil.move(prefix + '.txt', output_sidecar) # Tesseract 3.03 inserts source filename into hocr file without # escaping it, creating invalid XML and breaking the parser. @@ -273,7 +277,7 @@ def generate_hocr(input_file, output_files, language: list, engine_mode, regex_nested_single_quotes = re.compile( r"""title='image "([^"]*)";""") - with open(badxml, mode='r', encoding='utf-8') as f_in, \ + with open(prefix + '.tmp', mode='r', encoding='utf-8') as f_in, \ open(output_hocr, mode='w', encoding='utf-8') as f_out: for line in f_in: line = regex_nested_single_quotes.sub( @@ -329,10 +333,13 @@ def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text, prefix = os.path.splitext(output_pdf)[0] # Tesseract appends suffixes + # Reminder: test suite tesseract spoofers will break after any changes + # to the number or order of parameters here args_tesseract.extend([ input_image, prefix, - 'pdf', 'txt' + 'txt', + 'pdf', ] + tessconfig) try: diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py index 9fd314a0..615c5960 100644 --- a/ocrmypdf/pipeline.py +++ b/ocrmypdf/pipeline.py @@ -593,14 +593,22 @@ def render_hocr_page( showBoundingboxes=False, invisibleText=True) +def flatten_groups(groups): + for obj in groups: + if is_iterable_notstr(obj): + yield from obj + else: + yield obj + + def render_hocr_debug_page( infiles, output_file, log, context): options = context.get_options() - hocr = next(ii for ii in infiles if 
ii.endswith('.hocr')) - image = next(ii for ii in infiles if ii.endswith('.image')) + hocr = next(ii for ii in flatten_groups(infiles) if ii.endswith('.hocr')) + image = next(ii for ii in flatten_groups(infiles) if ii.endswith('.image')) pageinfo = get_pageinfo(image, context) dpi = get_page_square_dpi(pageinfo, options) @@ -610,14 +618,6 @@ def render_hocr_debug_page( showBoundingboxes=True, invisibleText=False) -def flatten_groups(groups): - for obj in groups: - if is_iterable_notstr(obj): - yield from obj - else: - yield obj - - def combine_layers( infiles, output_file, diff --git a/tests/spoof/tesseract_cache.py b/tests/spoof/tesseract_cache.py index 86b14a28..6f19701a 100755 --- a/tests/spoof/tesseract_cache.py +++ b/tests/spoof/tesseract_cache.py @@ -36,6 +36,10 @@ def real_tesseract(): def main(): operation = sys.argv[-1] + sidecar = False + if sys.argv[-2] == 'txt': + sidecar = True + # For anything unexpected operation, defer to real tesseract binary # Currently this includes all use of "--tesseract-config" if operation != 'hocr' and operation != 'pdf' and operation != 'stdout': @@ -92,16 +96,22 @@ def main(): return if operation == 'stdout': + # tesseract [--options] ... input stdout input_file = sys.argv[-2] output_file = 'stdout' + sidecar_file = '' else: - input_file = sys.argv[-3] - output_file = sys.argv[-2] + # tesseract [--options] ... 
input output txt hocr|pdf + input_file = sys.argv[-4] + output_file = sys.argv[-3] + sidecar_file = sys.argv[-3] if operation == 'hocr': output_file += '.hocr' + sidecar_file += '.txt' elif operation == 'pdf': output_file += '.pdf' + sidecar_file += '.txt' with open(input_file, 'rb') as f: m.update(f.read()) @@ -112,6 +122,8 @@ def main(): print("Tesseract cache hit", file=sys.stderr) if operation != 'stdout': shutil.copy(cache_name, output_file) + if sidecar: + shutil.copy(cache_name + '.sidecar', sidecar_file) # Replicate output with open(cache_name + '.stdout', 'rb') as f: @@ -149,6 +161,8 @@ def main(): shutil.copy(output_file, cache_name) else: print("Could not find output file", file=sys.stderr) + if sidecar and os.path.exists(sidecar_file): + shutil.copy(sidecar_file, cache_name + '.sidecar') else: open(cache_name, 'w').close() diff --git a/tests/spoof/tesseract_noop.py b/tests/spoof/tesseract_noop.py index 8441046d..92f03489 100755 --- a/tests/spoof/tesseract_noop.py +++ b/tests/spoof/tesseract_noop.py @@ -53,18 +53,22 @@ def main(): print('List of available languages (1):\neng', file=sys.stderr) sys.exit(0) elif sys.argv[-1] == 'hocr': - inputf = sys.argv[-3] - output = sys.argv[-2] + inputf = sys.argv[-4] + output = sys.argv[-3] with Image.open(inputf) as im, \ open(output + '.hocr', 'w', encoding='utf-8') as f: w, h = im.size f.write(HOCR_TEMPLATE.format(str(w), str(h))) + with open(output + '.txt', 'w') as f: + f.write('') elif sys.argv[-1] == 'pdf': - inputf = sys.argv[-3] - output = sys.argv[-2] + inputf = sys.argv[-4] + output = sys.argv[-3] pdf_bytes = img2pdf.convert([inputf], dpi=300) with open(output + '.pdf', 'wb') as f: f.write(pdf_bytes) + with open(output + '.txt', 'w') as f: + f.write('') elif sys.argv[-1] == 'stdout': inputf = sys.argv[-2] print("""Orientation: 0