Fix test suite breakage after sidecar feature added

Forgot to update tesseract spoofers to account for change in tesseract parameters. Also the change to outputting multiple files in the collate steps affected how ruffus passes information into downstream consumers of those files.
2026-05-18 19:47:48 -04:00 · 2017-05-11 00:17:24 -07:00
parent 16b6442b23
commit c8a4cbcf17
4 changed files with 55 additions and 30 deletions
--- a/ocrmypdf/exec/tesseract.py
+++ b/ocrmypdf/exec/tesseract.py
@@ -220,18 +220,20 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,

    output_hocr = next(o for o in output_files if o.endswith('.hocr'))
    output_sidecar = next(o for o in output_files if o.endswith('.txt'))
-    badxml = os.path.splitext(output_hocr)[0] + '.badxml'
+    prefix = os.path.splitext(output_hocr)[0]

    args_tesseract = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend([psm(), str(pagesegmode)])

+    # Reminder: test suite tesseract spoofers will break after any changes
+    # to the number or order of parameters here
    args_tesseract.extend([
        input_file,
-        badxml,
-        'hocr',
-        'txt'
+        prefix,
+        'txt',
+        'hocr'
    ] + tessconfig)
    try:
        log.debug(args_tesseract)
@@ -256,15 +258,17 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,
    else:
        tesseract_log_output(log, stdout, input_file)

-        if os.path.exists(badxml + '.html'):
-            # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
-            shutil.move(badxml + '.html', badxml)
-        elif os.path.exists(badxml + '.hocr'):
-            # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
-            shutil.move(badxml + '.hocr', badxml)
+        # Tesseract 3.02 appends suffix ".html" instead of ".hocr". Either way,
+        # move the output to .tmp; it is rewritten to the final .hocr below
+        if os.path.exists(prefix + '.html'):
+            shutil.move(prefix + '.html', prefix + '.tmp')
+        elif os.path.exists(prefix + '.hocr'):
+            shutil.move(prefix + '.hocr', prefix + '.tmp')

-        if os.path.exists(badxml + '.txt'):
-            shutil.move(badxml + '.txt', output_sidecar)
+        # The sidecar text file will get the suffix .txt; rename it to
+        # whatever caller wants it named
+        if os.path.exists(prefix + '.txt'):
+            shutil.move(prefix + '.txt', output_sidecar)

        # Tesseract 3.03 inserts source filename into hocr file without
        # escaping it, creating invalid XML and breaking the parser.
@@ -273,7 +277,7 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,

        regex_nested_single_quotes = re.compile(
            r"""title='image "([^"]*)";""")
-        with open(badxml, mode='r', encoding='utf-8') as f_in, \
+        with open(prefix + '.tmp', mode='r', encoding='utf-8') as f_in, \
                open(output_hocr, mode='w', encoding='utf-8') as f_out:
            for line in f_in:
                line = regex_nested_single_quotes.sub(
@@ -329,10 +333,13 @@ def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text,

    prefix = os.path.splitext(output_pdf)[0]  # Tesseract appends suffixes

+    # Reminder: test suite tesseract spoofers will break after any changes
+    # to the number or order of parameters here
    args_tesseract.extend([
        input_image,
        prefix,
-        'pdf', 'txt'
+        'txt',
+        'pdf',
    ] + tessconfig)

    try:
--- a/ocrmypdf/pipeline.py
+++ b/ocrmypdf/pipeline.py
@@ -593,14 +593,22 @@ def render_hocr_page(
                         showBoundingboxes=False, invisibleText=True)


+def flatten_groups(groups):
+    for obj in groups:
+        if is_iterable_notstr(obj):
+            yield from obj
+        else:
+            yield obj
+
+
 def render_hocr_debug_page(
        infiles,
        output_file,
        log,
        context):
    options = context.get_options()
-    hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
-    image = next(ii for ii in infiles if ii.endswith('.image'))
+    hocr = next(ii for ii in flatten_groups(infiles) if ii.endswith('.hocr'))
+    image = next(ii for ii in flatten_groups(infiles) if ii.endswith('.image'))

    pageinfo = get_pageinfo(image, context)
    dpi = get_page_square_dpi(pageinfo, options)
@@ -610,14 +618,6 @@ def render_hocr_debug_page(
                         showBoundingboxes=True, invisibleText=False)


-def flatten_groups(groups):
-    for obj in groups:
-        if is_iterable_notstr(obj):
-            yield from obj
-        else:
-            yield obj
-
-
 def combine_layers(
        infiles,
        output_file,
--- a/tests/spoof/tesseract_cache.py
+++ b/tests/spoof/tesseract_cache.py
@@ -36,6 +36,10 @@ def real_tesseract():

 def main():
    operation = sys.argv[-1]
+    sidecar = False
+    if sys.argv[-2] == 'txt':
+        sidecar = True
+
    # For anything unexpected operation, defer to real tesseract binary
    # Currently this includes all use of "--tesseract-config"
    if operation != 'hocr' and operation != 'pdf' and operation != 'stdout':
@@ -92,16 +96,22 @@ def main():
        return

    if operation == 'stdout':
+        # tesseract [--options] ... input stdout
        input_file = sys.argv[-2]
        output_file = 'stdout'
+        sidecar_file = ''
    else:
-        input_file = sys.argv[-3]
-        output_file = sys.argv[-2]
+        # tesseract [--options] ... input output txt hocr|pdf
+        input_file = sys.argv[-4]
+        output_file = sys.argv[-3]
+        sidecar_file = sys.argv[-3]

    if operation == 'hocr':
        output_file += '.hocr'
+        sidecar_file += '.txt'
    elif operation == 'pdf':
        output_file += '.pdf'
+        sidecar_file += '.txt'

    with open(input_file, 'rb') as f:
        m.update(f.read())
@@ -112,6 +122,8 @@ def main():
        print("Tesseract cache hit", file=sys.stderr)
        if operation != 'stdout':
            shutil.copy(cache_name, output_file)
+            if sidecar:
+                shutil.copy(cache_name + '.sidecar', sidecar_file)

        # Replicate output
        with open(cache_name + '.stdout', 'rb') as f:
@@ -149,6 +161,8 @@ def main():
            shutil.copy(output_file, cache_name)
        else:
            print("Could not find output file", file=sys.stderr)
+        if sidecar and os.path.exists(sidecar_file):
+            shutil.copy(sidecar_file, cache_name + '.sidecar')
    else:
        open(cache_name, 'w').close()

--- a/tests/spoof/tesseract_noop.py
+++ b/tests/spoof/tesseract_noop.py
@@ -53,18 +53,22 @@ def main():
        print('List of available languages (1):\neng', file=sys.stderr)
        sys.exit(0)
    elif sys.argv[-1] == 'hocr':
-        inputf = sys.argv[-3]
-        output = sys.argv[-2]
+        inputf = sys.argv[-4]
+        output = sys.argv[-3]
        with Image.open(inputf) as im, \
                open(output + '.hocr', 'w', encoding='utf-8') as f:
            w, h = im.size
            f.write(HOCR_TEMPLATE.format(str(w), str(h)))
+        with open(output + '.txt', 'w') as f:
+            f.write('')
    elif sys.argv[-1] == 'pdf':
-        inputf = sys.argv[-3]
-        output = sys.argv[-2]
+        inputf = sys.argv[-4]
+        output = sys.argv[-3]
        pdf_bytes = img2pdf.convert([inputf], dpi=300)
        with open(output + '.pdf', 'wb') as f:
            f.write(pdf_bytes)
+        with open(output + '.txt', 'w') as f:
+            f.write('')
    elif sys.argv[-1] == 'stdout':
        inputf = sys.argv[-2]
        print("""Orientation: 0