Migrate tesseract-hocr code to tesseract module, because modularity

2026-05-18 19:47:48 -04:00 · 2015-12-16 17:36:11 -08:00
parent 79b3472b26
commit 10416f847f
2 changed files with 101 additions and 76 deletions
--- a/ocrmypdf/main.py
+++ b/ocrmypdf/main.py
@@ -15,6 +15,8 @@ import textwrap
 import PyPDF2 as pypdf
 from PIL import Image

+from functools import partial
+
 from subprocess import Popen, check_call, PIPE, CalledProcessError, \
    TimeoutExpired, check_output, STDOUT
 try:
@@ -545,60 +547,16 @@ def ocr_tesseract_hocr(
        pdfinfo,
        pdfinfo_lock):

-    pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
-
-    badxml = os.path.splitext(output_file)[0] + '.badxml'
-
-    args_tesseract = [
-        'tesseract',
-        '-l', '+'.join(options.language),
-        input_file,
-        badxml,
-        'hocr'
-    ] + options.tesseract_config
-    p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
-              universal_newlines=True)
-    try:
-        stdout, stderr = p.communicate(timeout=options.tesseract_timeout)
-    except TimeoutExpired:
-        p.kill()
-        stdout, stderr = p.communicate()
-        # Generate a HOCR file with no recognized text if tesseract times out
-        # Temporary workaround to hocrTransform not being able to function if
-        # it does not have a valid hOCR file.
-        with open(output_file, 'w', encoding="utf-8") as f:
-            f.write(tesseract.HOCR_TEMPLATE.format(
-                pageinfo['width_pixels'],
-                pageinfo['height_pixels']))
-    else:
-        if stdout:
-            log.info(stdout)
-        if stderr:
-            log.error(stderr)
-
-        if p.returncode != 0:
-            raise CalledProcessError(p.returncode, args_tesseract)
-
-        if os.path.exists(badxml + '.html'):
-            # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
-            shutil.move(badxml + '.html', badxml)
-        elif os.path.exists(badxml + '.hocr'):
-            # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
-            shutil.move(badxml + '.hocr', badxml)
-
-        # Tesseract 3.03 inserts source filename into hocr file without
-        # escaping it, creating invalid XML and breaking the parser.
-        # As a workaround, rewrite the hocr file, replacing the filename
-        # with a space.  Don't know if Tesseract 3.02 does the same.
-
-        regex_nested_single_quotes = re.compile(
-            r"""title='image "([^"]*)";""")
-        with open(badxml, mode='r', encoding='utf-8') as f_in, \
-                open(output_file, mode='w', encoding='utf-8') as f_out:
-            for line in f_in:
-                line = regex_nested_single_quotes.sub(
-                    r"""title='image " ";""", line)
-                f_out.write(line)
+    tesseract.generate_hocr(
+        input_file=input_file,
+        output_hocr=output_file,
+        language=options.language,
+        tessconfig=options.tesseract_config,
+        timeout=options.tesseract_timeout,
+        pageinfo_getter=partial(get_pageinfo, input_file, pdfinfo,
+                                pdfinfo_lock),
+        log=log
+        )


@active_if(options.pdf_renderer == 'hocr')
--- a/ocrmypdf/tesseract.py
+++ b/ocrmypdf/tesseract.py
@@ -1,13 +1,43 @@
 #!/usr/bin/env python3
 # © 2015 James R. Barlow: github.com/jbarlow83

-from subprocess import STDOUT, CalledProcessError, check_output
 import sys
 import os
 import re
+import shutil
 from functools import lru_cache
 from . import ExitCode

+from subprocess import Popen, PIPE, CalledProcessError, \
+    TimeoutExpired, check_output, STDOUT
+try:
+    from subprocess import DEVNULL
+except ImportError:
+    DEVNULL = open(os.devnull, 'wb')
+
+
+HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+  <title></title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+  <meta name='ocr-system' content='tesseract 3.02.02' />
+  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
+ </head>
+ <body>
+  <div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
+   <div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
+    <p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
+     <span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
+     </span>
+    </p>
+   </div>
+  </div>
+ </body>
+</html>'''
+

@lru_cache(maxsize=1)
 def version():
@@ -46,24 +76,61 @@ def languages():
    return set(lang.strip() for lang in langs.splitlines()[1:])


-HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
-    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
- <head>
-  <title></title>
-  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-  <meta name='ocr-system' content='tesseract 3.02.02' />
-  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
- </head>
- <body>
-  <div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
-   <div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
-    <p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
-     <span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
-     </span>
-    </p>
-   </div>
-  </div>
- </body>
-</html>'''
+def generate_hocr(input_file, output_hocr, language: list, tessconfig: list,
+                  timeout: float, pageinfo_getter, log):
+    """Run tesseract on input_file and write cleaned-up hOCR to output_hocr."""
+    badxml = os.path.splitext(output_hocr)[0] + '.badxml'
+    # badxml is passed as tesseract's output name; it may append a suffix.
+    args_tesseract = [
+        'tesseract',
+        '-l', '+'.join(language),
+        input_file,
+        badxml,
+        'hocr'
+    ] + tessconfig
+    p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
+              universal_newlines=True)
+    try:
+        stdout, stderr = p.communicate(timeout=timeout)
+    except TimeoutExpired:
+        p.kill()
+        stdout, stderr = p.communicate()  # reap child; output is not logged
+        # Tesseract timed out: write a hOCR file with no recognized text,
+        # sized from pageinfo_getter(), because hocrTransform cannot work
+        # without a valid hOCR file (temporary workaround).
+        with open(output_hocr, 'w', encoding="utf-8") as f:
+            pageinfo = pageinfo_getter()  # only looked up on timeout
+            f.write(HOCR_TEMPLATE.format(
+                pageinfo['width_pixels'],
+                pageinfo['height_pixels']))
+    else:
+        if stdout:
+            log.info(stdout)
+        if stderr:
+            log.error(stderr)
+
+        if p.returncode != 0:
+            raise CalledProcessError(p.returncode, args_tesseract)
+
+        if os.path.exists(badxml + '.html'):
+            # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
+            shutil.move(badxml + '.html', badxml)
+        elif os.path.exists(badxml + '.hocr'):
+            # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
+            shutil.move(badxml + '.hocr', badxml)
+
+        # Tesseract 3.03 inserts source filename into hocr file without
+        # escaping it, creating invalid XML and breaking the parser.
+        # As a workaround, rewrite the hocr file, replacing the filename
+        # with a space.  Don't know if Tesseract 3.02 does the same.
+
+        regex_nested_single_quotes = re.compile(
+            r"""title='image "([^"]*)";""")
+        with open(badxml, mode='r', encoding='utf-8') as f_in, \
+                open(output_hocr, mode='w', encoding='utf-8') as f_out:
+            for line in f_in:
+                line = regex_nested_single_quotes.sub(
+                    r"""title='image " ";""", line)
+                f_out.write(line)
+
+