From 10416f847f20968af2e542de6512da2fa3baab5f Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Wed, 16 Dec 2015 17:36:11 -0800 Subject: [PATCH] Migrate tesseract-hocr code to tesseract module, because modularity --- ocrmypdf/main.py | 66 +++++-------------------- ocrmypdf/tesseract.py | 111 +++++++++++++++++++++++++++++++++--------- 2 files changed, 101 insertions(+), 76 deletions(-) diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py index 7bc4d6ff..6ce98907 100755 --- a/ocrmypdf/main.py +++ b/ocrmypdf/main.py @@ -15,6 +15,8 @@ import textwrap import PyPDF2 as pypdf from PIL import Image +from functools import partial + from subprocess import Popen, check_call, PIPE, CalledProcessError, \ TimeoutExpired, check_output, STDOUT try: @@ -545,60 +547,16 @@ def ocr_tesseract_hocr( pdfinfo, pdfinfo_lock): - pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock) - - badxml = os.path.splitext(output_file)[0] + '.badxml' - - args_tesseract = [ - 'tesseract', - '-l', '+'.join(options.language), - input_file, - badxml, - 'hocr' - ] + options.tesseract_config - p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE, - universal_newlines=True) - try: - stdout, stderr = p.communicate(timeout=options.tesseract_timeout) - except TimeoutExpired: - p.kill() - stdout, stderr = p.communicate() - # Generate a HOCR file with no recognized text if tesseract times out - # Temporary workaround to hocrTransform not being able to function if - # it does not have a valid hOCR file. 
- with open(output_file, 'w', encoding="utf-8") as f: - f.write(tesseract.HOCR_TEMPLATE.format( - pageinfo['width_pixels'], - pageinfo['height_pixels'])) - else: - if stdout: - log.info(stdout) - if stderr: - log.error(stderr) - - if p.returncode != 0: - raise CalledProcessError(p.returncode, args_tesseract) - - if os.path.exists(badxml + '.html'): - # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html) - shutil.move(badxml + '.html', badxml) - elif os.path.exists(badxml + '.hocr'): - # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr) - shutil.move(badxml + '.hocr', badxml) - - # Tesseract 3.03 inserts source filename into hocr file without - # escaping it, creating invalid XML and breaking the parser. - # As a workaround, rewrite the hocr file, replacing the filename - # with a space. Don't know if Tesseract 3.02 does the same. - - regex_nested_single_quotes = re.compile( - r"""title='image "([^"]*)";""") - with open(badxml, mode='r', encoding='utf-8') as f_in, \ - open(output_file, mode='w', encoding='utf-8') as f_out: - for line in f_in: - line = regex_nested_single_quotes.sub( - r"""title='image " ";""", line) - f_out.write(line) + tesseract.generate_hocr( + input_file=input_file, + output_hocr=output_file, + language=options.language, + tessconfig=options.tesseract_config, + timeout=options.tesseract_timeout, + pageinfo_getter=partial(get_pageinfo, input_file, pdfinfo, + pdfinfo_lock), + log=log + ) @active_if(options.pdf_renderer == 'hocr') diff --git a/ocrmypdf/tesseract.py b/ocrmypdf/tesseract.py index dfeac879..1505476a 100644 --- a/ocrmypdf/tesseract.py +++ b/ocrmypdf/tesseract.py @@ -1,13 +1,43 @@ #!/usr/bin/env python3 # © 2015 James R. Barlow: github.com/jbarlow83 -from subprocess import STDOUT, CalledProcessError, check_output import sys import os import re +import shutil from functools import lru_cache from . 
import ExitCode +from subprocess import Popen, PIPE, CalledProcessError, \ + TimeoutExpired, check_output, STDOUT +try: + from subprocess import DEVNULL +except ImportError: + DEVNULL = open(os.devnull, 'wb') + + +HOCR_TEMPLATE = ''' + + + + + + + + + +
+
+

+ + +

+
+
+ +''' + @lru_cache(maxsize=1) def version(): @@ -46,24 +76,61 @@ def languages(): return set(lang.strip() for lang in langs.splitlines()[1:]) -HOCR_TEMPLATE = ''' - - - - - - - - - -
-
-

- - -

-
-
-
-'''
+def generate_hocr(input_file, output_hocr, language: list, tessconfig: list,
+                  timeout: float, pageinfo_getter, log):
+
+    badxml = os.path.splitext(output_hocr)[0] + '.badxml'
+
+    args_tesseract = [
+        'tesseract',
+        '-l', '+'.join(language),
+        input_file,
+        badxml,
+        'hocr'
+    ] + tessconfig
+    p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
+              universal_newlines=True)
+    try:
+        stdout, stderr = p.communicate(timeout=timeout)
+    except TimeoutExpired:
+        p.kill()
+        stdout, stderr = p.communicate()
+        # Generate a HOCR file with no recognized text if tesseract times out
+        # Temporary workaround to hocrTransform not being able to function if
+        # it does not have a valid hOCR file.
+        with open(output_hocr, 'w', encoding="utf-8") as f:
+            pageinfo = pageinfo_getter()
+            f.write(HOCR_TEMPLATE.format(
+                pageinfo['width_pixels'],
+                pageinfo['height_pixels']))
+    else:
+        if stdout:
+            log.info(stdout)
+        if stderr:
+            log.error(stderr)
+
+        if p.returncode != 0:
+            raise CalledProcessError(p.returncode, args_tesseract)
+
+        if os.path.exists(badxml + '.html'):
+            # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
+            shutil.move(badxml + '.html', badxml)
+        elif os.path.exists(badxml + '.hocr'):
+            # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
+            shutil.move(badxml + '.hocr', badxml)
+
+        # Tesseract 3.03 inserts source filename into hocr file without
+        # escaping it, creating invalid XML and breaking the parser.
+        # As a workaround, rewrite the hocr file, replacing the filename
+        # with a space. Don't know if Tesseract 3.02 does the same.
+
+        regex_nested_single_quotes = re.compile(
+            r"""title='image "([^"]*)";""")
+        with open(badxml, mode='r', encoding='utf-8') as f_in, \
+                open(output_hocr, mode='w', encoding='utf-8') as f_out:
+            for line in f_in:
+                line = regex_nested_single_quotes.sub(
+                    r"""title='image " ";""", line)
+                f_out.write(line)
+
+