Migrate tesseract-hocr code to tesseract module, because modularity

This commit is contained in:
James R. Barlow
2015-12-16 17:36:11 -08:00
parent 79b3472b26
commit 10416f847f
2 changed files with 101 additions and 76 deletions

View File

@@ -15,6 +15,8 @@ import textwrap
import PyPDF2 as pypdf
from PIL import Image
from functools import partial
from subprocess import Popen, check_call, PIPE, CalledProcessError, \
TimeoutExpired, check_output, STDOUT
try:
@@ -545,60 +547,16 @@ def ocr_tesseract_hocr(
pdfinfo,
pdfinfo_lock):
pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
badxml = os.path.splitext(output_file)[0] + '.badxml'
args_tesseract = [
'tesseract',
'-l', '+'.join(options.language),
input_file,
badxml,
'hocr'
] + options.tesseract_config
p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
universal_newlines=True)
try:
stdout, stderr = p.communicate(timeout=options.tesseract_timeout)
except TimeoutExpired:
p.kill()
stdout, stderr = p.communicate()
# Generate a HOCR file with no recognized text if tesseract times out
# Temporary workaround to hocrTransform not being able to function if
# it does not have a valid hOCR file.
with open(output_file, 'w', encoding="utf-8") as f:
f.write(tesseract.HOCR_TEMPLATE.format(
pageinfo['width_pixels'],
pageinfo['height_pixels']))
else:
if stdout:
log.info(stdout)
if stderr:
log.error(stderr)
if p.returncode != 0:
raise CalledProcessError(p.returncode, args_tesseract)
if os.path.exists(badxml + '.html'):
# Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
shutil.move(badxml + '.html', badxml)
elif os.path.exists(badxml + '.hocr'):
# Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
shutil.move(badxml + '.hocr', badxml)
# Tesseract 3.03 inserts source filename into hocr file without
# escaping it, creating invalid XML and breaking the parser.
# As a workaround, rewrite the hocr file, replacing the filename
# with a space. Don't know if Tesseract 3.02 does the same.
regex_nested_single_quotes = re.compile(
r"""title='image "([^"]*)";""")
with open(badxml, mode='r', encoding='utf-8') as f_in, \
open(output_file, mode='w', encoding='utf-8') as f_out:
for line in f_in:
line = regex_nested_single_quotes.sub(
r"""title='image " ";""", line)
f_out.write(line)
tesseract.generate_hocr(
input_file=input_file,
output_hocr=output_file,
language=options.language,
tessconfig=options.tesseract_config,
timeout=options.tesseract_timeout,
pageinfo_getter=partial(get_pageinfo, input_file, pdfinfo,
pdfinfo_lock),
log=log
)
@active_if(options.pdf_renderer == 'hocr')

View File

@@ -1,13 +1,43 @@
#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83
from subprocess import STDOUT, CalledProcessError, check_output
import sys
import os
import re
import shutil
from functools import lru_cache
from . import ExitCode
from subprocess import Popen, PIPE, CalledProcessError, \
TimeoutExpired, check_output, STDOUT
try:
from subprocess import DEVNULL
except ImportError:
DEVNULL = open(os.devnull, 'wb')
# Skeleton hOCR document: one page, one block, one paragraph, one line and a
# single word whose text is a lone space (i.e. no recognized text).
# ``{0}`` and ``{1}`` are ``str.format`` placeholders; generate_hocr() fills
# them with the page's 'width_pixels' and 'height_pixels' when it has to
# write a stand-in hOCR file after Tesseract times out.
HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.02.02' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
<p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
<span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
</span>
</p>
</div>
</div>
</body>
</html>'''
@lru_cache(maxsize=1)
def version():
@@ -46,24 +76,61 @@ def languages():
return set(lang.strip() for lang in langs.splitlines()[1:])
# Skeleton hOCR document: one page, one block, one paragraph, one line and a
# single word whose text is a lone space (i.e. no recognized text).
# ``{0}`` and ``{1}`` are ``str.format`` placeholders; generate_hocr() fills
# them with the page's 'width_pixels' and 'height_pixels' when it has to
# write a stand-in hOCR file after Tesseract times out.
HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.02.02' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
<p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
<span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
</span>
</p>
</div>
</div>
</body>
</html>'''
def generate_hocr(input_file, output_hocr, language: list, tessconfig: list,
                  timeout: float, pageinfo_getter, log):
    """Run Tesseract on *input_file* and write an hOCR file to *output_hocr*.

    Tesseract is asked to write to an intermediate ``.badxml`` file next to
    *output_hocr*; that file is then copied to *output_hocr* with the
    unescaped image filename blanked out of the ``title`` attribute.

    If Tesseract does not finish within *timeout* seconds it is killed and a
    placeholder hOCR file with no recognized text (built from
    ``HOCR_TEMPLATE``) is written instead.

    Args:
        input_file: Path of the image handed to Tesseract.
        output_hocr: Path of the hOCR file to produce.
        language: Language codes; joined with ``+`` for Tesseract's ``-l``.
        tessconfig: Extra arguments appended to the Tesseract command line.
        timeout: Seconds to wait for Tesseract before killing it.
        pageinfo_getter: Zero-argument callable returning a mapping with
            ``'width_pixels'`` and ``'height_pixels'``; only called when
            Tesseract times out.
        log: Object with ``info`` and ``error`` methods that receive
            Tesseract's stdout and stderr respectively.

    Raises:
        CalledProcessError: If Tesseract exits with a non-zero return code.
    """
    badxml = os.path.splitext(output_hocr)[0] + '.badxml'

    args_tesseract = [
        'tesseract',
        '-l', '+'.join(language),
        input_file,
        badxml,
        'hocr'
    ] + tessconfig
    p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)
    try:
        stdout, stderr = p.communicate(timeout=timeout)
    except TimeoutExpired:
        p.kill()
        # Reap the killed process and drain its pipes; the output is unused.
        p.communicate()
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        # Fetch the page info *before* opening the output so that a failure
        # in the getter does not leave an empty, invalid hOCR file behind.
        pageinfo = pageinfo_getter()
        with open(output_hocr, 'w', encoding="utf-8") as f:
            f.write(HOCR_TEMPLATE.format(
                pageinfo['width_pixels'],
                pageinfo['height_pixels']))
    else:
        if stdout:
            log.info(stdout)
        if stderr:
            log.error(stderr)

        if p.returncode != 0:
            raise CalledProcessError(p.returncode, args_tesseract)

        if os.path.exists(badxml + '.html'):
            # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
            shutil.move(badxml + '.html', badxml)
        elif os.path.exists(badxml + '.hocr'):
            # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
            shutil.move(badxml + '.hocr', badxml)

        # Tesseract 3.03 inserts source filename into hocr file without
        # escaping it, creating invalid XML and breaking the parser.
        # As a workaround, rewrite the hocr file, replacing the filename
        # with a space. Don't know if Tesseract 3.02 does the same.
        regex_nested_single_quotes = re.compile(
            r"""title='image "([^"]*)";""")
        with open(badxml, mode='r', encoding='utf-8') as f_in, \
                open(output_hocr, mode='w', encoding='utf-8') as f_out:
            for line in f_in:
                line = regex_nested_single_quotes.sub(
                    r"""title='image " ";""", line)
                f_out.write(line)