mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-18 19:47:48 -04:00
Migrate tesseract-hocr code to tesseract module, because modularity
This commit is contained in:
@@ -15,6 +15,8 @@ import textwrap
|
||||
import PyPDF2 as pypdf
|
||||
from PIL import Image
|
||||
|
||||
from functools import partial
|
||||
|
||||
from subprocess import Popen, check_call, PIPE, CalledProcessError, \
|
||||
TimeoutExpired, check_output, STDOUT
|
||||
try:
|
||||
@@ -545,60 +547,16 @@ def ocr_tesseract_hocr(
|
||||
pdfinfo,
|
||||
pdfinfo_lock):
|
||||
|
||||
pageinfo = get_pageinfo(input_file, pdfinfo, pdfinfo_lock)
|
||||
|
||||
badxml = os.path.splitext(output_file)[0] + '.badxml'
|
||||
|
||||
args_tesseract = [
|
||||
'tesseract',
|
||||
'-l', '+'.join(options.language),
|
||||
input_file,
|
||||
badxml,
|
||||
'hocr'
|
||||
] + options.tesseract_config
|
||||
p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
|
||||
universal_newlines=True)
|
||||
try:
|
||||
stdout, stderr = p.communicate(timeout=options.tesseract_timeout)
|
||||
except TimeoutExpired:
|
||||
p.kill()
|
||||
stdout, stderr = p.communicate()
|
||||
# Generate a HOCR file with no recognized text if tesseract times out
|
||||
# Temporary workaround to hocrTransform not being able to function if
|
||||
# it does not have a valid hOCR file.
|
||||
with open(output_file, 'w', encoding="utf-8") as f:
|
||||
f.write(tesseract.HOCR_TEMPLATE.format(
|
||||
pageinfo['width_pixels'],
|
||||
pageinfo['height_pixels']))
|
||||
else:
|
||||
if stdout:
|
||||
log.info(stdout)
|
||||
if stderr:
|
||||
log.error(stderr)
|
||||
|
||||
if p.returncode != 0:
|
||||
raise CalledProcessError(p.returncode, args_tesseract)
|
||||
|
||||
if os.path.exists(badxml + '.html'):
|
||||
# Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
|
||||
shutil.move(badxml + '.html', badxml)
|
||||
elif os.path.exists(badxml + '.hocr'):
|
||||
# Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
|
||||
shutil.move(badxml + '.hocr', badxml)
|
||||
|
||||
# Tesseract 3.03 inserts source filename into hocr file without
|
||||
# escaping it, creating invalid XML and breaking the parser.
|
||||
# As a workaround, rewrite the hocr file, replacing the filename
|
||||
# with a space. Don't know if Tesseract 3.02 does the same.
|
||||
|
||||
regex_nested_single_quotes = re.compile(
|
||||
r"""title='image "([^"]*)";""")
|
||||
with open(badxml, mode='r', encoding='utf-8') as f_in, \
|
||||
open(output_file, mode='w', encoding='utf-8') as f_out:
|
||||
for line in f_in:
|
||||
line = regex_nested_single_quotes.sub(
|
||||
r"""title='image " ";""", line)
|
||||
f_out.write(line)
|
||||
tesseract.generate_hocr(
|
||||
input_file=input_file,
|
||||
output_hocr=output_file,
|
||||
language=options.language,
|
||||
tessconfig=options.tesseract_config,
|
||||
timeout=options.tesseract_timeout,
|
||||
pageinfo_getter=partial(get_pageinfo, input_file, pdfinfo,
|
||||
pdfinfo_lock),
|
||||
log=log
|
||||
)
|
||||
|
||||
|
||||
@active_if(options.pdf_renderer == 'hocr')
|
||||
|
||||
@@ -1,13 +1,43 @@
|
||||
#!/usr/bin/env python3
|
||||
# © 2015 James R. Barlow: github.com/jbarlow83
|
||||
|
||||
from subprocess import STDOUT, CalledProcessError, check_output
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from functools import lru_cache
|
||||
from . import ExitCode
|
||||
|
||||
from subprocess import Popen, PIPE, CalledProcessError, \
|
||||
TimeoutExpired, check_output, STDOUT
|
||||
try:
|
||||
from subprocess import DEVNULL
|
||||
except ImportError:
|
||||
DEVNULL = open(os.devnull, 'wb')
|
||||
|
||||
|
||||
HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||
<head>
|
||||
<title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||
<meta name='ocr-system' content='tesseract 3.02.02' />
|
||||
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
||||
</head>
|
||||
<body>
|
||||
<div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
|
||||
<div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
|
||||
<p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
|
||||
<span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
|
||||
</span>
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>'''
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def version():
|
||||
@@ -46,24 +76,61 @@ def languages():
|
||||
return set(lang.strip() for lang in langs.splitlines()[1:])
|
||||
|
||||
|
||||
HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||
<head>
|
||||
<title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||
<meta name='ocr-system' content='tesseract 3.02.02' />
|
||||
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
||||
</head>
|
||||
<body>
|
||||
<div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
|
||||
<div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
|
||||
<p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
|
||||
<span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}"><span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
|
||||
</span>
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>'''
|
||||
# Matches the page-level hOCR title attribute that embeds the source image
# filename, e.g.  title='image "some file.tif";
_HOCR_IMAGE_TITLE = re.compile(r"""title='image "([^"]*)";""")


def _write_blank_hocr(output_hocr, pageinfo):
    """Write an hOCR file with no recognized text, sized to the page.

    The template is rendered before the output file is opened so that a
    missing ``pageinfo`` key cannot leave a truncated file behind.
    """
    blank = HOCR_TEMPLATE.format(
        pageinfo['width_pixels'],
        pageinfo['height_pixels'])
    with open(output_hocr, 'w', encoding="utf-8") as f:
        f.write(blank)


def _copy_hocr_without_image_name(badxml, output_hocr):
    """Copy *badxml* to *output_hocr*, blanking the embedded image filename."""
    with open(badxml, mode='r', encoding='utf-8') as f_in, \
            open(output_hocr, mode='w', encoding='utf-8') as f_out:
        for line in f_in:
            f_out.write(
                _HOCR_IMAGE_TITLE.sub(r"""title='image " ";""", line))


def generate_hocr(input_file, output_hocr, language: list, tessconfig: list,
                  timeout: float, pageinfo_getter, log):
    """Run Tesseract on *input_file* and write the resulting hOCR file.

    Tesseract writes to an intermediate ``.badxml`` file next to
    *output_hocr*; that file is then copied to *output_hocr* with the
    unescaped source filename removed.  If Tesseract does not finish within
    *timeout* seconds it is killed and a blank hOCR page (no recognized
    text) is written instead.

    Args:
        input_file: path of the image handed to Tesseract.
        output_hocr: path of the hOCR file to create.
        language: language codes; joined with ``+`` for Tesseract's ``-l``.
        tessconfig: extra arguments appended to the Tesseract command line
            (``None`` is treated as no extra arguments).
        timeout: seconds to wait for Tesseract before giving up.
        pageinfo_getter: zero-argument callable returning a mapping with
            ``'width_pixels'`` and ``'height_pixels'``; only called when
            Tesseract times out.
        log: logger-like object providing ``info`` and ``error``.

    Raises:
        CalledProcessError: if Tesseract exits with a non-zero status.
    """
    badxml = os.path.splitext(output_hocr)[0] + '.badxml'

    args_tesseract = [
        'tesseract',
        '-l', '+'.join(language),
        input_file,
        badxml,
        'hocr'
    ] + list(tessconfig or [])
    p = Popen(args_tesseract, close_fds=True, stdout=PIPE, stderr=PIPE,
              universal_newlines=True)
    try:
        stdout, stderr = p.communicate(timeout=timeout)
    except TimeoutExpired:
        # communicate() does not kill the child on timeout; do so and then
        # drain the pipes so the process is reaped.
        p.kill()
        p.communicate()
        log.info(
            "tesseract did not finish within {0} seconds; "
            "writing blank hOCR for {1}".format(timeout, input_file))
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        # The page info is fetched before the output is opened so a failing
        # getter cannot leave an empty, invalid hOCR file behind.
        _write_blank_hocr(output_hocr, pageinfo_getter())
        return

    if stdout:
        log.info(stdout)
    if stderr:
        log.error(stderr)

    if p.returncode != 0:
        raise CalledProcessError(p.returncode, args_tesseract)

    if os.path.exists(badxml + '.html'):
        # Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
        shutil.move(badxml + '.html', badxml)
    elif os.path.exists(badxml + '.hocr'):
        # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
        shutil.move(badxml + '.hocr', badxml)

    # Tesseract 3.03 inserts source filename into hocr file without
    # escaping it, creating invalid XML and breaking the parser.
    # As a workaround, rewrite the hocr file, replacing the filename
    # with a space. Don't know if Tesseract 3.02 does the same.
    _copy_hocr_without_image_name(badxml, output_hocr)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user