Fix test suite breakage after sidecar feature added

The tesseract spoofers were not updated to account for the change in
tesseract parameters.  Also, the change to outputting multiple files in
the collate steps affected how ruffus passes information to downstream
consumers of those files.
This commit is contained in:
James R. Barlow
2017-05-11 00:17:24 -07:00
parent 16b6442b23
commit c8a4cbcf17
4 changed files with 55 additions and 30 deletions

View File

@@ -220,18 +220,20 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,
output_hocr = next(o for o in output_files if o.endswith('.hocr'))
output_sidecar = next(o for o in output_files if o.endswith('.txt'))
badxml = os.path.splitext(output_hocr)[0] + '.badxml'
prefix = os.path.splitext(output_hocr)[0]
args_tesseract = tess_base_args(language, engine_mode)
if pagesegmode is not None:
args_tesseract.extend([psm(), str(pagesegmode)])
# Reminder: test suite tesseract spoofers will break after any changes
# to the number or order of parameters here
args_tesseract.extend([
input_file,
badxml,
'hocr',
'txt'
prefix,
'txt',
'hocr'
] + tessconfig)
try:
log.debug(args_tesseract)
@@ -256,15 +258,17 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,
else:
tesseract_log_output(log, stdout, input_file)
if os.path.exists(badxml + '.html'):
# Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
shutil.move(badxml + '.html', badxml)
elif os.path.exists(badxml + '.hocr'):
# Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
shutil.move(badxml + '.hocr', badxml)
# Tesseract 3.02 appends suffix ".html" instead of ".hocr". For
# consistency rename its output to .hocr
if os.path.exists(prefix + '.html'):
shutil.move(prefix + '.html', prefix + '.tmp')
elif os.path.exists(prefix + '.hocr'):
shutil.move(prefix + '.hocr', prefix + '.tmp')
if os.path.exists(badxml + '.txt'):
shutil.move(badxml + '.txt', output_sidecar)
# The sidecar text file will get the suffix .txt; rename it to
# whatever caller wants it named
if os.path.exists(prefix + '.txt'):
shutil.move(prefix + '.txt', output_sidecar)
# Tesseract 3.03 inserts source filename into hocr file without
# escaping it, creating invalid XML and breaking the parser.
@@ -273,7 +277,7 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,
regex_nested_single_quotes = re.compile(
r"""title='image "([^"]*)";""")
with open(badxml, mode='r', encoding='utf-8') as f_in, \
with open(prefix + '.tmp', mode='r', encoding='utf-8') as f_in, \
open(output_hocr, mode='w', encoding='utf-8') as f_out:
for line in f_in:
line = regex_nested_single_quotes.sub(
@@ -329,10 +333,13 @@ def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text,
prefix = os.path.splitext(output_pdf)[0] # Tesseract appends suffixes
# Reminder: test suite tesseract spoofers will break after any changes
# to the number or order of parameters here
args_tesseract.extend([
input_image,
prefix,
'pdf', 'txt'
'txt',
'pdf',
] + tessconfig)
try:

View File

@@ -593,14 +593,22 @@ def render_hocr_page(
showBoundingboxes=False, invisibleText=True)
def flatten_groups(groups):
for obj in groups:
if is_iterable_notstr(obj):
yield from obj
else:
yield obj
def render_hocr_debug_page(
infiles,
output_file,
log,
context):
options = context.get_options()
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
image = next(ii for ii in infiles if ii.endswith('.image'))
hocr = next(ii for ii in flatten_groups(infiles) if ii.endswith('.hocr'))
image = next(ii for ii in flatten_groups(infiles) if ii.endswith('.image'))
pageinfo = get_pageinfo(image, context)
dpi = get_page_square_dpi(pageinfo, options)
@@ -610,14 +618,6 @@ def render_hocr_debug_page(
showBoundingboxes=True, invisibleText=False)
def flatten_groups(groups):
for obj in groups:
if is_iterable_notstr(obj):
yield from obj
else:
yield obj
def combine_layers(
infiles,
output_file,

View File

@@ -36,6 +36,10 @@ def real_tesseract():
def main():
operation = sys.argv[-1]
sidecar = False
if sys.argv[-2] == 'txt':
sidecar = True
# For any unexpected operation, defer to the real tesseract binary
# Currently this includes all use of "--tesseract-config"
if operation != 'hocr' and operation != 'pdf' and operation != 'stdout':
@@ -92,16 +96,22 @@ def main():
return
if operation == 'stdout':
# tesseract [--options] ... input stdout
input_file = sys.argv[-2]
output_file = 'stdout'
sidecar_file = ''
else:
input_file = sys.argv[-3]
output_file = sys.argv[-2]
# tesseract [--options] ... input output txt hocr|pdf
input_file = sys.argv[-4]
output_file = sys.argv[-3]
sidecar_file = sys.argv[-3]
if operation == 'hocr':
output_file += '.hocr'
sidecar_file += '.txt'
elif operation == 'pdf':
output_file += '.pdf'
sidecar_file += '.txt'
with open(input_file, 'rb') as f:
m.update(f.read())
@@ -112,6 +122,8 @@ def main():
print("Tesseract cache hit", file=sys.stderr)
if operation != 'stdout':
shutil.copy(cache_name, output_file)
if sidecar:
shutil.copy(cache_name + '.sidecar', sidecar_file)
# Replicate output
with open(cache_name + '.stdout', 'rb') as f:
@@ -149,6 +161,8 @@ def main():
shutil.copy(output_file, cache_name)
else:
print("Could not find output file", file=sys.stderr)
if sidecar and os.path.exists(sidecar_file):
shutil.copy(sidecar_file, cache_name + '.sidecar')
else:
open(cache_name, 'w').close()

View File

@@ -53,18 +53,22 @@ def main():
print('List of available languages (1):\neng', file=sys.stderr)
sys.exit(0)
elif sys.argv[-1] == 'hocr':
inputf = sys.argv[-3]
output = sys.argv[-2]
inputf = sys.argv[-4]
output = sys.argv[-3]
with Image.open(inputf) as im, \
open(output + '.hocr', 'w', encoding='utf-8') as f:
w, h = im.size
f.write(HOCR_TEMPLATE.format(str(w), str(h)))
with open(output + '.txt', 'w') as f:
f.write('')
elif sys.argv[-1] == 'pdf':
inputf = sys.argv[-3]
output = sys.argv[-2]
inputf = sys.argv[-4]
output = sys.argv[-3]
pdf_bytes = img2pdf.convert([inputf], dpi=300)
with open(output + '.pdf', 'wb') as f:
f.write(pdf_bytes)
with open(output + '.txt', 'w') as f:
f.write('')
elif sys.argv[-1] == 'stdout':
inputf = sys.argv[-2]
print("""Orientation: 0