mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-18 19:47:48 -04:00
Fix test suite breakage after the sidecar feature was added
The tesseract spoofers were not updated to account for the change in tesseract parameters. Also, the change to outputting multiple files in the collate steps affected how ruffus passes information to downstream consumers of those files.
This commit is contained in:
@@ -220,18 +220,20 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,
|
||||
|
||||
output_hocr = next(o for o in output_files if o.endswith('.hocr'))
|
||||
output_sidecar = next(o for o in output_files if o.endswith('.txt'))
|
||||
badxml = os.path.splitext(output_hocr)[0] + '.badxml'
|
||||
prefix = os.path.splitext(output_hocr)[0]
|
||||
|
||||
args_tesseract = tess_base_args(language, engine_mode)
|
||||
|
||||
if pagesegmode is not None:
|
||||
args_tesseract.extend([psm(), str(pagesegmode)])
|
||||
|
||||
# Reminder: test suite tesseract spoofers will break after any changes
|
||||
# to the number or order of parameters here
|
||||
args_tesseract.extend([
|
||||
input_file,
|
||||
badxml,
|
||||
'hocr',
|
||||
'txt'
|
||||
prefix,
|
||||
'txt',
|
||||
'hocr'
|
||||
] + tessconfig)
|
||||
try:
|
||||
log.debug(args_tesseract)
|
||||
@@ -256,15 +258,17 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,
|
||||
else:
|
||||
tesseract_log_output(log, stdout, input_file)
|
||||
|
||||
if os.path.exists(badxml + '.html'):
|
||||
# Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
|
||||
shutil.move(badxml + '.html', badxml)
|
||||
elif os.path.exists(badxml + '.hocr'):
|
||||
# Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
|
||||
shutil.move(badxml + '.hocr', badxml)
|
||||
# Tesseract 3.02 appends suffix ".html" instead of ".hocr". For
|
||||
# consistency rename its output to .hocr
|
||||
if os.path.exists(prefix + '.html'):
|
||||
shutil.move(prefix + '.html', prefix + '.tmp')
|
||||
elif os.path.exists(prefix + '.hocr'):
|
||||
shutil.move(prefix + '.hocr', prefix + '.tmp')
|
||||
|
||||
if os.path.exists(badxml + '.txt'):
|
||||
shutil.move(badxml + '.txt', output_sidecar)
|
||||
# The sidecar text file will get the suffix .txt; rename it to
|
||||
# whatever caller wants it named
|
||||
if os.path.exists(prefix + '.txt'):
|
||||
shutil.move(prefix + '.txt', output_sidecar)
|
||||
|
||||
# Tesseract 3.03 inserts source filename into hocr file without
|
||||
# escaping it, creating invalid XML and breaking the parser.
|
||||
@@ -273,7 +277,7 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,
|
||||
|
||||
regex_nested_single_quotes = re.compile(
|
||||
r"""title='image "([^"]*)";""")
|
||||
with open(badxml, mode='r', encoding='utf-8') as f_in, \
|
||||
with open(prefix + '.tmp', mode='r', encoding='utf-8') as f_in, \
|
||||
open(output_hocr, mode='w', encoding='utf-8') as f_out:
|
||||
for line in f_in:
|
||||
line = regex_nested_single_quotes.sub(
|
||||
@@ -329,10 +333,13 @@ def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text,
|
||||
|
||||
prefix = os.path.splitext(output_pdf)[0] # Tesseract appends suffixes
|
||||
|
||||
# Reminder: test suite tesseract spoofers will break after any changes
|
||||
# to the number or order of parameters here
|
||||
args_tesseract.extend([
|
||||
input_image,
|
||||
prefix,
|
||||
'pdf', 'txt'
|
||||
'txt',
|
||||
'pdf',
|
||||
] + tessconfig)
|
||||
|
||||
try:
|
||||
|
||||
@@ -593,14 +593,22 @@ def render_hocr_page(
|
||||
showBoundingboxes=False, invisibleText=True)
|
||||
|
||||
|
||||
def flatten_groups(groups):
    """Yield the entries of *groups*, expanding one level of nesting.

    Each entry for which ``is_iterable_notstr`` returns a true value is
    expanded and its members are yielded individually; every other entry
    is yielded unchanged.  Only one level is expanded: members of an
    expanded entry are yielded as they are, without further inspection.
    """
    for entry in groups:
        if not is_iterable_notstr(entry):
            # Plain entry (presumably a single filename string -- the
            # callers visible here test results with ``.endswith``).
            yield entry
            continue
        yield from entry
|
||||
|
||||
|
||||
def render_hocr_debug_page(
|
||||
infiles,
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
|
||||
image = next(ii for ii in infiles if ii.endswith('.image'))
|
||||
hocr = next(ii for ii in flatten_groups(infiles) if ii.endswith('.hocr'))
|
||||
image = next(ii for ii in flatten_groups(infiles) if ii.endswith('.image'))
|
||||
|
||||
pageinfo = get_pageinfo(image, context)
|
||||
dpi = get_page_square_dpi(pageinfo, options)
|
||||
@@ -610,14 +618,6 @@ def render_hocr_debug_page(
|
||||
showBoundingboxes=True, invisibleText=False)
|
||||
|
||||
|
||||
def flatten_groups(groups):
    """Yield the entries of *groups*, expanding one level of nesting.

    Each entry for which ``is_iterable_notstr`` returns a true value is
    expanded and its members are yielded individually; every other entry
    is yielded unchanged.  Only one level is expanded: members of an
    expanded entry are yielded as they are, without further inspection.
    """
    for entry in groups:
        if not is_iterable_notstr(entry):
            # Plain entry (presumably a single filename string -- the
            # callers visible here test results with ``.endswith``).
            yield entry
            continue
        yield from entry
|
||||
|
||||
|
||||
def combine_layers(
|
||||
infiles,
|
||||
output_file,
|
||||
|
||||
@@ -36,6 +36,10 @@ def real_tesseract():
|
||||
|
||||
def main():
|
||||
operation = sys.argv[-1]
|
||||
sidecar = False
|
||||
if sys.argv[-2] == 'txt':
|
||||
sidecar = True
|
||||
|
||||
# For any unexpected operation, defer to the real tesseract binary
|
||||
# Currently this includes all use of "--tesseract-config"
|
||||
if operation != 'hocr' and operation != 'pdf' and operation != 'stdout':
|
||||
@@ -92,16 +96,22 @@ def main():
|
||||
return
|
||||
|
||||
if operation == 'stdout':
|
||||
# tesseract [--options] ... input stdout
|
||||
input_file = sys.argv[-2]
|
||||
output_file = 'stdout'
|
||||
sidecar_file = ''
|
||||
else:
|
||||
input_file = sys.argv[-3]
|
||||
output_file = sys.argv[-2]
|
||||
# tesseract [--options] ... input output txt hocr|pdf
|
||||
input_file = sys.argv[-4]
|
||||
output_file = sys.argv[-3]
|
||||
sidecar_file = sys.argv[-3]
|
||||
|
||||
if operation == 'hocr':
|
||||
output_file += '.hocr'
|
||||
sidecar_file += '.txt'
|
||||
elif operation == 'pdf':
|
||||
output_file += '.pdf'
|
||||
sidecar_file += '.txt'
|
||||
|
||||
with open(input_file, 'rb') as f:
|
||||
m.update(f.read())
|
||||
@@ -112,6 +122,8 @@ def main():
|
||||
print("Tesseract cache hit", file=sys.stderr)
|
||||
if operation != 'stdout':
|
||||
shutil.copy(cache_name, output_file)
|
||||
if sidecar:
|
||||
shutil.copy(cache_name + '.sidecar', sidecar_file)
|
||||
|
||||
# Replicate output
|
||||
with open(cache_name + '.stdout', 'rb') as f:
|
||||
@@ -149,6 +161,8 @@ def main():
|
||||
shutil.copy(output_file, cache_name)
|
||||
else:
|
||||
print("Could not find output file", file=sys.stderr)
|
||||
if sidecar and os.path.exists(sidecar_file):
|
||||
shutil.copy(sidecar_file, cache_name + '.sidecar')
|
||||
else:
|
||||
open(cache_name, 'w').close()
|
||||
|
||||
|
||||
@@ -53,18 +53,22 @@ def main():
|
||||
print('List of available languages (1):\neng', file=sys.stderr)
|
||||
sys.exit(0)
|
||||
elif sys.argv[-1] == 'hocr':
|
||||
inputf = sys.argv[-3]
|
||||
output = sys.argv[-2]
|
||||
inputf = sys.argv[-4]
|
||||
output = sys.argv[-3]
|
||||
with Image.open(inputf) as im, \
|
||||
open(output + '.hocr', 'w', encoding='utf-8') as f:
|
||||
w, h = im.size
|
||||
f.write(HOCR_TEMPLATE.format(str(w), str(h)))
|
||||
with open(output + '.txt', 'w') as f:
|
||||
f.write('')
|
||||
elif sys.argv[-1] == 'pdf':
|
||||
inputf = sys.argv[-3]
|
||||
output = sys.argv[-2]
|
||||
inputf = sys.argv[-4]
|
||||
output = sys.argv[-3]
|
||||
pdf_bytes = img2pdf.convert([inputf], dpi=300)
|
||||
with open(output + '.pdf', 'wb') as f:
|
||||
f.write(pdf_bytes)
|
||||
with open(output + '.txt', 'w') as f:
|
||||
f.write('')
|
||||
elif sys.argv[-1] == 'stdout':
|
||||
inputf = sys.argv[-2]
|
||||
print("""Orientation: 0
|
||||
|
||||
Reference in New Issue
Block a user