Support “textonly PDF” renderer in Tesseract 3.05.01

This commit is contained in:
James R. Barlow
2017-06-13 10:18:08 -07:00
parent f7ee9e90ce
commit 3232643809
2 changed files with 23 additions and 22 deletions

View File

@@ -327,10 +327,9 @@ def check_options_output(options, log):
" recommended for builds of tesseract 4.00.00alpha older than"
" February 2017. Make sure you are using a recent build.")
if options.debug_rendering and options.pdf_renderer == 'tesseract':
if options.debug_rendering and options.pdf_renderer != 'hocr':
log.info(
"Ignoring --debug-rendering because it is not supported with"
"--pdf-renderer=tesseract.")
"Ignoring --debug-rendering because it requires --pdf-renderer=hocr")
lossless_reconstruction = False
if options.pdf_renderer in ('hocr', 'tess4'):

View File

@@ -63,26 +63,28 @@ def v4():
return (version() >= '4')
@lru_cache(maxsize=1)
def has_textonly_pdf():
    """Does Tesseract have textonly_pdf capability?

    Available in 3.05.01, and v4.00.00alpha since January 2017.  The
    parameter was added part-way through the 4.00.00alpha cycle, so a
    version comparison is not reliable; instead the parameter list
    reported by ``tesseract --print-parameters`` is searched for the
    name ``textonly_pdf``.

    The result is memoized by ``lru_cache``, so the subprocess is only
    spawned on the first call.

    Returns:
        True if ``textonly_pdf`` appears in the parameter listing,
        False otherwise.

    Raises:
        MissingDependencyError: if the ``--print-parameters`` invocation
            exits with a non-zero status.
    """
    args_tess = [
        get_program('tesseract'),
        '--print-parameters'
    ]
    try:
        # stderr is merged into stdout so that the whole listing is
        # searched regardless of which stream tesseract writes it to.
        params = check_output(
            args_tess, close_fds=True, universal_newlines=True,
            stderr=STDOUT)
    except CalledProcessError as e:
        print("Could not --print-parameters from tesseract",
              file=sys.stderr)
        raise MissingDependencyError from e
    return 'textonly_pdf' in params
def psm():