mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-18 19:47:48 -04:00
Support “textonly PDF” renderer in Tesseract 3.05.01
This commit is contained in:
@@ -327,10 +327,9 @@ def check_options_output(options, log):
|
||||
" recommended for builds of tesseract 4.00.00alpha older than"
|
||||
" February 2017. Make sure you are using a recent build.")
|
||||
|
||||
if options.debug_rendering and options.pdf_renderer == 'tesseract':
|
||||
if options.debug_rendering and options.pdf_renderer != 'hocr':
|
||||
log.info(
|
||||
"Ignoring --debug-rendering because it is not supported with"
|
||||
"--pdf-renderer=tesseract.")
|
||||
"Ignoring --debug-rendering because it requires --pdf-renderer=hocr")
|
||||
|
||||
lossless_reconstruction = False
|
||||
if options.pdf_renderer in ('hocr', 'tess4'):
|
||||
|
||||
@@ -63,26 +63,28 @@ def v4():
|
||||
return (version() >= '4')
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def has_textonly_pdf():
|
||||
if version() == '4.00.00alpha':
|
||||
# textonly_pdf added during the 4.00.00alpha cycle, so we must test
|
||||
# more carefully to see if it is present
|
||||
args_tess = [
|
||||
get_program('tesseract'),
|
||||
'--print-parameters'
|
||||
]
|
||||
try:
|
||||
params = check_output(
|
||||
args_tess, close_fds=True, universal_newlines=True,
|
||||
stderr=STDOUT)
|
||||
except CalledProcessError as e:
|
||||
print("Could not --print-parameters from tesseract",
|
||||
file=sys.stderr)
|
||||
raise MissingDependencyError from e
|
||||
if 'textonly_pdf' in params:
|
||||
return True
|
||||
else:
|
||||
return v4()
|
||||
"""Does Tesseract have textonly_pdf capability?
|
||||
|
||||
Available in 3.05.01, and v4.00.00alpha since January 2017. Best to
|
||||
parse the parameter list
|
||||
"""
|
||||
args_tess = [
|
||||
get_program('tesseract'),
|
||||
'--print-parameters'
|
||||
]
|
||||
try:
|
||||
params = check_output(
|
||||
args_tess, close_fds=True, universal_newlines=True,
|
||||
stderr=STDOUT)
|
||||
except CalledProcessError as e:
|
||||
print("Could not --print-parameters from tesseract",
|
||||
file=sys.stderr)
|
||||
raise MissingDependencyError from e
|
||||
if 'textonly_pdf' in params:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def psm():
|
||||
|
||||
Reference in New Issue
Block a user