Merge branch 'feature/user-words' into develop

# Conflicts:
#	ocrmypdf/exec/tesseract.py
This commit is contained in:
James R. Barlow
2017-07-20 16:25:20 -07:00
3 changed files with 32 additions and 3 deletions

View File

@@ -281,6 +281,15 @@ advanced.add_argument(
"are applied to all pages, including those for which OCR was "
"skipped. Not supported for --output-type=pdf ; that setting "
"preserves the original compression of all images.")
advanced.add_argument(
'--user-words', metavar='FILE',
help="Specify the location of the Tesseract user words file. This is a "
"list of words Tesseract should consider while performing OCR in "
"addition to its standard language dictionaries. This can improve "
"OCR quality especially for specialized and technical documents.")
advanced.add_argument(
'--user-patterns', metavar='FILE',
help="Specify the location of the Tesseract user patterns file.")
debugging = parser.add_argument_group(
"Debugging",

View File

@@ -213,7 +213,8 @@ def _generate_null_hocr(output_hocr, output_sidecar, image):
def generate_hocr(input_file, output_files, language: list, engine_mode,
tessconfig: list,
timeout: float, pagesegmode: int, log):
timeout: float, pagesegmode: int, user_words, user_patterns,
log):
output_hocr = next(o for o in output_files if o.endswith('.hocr'))
output_sidecar = next(o for o in output_files if o.endswith('.txt'))
@@ -224,10 +225,17 @@ def generate_hocr(input_file, output_files, language: list, engine_mode,
if pagesegmode is not None:
args_tesseract.extend([psm(), str(pagesegmode)])
if user_words:
args_tesseract.extend(['--user-words', user_words])
if user_patterns:
args_tesseract.extend(['--user-patterns', user_patterns])
# Reminder: test suite tesseract spoofers will break after any changes
# to the number or order of parameters here
# Tesseract 3.04 requires the order here to be "hocr txt" and will fail
# on "txt hocr"
args_tesseract.extend([
input_file,
prefix,
@@ -285,7 +293,8 @@ def use_skip_page(text_only, skip_pdf, output_pdf, output_text):
def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text,
language: list, engine_mode, text_only: bool,
tessconfig: list, timeout: float, pagesegmode: int, log):
tessconfig: list, timeout: float, pagesegmode: int,
user_words, user_patterns, log):
'''Use Tesseract to render a PDF.
input_image -- image to analyze
@@ -308,10 +317,17 @@ def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text,
if text_only:
args_tesseract.extend(['-c', 'textonly_pdf=1'])
if user_words:
args_tesseract.extend(['--user-words', user_words])
if user_patterns:
args_tesseract.extend(['--user-patterns', user_patterns])
prefix = os.path.splitext(output_pdf)[0] # Tesseract appends suffixes
# Reminder: test suite tesseract spoofers will break after any changes
# Reminder: test suite tesseract spoofers might break after any changes
# to the number or order of parameters here
args_tesseract.extend([
input_image,
prefix,

View File

@@ -525,6 +525,8 @@ def ocr_tesseract_hocr(
tessconfig=options.tesseract_config,
timeout=options.tesseract_timeout,
pagesegmode=options.tesseract_pagesegmode,
user_words=options.user_words,
user_patterns=options.user_patterns,
log=log
)
@@ -755,6 +757,8 @@ def ocr_tesseract_and_render_pdf(
tessconfig=options.tesseract_config,
timeout=options.tesseract_timeout,
pagesegmode=options.tesseract_pagesegmode,
user_words=options.user_words,
user_patterns=options.user_patterns,
log=log)