mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-05 13:16:55 -04:00
Implement sidecar text files (#126)
This commit is contained in:
@@ -200,7 +200,7 @@ def page_timedout(log, input_file):
|
||||
log.warning(prefix + " took too long to OCR - skipping")
|
||||
|
||||
|
||||
def _generate_null_hocr(output_hocr, image):
|
||||
def _generate_null_hocr(output_hocr, output_sidecar, image):
|
||||
"""Produce a .hocr file that reports no text detected on a page that is
|
||||
the same size as the input image."""
|
||||
from PIL import Image
|
||||
@@ -210,12 +210,16 @@ def _generate_null_hocr(output_hocr, image):
|
||||
|
||||
with open(output_hocr, 'w', encoding="utf-8") as f:
|
||||
f.write(HOCR_TEMPLATE.format(w, h))
|
||||
with open(output_sidecar, 'w', encoding='utf-8') as f:
|
||||
f.write('[skipped page]')
|
||||
|
||||
|
||||
def generate_hocr(input_file, output_hocr, language: list, engine_mode,
|
||||
def generate_hocr(input_file, output_files, language: list, engine_mode,
|
||||
tessconfig: list,
|
||||
timeout: float, pagesegmode: int, log):
|
||||
|
||||
output_hocr = next(o for o in output_files if o.endswith('.hocr'))
|
||||
output_sidecar = next(o for o in output_files if o.endswith('.txt'))
|
||||
badxml = os.path.splitext(output_hocr)[0] + '.badxml'
|
||||
|
||||
args_tesseract = tess_base_args(language, engine_mode)
|
||||
@@ -226,7 +230,8 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
|
||||
args_tesseract.extend([
|
||||
input_file,
|
||||
badxml,
|
||||
'hocr'
|
||||
'hocr',
|
||||
'txt'
|
||||
] + tessconfig)
|
||||
try:
|
||||
log.debug(args_tesseract)
|
||||
@@ -238,13 +243,13 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
|
||||
# Temporary workaround to hocrTransform not being able to function if
|
||||
# it does not have a valid hOCR file.
|
||||
page_timedout(log, input_file)
|
||||
_generate_null_hocr(output_hocr, input_file)
|
||||
_generate_null_hocr(output_hocr, output_sidecar, input_file)
|
||||
except CalledProcessError as e:
|
||||
tesseract_log_output(log, e.output, input_file)
|
||||
if 'read_params_file: parameter not found' in e.output:
|
||||
raise TesseractConfigError() from e
|
||||
if 'Image too large' in e.output:
|
||||
_generate_null_hocr(output_hocr, input_file)
|
||||
_generate_null_hocr(output_hocr, output_sidecar, input_file)
|
||||
return
|
||||
|
||||
raise e from e
|
||||
@@ -258,6 +263,9 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
|
||||
# Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
|
||||
shutil.move(badxml + '.hocr', badxml)
|
||||
|
||||
if os.path.exists(badxml + '.txt'):
|
||||
shutil.move(badxml + '.txt', output_sidecar)
|
||||
|
||||
# Tesseract 3.03 inserts source filename into hocr file without
|
||||
# escaping it, creating invalid XML and breaking the parser.
|
||||
# As a workaround, rewrite the hocr file, replacing the filename
|
||||
@@ -273,7 +281,10 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
|
||||
f_out.write(line)
|
||||
|
||||
|
||||
def use_skip_page(text_only, skip_pdf, output_pdf):
|
||||
def use_skip_page(text_only, skip_pdf, output_pdf, output_text):
|
||||
with open(output_text, 'w') as f:
|
||||
f.write('[skipped page]')
|
||||
|
||||
if not text_only:
|
||||
os.symlink(skip_pdf, output_pdf)
|
||||
return
|
||||
@@ -291,14 +302,15 @@ def use_skip_page(text_only, skip_pdf, output_pdf):
|
||||
pdf_out.write(out)
|
||||
|
||||
|
||||
def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
|
||||
engine_mode, text_only: bool,
|
||||
def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text,
|
||||
language: list, engine_mode, text_only: bool,
|
||||
tessconfig: list, timeout: float, pagesegmode: int, log):
|
||||
'''Use Tesseract to render a PDF.
|
||||
|
||||
input_image -- image to analyze
|
||||
skip_pdf -- if we time out, use this file as output
|
||||
output_pdf -- file to generate
|
||||
output_text -- OCR text file
|
||||
language -- list of languages to consider
|
||||
engine_mode -- engine mode argument for tess v4
|
||||
text_only -- enable tesseract text only mode?
|
||||
@@ -315,10 +327,12 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
|
||||
if text_only:
|
||||
args_tesseract.extend(['-c', 'textonly_pdf=1'])
|
||||
|
||||
prefix = os.path.splitext(output_pdf)[0] # Tesseract appends suffixes
|
||||
|
||||
args_tesseract.extend([
|
||||
input_image,
|
||||
os.path.splitext(output_pdf)[0], # Tesseract appends suffix
|
||||
'pdf'
|
||||
prefix,
|
||||
'pdf', 'txt'
|
||||
] + tessconfig)
|
||||
|
||||
try:
|
||||
@@ -326,16 +340,18 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
|
||||
stdout = check_output(
|
||||
args_tesseract, close_fds=True, stderr=STDOUT,
|
||||
universal_newlines=True, timeout=timeout)
|
||||
if os.path.exists(prefix + '.txt'):
|
||||
shutil.move(prefix + '.txt', output_text)
|
||||
except TimeoutExpired:
|
||||
page_timedout(log, input_image)
|
||||
use_skip_page(text_only, skip_pdf, output_pdf)
|
||||
use_skip_page(text_only, skip_pdf, output_pdf, output_text)
|
||||
except CalledProcessError as e:
|
||||
tesseract_log_output(log, e.output, input_image)
|
||||
if 'read_params_file: parameter not found' in e.output:
|
||||
raise TesseractConfigError() from e
|
||||
|
||||
if 'Image too large' in e.output:
|
||||
use_skip_page(text_only, skip_pdf, output_pdf)
|
||||
use_skip_page(text_only, skip_pdf, output_pdf, output_text)
|
||||
return
|
||||
raise e from e
|
||||
else:
|
||||
|
||||
@@ -483,13 +483,13 @@ def select_ocr_image(
|
||||
|
||||
def ocr_tesseract_hocr(
|
||||
input_file,
|
||||
output_file,
|
||||
output_files,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
tesseract.generate_hocr(
|
||||
input_file=input_file,
|
||||
output_hocr=output_file,
|
||||
output_files=output_files,
|
||||
language=options.language,
|
||||
engine_mode=options.tesseract_oem,
|
||||
tessconfig=options.tesseract_config,
|
||||
@@ -579,12 +579,12 @@ def select_image_layer(
|
||||
|
||||
|
||||
def render_hocr_page(
|
||||
input_file,
|
||||
infiles,
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
hocr = input_file
|
||||
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
|
||||
pageinfo = get_pageinfo(hocr, context)
|
||||
dpi = get_page_square_dpi(pageinfo, options)
|
||||
|
||||
@@ -610,13 +610,23 @@ def render_hocr_debug_page(
|
||||
showBoundingboxes=True, invisibleText=False)
|
||||
|
||||
|
||||
def flatten_groups(groups):
    """Yield the members of *groups*, expanding one level of nesting.

    Each entry for which ``is_iterable_notstr`` is true is replaced by its
    own elements; every other entry is yielded unchanged.  Only the first
    level is expanded -- elements of a nested entry are not inspected again.
    (Presumably ``is_iterable_notstr`` accepts lists/tuples of filenames and
    rejects plain strings; confirm against its definition.)
    """
    for entry in groups:
        if not is_iterable_notstr(entry):
            # Plain item: pass it through as-is.
            yield entry
            continue
        # Grouped item: emit each member individually.
        for member in entry:
            yield member
|
||||
|
||||
|
||||
def combine_layers(
|
||||
infiles,
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
text = next(ii for ii in infiles if ii.endswith('.text.pdf'))
|
||||
image = next(ii for ii in infiles if ii.endswith('.image-layer.pdf'))
|
||||
text = next(ii for ii in flatten_groups(infiles)
|
||||
if ii.endswith('.text.pdf'))
|
||||
image = next(ii for ii in flatten_groups(infiles)
|
||||
if ii.endswith('.image-layer.pdf'))
|
||||
|
||||
pdf_text = pypdf.PdfFileReader(open(text, "rb"))
|
||||
pdf_image = pypdf.PdfFileReader(open(image, "rb"))
|
||||
@@ -682,21 +692,27 @@ def combine_layers(
|
||||
|
||||
def ocr_tesseract_and_render_pdf(
|
||||
infiles,
|
||||
output_file,
|
||||
outfiles,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
input_image = next((ii for ii in infiles if ii.endswith('.image')), '')
|
||||
input_pdf = next((ii for ii in infiles if ii.endswith('.pdf')))
|
||||
output_pdf = next((ii for ii in outfiles if ii.endswith('.pdf')))
|
||||
output_text = next((ii for ii in outfiles if ii.endswith('.txt')))
|
||||
|
||||
if not input_image:
|
||||
# Skipping this page
|
||||
re_symlink(input_pdf, output_file, log)
|
||||
re_symlink(input_pdf, output_pdf, log)
|
||||
with open(output_text, 'w') as f:
|
||||
f.write('[skipped page]')
|
||||
return
|
||||
|
||||
tesseract.generate_pdf(
|
||||
input_image=input_image,
|
||||
skip_pdf=input_pdf,
|
||||
output_pdf=output_file,
|
||||
output_pdf=output_pdf,
|
||||
output_text=output_text,
|
||||
language=options.language,
|
||||
engine_mode=options.tesseract_oem,
|
||||
text_only=False,
|
||||
@@ -708,19 +724,23 @@ def ocr_tesseract_and_render_pdf(
|
||||
|
||||
def ocr_tesseract_textonly_pdf(
|
||||
infiles,
|
||||
output_file,
|
||||
outfiles,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
input_image = next((ii for ii in infiles if ii.endswith('.ocr.png')), '')
|
||||
if not input_image:
|
||||
raise ValueError("No image rendered?")
|
||||
|
||||
skip_pdf = next((ii for ii in infiles if ii.endswith('.pdf')))
|
||||
|
||||
output_pdf = next((ii for ii in outfiles if ii.endswith('.pdf')))
|
||||
output_text = next((ii for ii in outfiles if ii.endswith('.txt')))
|
||||
|
||||
tesseract.generate_pdf(
|
||||
input_image=input_image,
|
||||
skip_pdf=skip_pdf,
|
||||
output_pdf=output_file,
|
||||
output_pdf=output_pdf,
|
||||
output_text=output_text,
|
||||
language=options.language,
|
||||
engine_mode=options.tesseract_oem,
|
||||
text_only=True,
|
||||
@@ -787,7 +807,7 @@ def skip_page(
|
||||
|
||||
|
||||
def merge_pages_ghostscript(
|
||||
input_files,
|
||||
input_files_groups,
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
@@ -805,6 +825,8 @@ def merge_pages_ghostscript(
|
||||
key += 1
|
||||
return key
|
||||
|
||||
input_files = (f for f in flatten_groups(input_files_groups)
|
||||
if not f.endswith('.txt'))
|
||||
pdf_pages = sorted(input_files, key=input_file_order)
|
||||
log.debug("Final pages: " + "\n".join(pdf_pages))
|
||||
ghostscript.generate_pdfa(
|
||||
@@ -813,11 +835,14 @@ def merge_pages_ghostscript(
|
||||
|
||||
|
||||
def merge_pages_qpdf(
|
||||
input_files,
|
||||
input_files_groups,
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
|
||||
input_files = list(f for f in flatten_groups(input_files_groups)
|
||||
if not f.endswith('.txt'))
|
||||
metadata_file = next(
|
||||
(ii for ii in input_files if ii.endswith('.repaired.pdf')))
|
||||
input_files.remove(metadata_file)
|
||||
@@ -851,6 +876,31 @@ def merge_pages_qpdf(
|
||||
qpdf.merge(pdf_pages, output_file)
|
||||
|
||||
|
||||
def merge_sidecars(
        input_files_groups,
        output_file,
        log,
        context):
    '''Concatenate the per-page OCR text files into a single sidecar file.

    input_files_groups -- files produced by the upstream OCR tasks, possibly
                          grouped in nested sequences; only names ending in
                          '.txt' are used
    output_file -- destination path, or '-' to write to standard output
    log -- not used here; accepted because tasks receive (log, context)
    context -- not used here; accepted because tasks receive (log, context)

    Pages are written in sorted filename order with a form feed character
    between consecutive pages.
    '''
    txt_files = sorted(f for f in flatten_groups(input_files_groups)
                       if f.endswith('.txt'))

    def write_pages(stream):
        for page_number, txt_file in enumerate(txt_files):
            if page_number != 0:
                stream.write('\f')  # Form feed between pages
            # Decode explicitly as UTF-8: the merged file is written as
            # UTF-8, and relying on the locale's default encoding here can
            # mis-decode or fail on non-ASCII OCR text.
            with open(txt_file, 'r', encoding='utf-8') as in_:
                stream.write(in_.read())

    if output_file == '-':
        write_pages(sys.stdout)
        sys.stdout.flush()
    else:
        with open(output_file, 'w', encoding='utf-8') as out:
            write_pages(out)
|
||||
|
||||
|
||||
def copy_final(
|
||||
input_files,
|
||||
output_file,
|
||||
@@ -955,7 +1005,7 @@ def build_pipeline(options, work_folder, log, context):
|
||||
task_func=ocr_tesseract_hocr,
|
||||
input=task_select_ocr_image,
|
||||
filter=suffix(".ocr.png"),
|
||||
output=".hocr",
|
||||
output=[".hocr", ".txt"],
|
||||
extras=[log, context])
|
||||
task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
|
||||
task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
|
||||
@@ -987,8 +1037,8 @@ def build_pipeline(options, work_folder, log, context):
|
||||
task_render_hocr_page = main_pipeline.transform(
|
||||
task_func=render_hocr_page,
|
||||
input=task_ocr_tesseract_hocr,
|
||||
filter=suffix('.hocr'),
|
||||
output='.text.pdf',
|
||||
filter=regex(r".*/(\d{6})(?:\.hocr)"),
|
||||
output=os.path.join(work_folder, r'\1.text.pdf'),
|
||||
extras=[log, context])
|
||||
task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
|
||||
task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')
|
||||
@@ -1008,7 +1058,8 @@ def build_pipeline(options, work_folder, log, context):
|
||||
task_func=ocr_tesseract_textonly_pdf,
|
||||
input=[task_select_ocr_image, task_orient_page],
|
||||
filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"),
|
||||
output=os.path.join(work_folder, r'\1.text.pdf'),
|
||||
output=[os.path.join(work_folder, r'\1.text.pdf'),
|
||||
os.path.join(work_folder, r'\1.text.txt')],
|
||||
extras=[log, context])
|
||||
task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
|
||||
task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'tess4')
|
||||
@@ -1031,7 +1082,8 @@ def build_pipeline(options, work_folder, log, context):
|
||||
task_func=ocr_tesseract_and_render_pdf,
|
||||
input=[task_select_visible_page_image, task_orient_page],
|
||||
filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
|
||||
output=os.path.join(work_folder, r'\1.rendered.pdf'),
|
||||
output=[os.path.join(work_folder, r'\1.rendered.pdf'),
|
||||
os.path.join(work_folder, r'\1.rendered.txt')],
|
||||
extras=[log, context])
|
||||
task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"')
|
||||
task_ocr_tesseract_and_render_pdf.active_if(options.pdf_renderer == 'tesseract')
|
||||
@@ -1080,6 +1132,15 @@ def build_pipeline(options, work_folder, log, context):
|
||||
extras=[log, context])
|
||||
task_merge_pages_qpdf.active_if(options.output_type == 'pdf')
|
||||
|
||||
task_merge_sidecars = main_pipeline.merge(
|
||||
task_func=merge_sidecars,
|
||||
input=[task_ocr_tesseract_hocr,
|
||||
task_ocr_tesseract_and_render_pdf,
|
||||
task_ocr_tesseract_textonly_pdf],
|
||||
output=options.sidecar,
|
||||
extras=[log, context])
|
||||
task_merge_sidecars.active_if(options.sidecar)
|
||||
|
||||
# Finalize
|
||||
task_copy_final = main_pipeline.merge(
|
||||
task_func=copy_final,
|
||||
|
||||
@@ -147,14 +147,18 @@ def test_remove_background(spoof_tesseract_noop, resources, outdir):
|
||||
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
|
||||
def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type,
|
||||
resources, outdir):
|
||||
outfile = outdir / 'test_{0}_{1}.pdf'.format(pdf, renderer)
|
||||
check_ocrmypdf(
|
||||
resources / pdf,
|
||||
outdir / 'test_{0}_{1}.pdf'.format(pdf, renderer),
|
||||
outfile,
|
||||
'-dc',
|
||||
'-v', '1',
|
||||
'--output-type', output_type,
|
||||
'--sidecar',
|
||||
'--pdf-renderer', renderer, env=spoof_tesseract_cache)
|
||||
|
||||
assert outfile.with_suffix('.pdf.txt').exists()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("output_type", [
|
||||
'pdfa', 'pdf'
|
||||
|
||||
@@ -53,6 +53,7 @@ def test_textonly_pdf(ensure_tess4, resources, outdir):
|
||||
check_ocrmypdf(
|
||||
resources / 'linn.pdf',
|
||||
outdir / 'linn_textonly.pdf', '--pdf-renderer', 'tess4',
|
||||
'--sidecar', 'foo',
|
||||
env=ensure_tess4)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user