diff --git a/ocrmypdf/exec/tesseract.py b/ocrmypdf/exec/tesseract.py index 21e2cc4f..c2835a68 100644 --- a/ocrmypdf/exec/tesseract.py +++ b/ocrmypdf/exec/tesseract.py @@ -200,7 +200,7 @@ def page_timedout(log, input_file): log.warning(prefix + " took too long to OCR - skipping") -def _generate_null_hocr(output_hocr, image): +def _generate_null_hocr(output_hocr, output_sidecar, image): """Produce a .hocr file that reports no text detected on a page that is the same size as the input image.""" from PIL import Image @@ -210,12 +210,16 @@ def _generate_null_hocr(output_hocr, image): with open(output_hocr, 'w', encoding="utf-8") as f: f.write(HOCR_TEMPLATE.format(w, h)) + with open(output_sidecar, 'w', encoding='utf-8') as f: + f.write('[skipped page]') -def generate_hocr(input_file, output_hocr, language: list, engine_mode, +def generate_hocr(input_file, output_files, language: list, engine_mode, tessconfig: list, timeout: float, pagesegmode: int, log): + output_hocr = next(o for o in output_files if o.endswith('.hocr')) + output_sidecar = next(o for o in output_files if o.endswith('.txt')) badxml = os.path.splitext(output_hocr)[0] + '.badxml' args_tesseract = tess_base_args(language, engine_mode) @@ -226,7 +230,8 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode, args_tesseract.extend([ input_file, badxml, - 'hocr' + 'hocr', + 'txt' ] + tessconfig) try: log.debug(args_tesseract) @@ -238,13 +243,13 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode, # Temporary workaround to hocrTransform not being able to function if # it does not have a valid hOCR file. page_timedout(log, input_file) - _generate_null_hocr(output_hocr, input_file) + _generate_null_hocr(output_hocr, output_sidecar, input_file) except CalledProcessError as e: tesseract_log_output(log, e.output, input_file) if 'read_params_file: parameter not found' in e.output: raise TesseractConfigError() from e if 'Image too large' in e.output: - _generate_null_hocr(output_hocr, input_file) + _generate_null_hocr(output_hocr, output_sidecar, input_file) return raise e from e @@ -258,6 +263,9 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode, # Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr) shutil.move(badxml + '.hocr', badxml) + if os.path.exists(badxml + '.txt'): + shutil.move(badxml + '.txt', output_sidecar) + # Tesseract 3.03 inserts source filename into hocr file without # escaping it, creating invalid XML and breaking the parser. # As a workaround, rewrite the hocr file, replacing the filename @@ -273,7 +281,10 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode, f_out.write(line) -def use_skip_page(text_only, skip_pdf, output_pdf): +def use_skip_page(text_only, skip_pdf, output_pdf, output_text): + with open(output_text, 'w') as f: + f.write('[skipped page]') + if not text_only: os.symlink(skip_pdf, output_pdf) return @@ -291,14 +302,15 @@ def use_skip_page(text_only, skip_pdf, output_pdf): pdf_out.write(out) -def generate_pdf(input_image, skip_pdf, output_pdf, language: list, - engine_mode, text_only: bool, +def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text, + language: list, engine_mode, text_only: bool, tessconfig: list, timeout: float, pagesegmode: int, log): '''Use Tesseract to render a PDF. input_image -- image to analyze skip_pdf -- if we time out, use this file as output output_pdf -- file to generate + output_text -- OCR text file language -- list of languages to consider engine_mode -- engine mode argument for tess v4 text_only -- enable tesseract text only mode? @@ -315,10 +327,12 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list, if text_only: args_tesseract.extend(['-c', 'textonly_pdf=1']) + prefix = os.path.splitext(output_pdf)[0] # Tesseract appends suffixes + args_tesseract.extend([ input_image, - os.path.splitext(output_pdf)[0], # Tesseract appends suffix - 'pdf' + prefix, + 'pdf', 'txt' ] + tessconfig) try: @@ -326,16 +340,18 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list, stdout = check_output( args_tesseract, close_fds=True, stderr=STDOUT, universal_newlines=True, timeout=timeout) + if os.path.exists(prefix + '.txt'): + shutil.move(prefix + '.txt', output_text) except TimeoutExpired: page_timedout(log, input_image) - use_skip_page(text_only, skip_pdf, output_pdf) + use_skip_page(text_only, skip_pdf, output_pdf, output_text) except CalledProcessError as e: tesseract_log_output(log, e.output, input_image) if 'read_params_file: parameter not found' in e.output: raise TesseractConfigError() from e if 'Image too large' in e.output: - use_skip_page(text_only, skip_pdf, output_pdf) + use_skip_page(text_only, skip_pdf, output_pdf, output_text) return raise e from e else: diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py index beafe1b4..9fd314a0 100644 --- a/ocrmypdf/pipeline.py +++ b/ocrmypdf/pipeline.py @@ -483,13 +483,13 @@ def select_ocr_image( def ocr_tesseract_hocr( input_file, - output_file, + output_files, log, context): options = context.get_options() tesseract.generate_hocr( input_file=input_file, - output_hocr=output_file, + output_files=output_files, language=options.language, engine_mode=options.tesseract_oem, tessconfig=options.tesseract_config, @@ -579,12 +579,12 @@ def select_image_layer( def render_hocr_page( - input_file, + infiles, output_file, log, context): options = context.get_options() - hocr = input_file + hocr = next(ii for ii in infiles if ii.endswith('.hocr')) pageinfo = get_pageinfo(hocr, context) dpi = get_page_square_dpi(pageinfo, options) @@ -610,13 +610,23 @@ def render_hocr_debug_page( showBoundingboxes=True, invisibleText=False) +def flatten_groups(groups): + for obj in groups: + if is_iterable_notstr(obj): + yield from obj + else: + yield obj + + def combine_layers( infiles, output_file, log, context): - text = next(ii for ii in infiles if ii.endswith('.text.pdf')) - image = next(ii for ii in infiles if ii.endswith('.image-layer.pdf')) + text = next(ii for ii in flatten_groups(infiles) + if ii.endswith('.text.pdf')) + image = next(ii for ii in flatten_groups(infiles) + if ii.endswith('.image-layer.pdf')) pdf_text = pypdf.PdfFileReader(open(text, "rb")) pdf_image = pypdf.PdfFileReader(open(image, "rb")) @@ -682,21 +692,27 @@ def combine_layers( def ocr_tesseract_and_render_pdf( infiles, - output_file, + outfiles, log, context): options = context.get_options() input_image = next((ii for ii in infiles if ii.endswith('.image')), '') input_pdf = next((ii for ii in infiles if ii.endswith('.pdf'))) + output_pdf = next((ii for ii in outfiles if ii.endswith('.pdf'))) + output_text = next((ii for ii in outfiles if ii.endswith('.txt'))) + if not input_image: # Skipping this page - re_symlink(input_pdf, output_file, log) + re_symlink(input_pdf, output_pdf, log) + with open(output_text, 'w') as f: + f.write('[skipped page]') return tesseract.generate_pdf( input_image=input_image, skip_pdf=input_pdf, - output_pdf=output_file, + output_pdf=output_pdf, + output_text=output_text, language=options.language, engine_mode=options.tesseract_oem, text_only=False, @@ -708,19 +724,23 @@ def ocr_tesseract_and_render_pdf( def ocr_tesseract_textonly_pdf( infiles, - output_file, + outfiles, log, context): options = context.get_options() input_image = next((ii for ii in infiles if ii.endswith('.ocr.png')), '') if not input_image: raise ValueError("No image rendered?") - skip_pdf = next((ii for ii in infiles if ii.endswith('.pdf'))) + + output_pdf = next((ii for ii in outfiles if ii.endswith('.pdf'))) + output_text = next((ii for ii in outfiles if ii.endswith('.txt'))) + tesseract.generate_pdf( input_image=input_image, skip_pdf=skip_pdf, - output_pdf=output_file, + output_pdf=output_pdf, + output_text=output_text, language=options.language, engine_mode=options.tesseract_oem, text_only=True, @@ -787,7 +807,7 @@ def skip_page( def merge_pages_ghostscript( - input_files, + input_files_groups, output_file, log, context): @@ -805,6 +825,8 @@ def merge_pages_ghostscript( key += 1 return key + input_files = (f for f in flatten_groups(input_files_groups) + if not f.endswith('.txt')) pdf_pages = sorted(input_files, key=input_file_order) log.debug("Final pages: " + "\n".join(pdf_pages)) ghostscript.generate_pdfa( @@ -813,11 +835,14 @@ def merge_pages_ghostscript( def merge_pages_qpdf( - input_files, + input_files_groups, output_file, log, context): options = context.get_options() + + input_files = list(f for f in flatten_groups(input_files_groups) + if not f.endswith('.txt')) metadata_file = next( (ii for ii in input_files if ii.endswith('.repaired.pdf'))) input_files.remove(metadata_file) @@ -851,6 +876,31 @@ def merge_pages_qpdf( qpdf.merge(pdf_pages, output_file) +def merge_sidecars( + input_files_groups, + output_file, + log, + context): + options = context.get_options() + + txt_files = sorted(f for f in flatten_groups(input_files_groups) + if f.endswith('.txt')) + + def write_pages(stream): + for page_number, txt_file in enumerate(txt_files): + if page_number != 0: + stream.write('\f') # Form feed between pages + with open(txt_file, 'r') as in_: + stream.write(in_.read()) + + if output_file == '-': + write_pages(sys.stdout) + sys.stdout.flush() + else: + with open(output_file, 'w', encoding='utf-8') as out: + write_pages(out) + + def copy_final( input_files, output_file, @@ -955,7 +1005,7 @@ def build_pipeline(options, work_folder, log, context): task_func=ocr_tesseract_hocr, input=task_select_ocr_image, filter=suffix(".ocr.png"), - output=".hocr", + output=[".hocr", ".txt"], extras=[log, context]) task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"') task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr') @@ -987,8 +1037,8 @@ def build_pipeline(options, work_folder, log, context): task_render_hocr_page = main_pipeline.transform( task_func=render_hocr_page, input=task_ocr_tesseract_hocr, - filter=suffix('.hocr'), - output='.text.pdf', + filter=regex(r".*/(\d{6})(?:\.hocr)"), + output=os.path.join(work_folder, r'\1.text.pdf'), extras=[log, context]) task_render_hocr_page.graphviz(fillcolor='"#00cc66"') task_render_hocr_page.active_if(options.pdf_renderer == 'hocr') @@ -1008,7 +1058,8 @@ def build_pipeline(options, work_folder, log, context): task_func=ocr_tesseract_textonly_pdf, input=[task_select_ocr_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"), - output=os.path.join(work_folder, r'\1.text.pdf'), + output=[os.path.join(work_folder, r'\1.text.pdf'), + os.path.join(work_folder, r'\1.text.txt')], extras=[log, context]) task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"') task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'tess4') @@ -1031,7 +1082,8 @@ def build_pipeline(options, work_folder, log, context): task_func=ocr_tesseract_and_render_pdf, input=[task_select_visible_page_image, task_orient_page], filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"), - output=os.path.join(work_folder, r'\1.rendered.pdf'), + output=[os.path.join(work_folder, r'\1.rendered.pdf'), + os.path.join(work_folder, r'\1.rendered.txt')], extras=[log, context]) task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"') task_ocr_tesseract_and_render_pdf.active_if(options.pdf_renderer == 'tesseract') @@ -1080,6 +1132,15 @@ def build_pipeline(options, work_folder, log, context): extras=[log, context]) task_merge_pages_qpdf.active_if(options.output_type == 'pdf') + task_merge_sidecars = main_pipeline.merge( + task_func=merge_sidecars, + input=[task_ocr_tesseract_hocr, + task_ocr_tesseract_and_render_pdf, + task_ocr_tesseract_textonly_pdf], + output=options.sidecar, + extras=[log, context]) + task_merge_sidecars.active_if(options.sidecar) + # Finalize task_copy_final = main_pipeline.merge( task_func=copy_final, diff --git a/tests/test_main.py b/tests/test_main.py index ddca6fba..85b90564 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -147,14 +147,18 @@ def test_remove_background(spoof_tesseract_noop, resources, outdir): @pytest.mark.parametrize("output_type", ['pdf', 'pdfa']) def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type, resources, outdir): + outfile = outdir / 'test_{0}_{1}.pdf'.format(pdf, renderer) check_ocrmypdf( resources / pdf, - outdir / 'test_{0}_{1}.pdf'.format(pdf, renderer), + outfile, '-dc', '-v', '1', '--output-type', output_type, + '--sidecar', '--pdf-renderer', renderer, env=spoof_tesseract_cache) + assert outfile.with_suffix('.pdf.txt').exists() + @pytest.mark.parametrize("output_type", [ 'pdfa', 'pdf' diff --git a/tests/test_tess4.py b/tests/test_tess4.py index 4cc9942c..5a706178 100644 --- a/tests/test_tess4.py +++ b/tests/test_tess4.py @@ -53,6 +53,7 @@ def test_textonly_pdf(ensure_tess4, resources, outdir): check_ocrmypdf( resources / 'linn.pdf', outdir / 'linn_textonly.pdf', '--pdf-renderer', 'tess4', + '--sidecar', 'foo', env=ensure_tess4)