From 991db17fdeb212f524e3be499c542b904b3e97af Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 11 Apr 2020 16:03:00 -0700 Subject: [PATCH] Remove Ghostscript-based text extraction While faster than Python based methods, we've outgrown the limited amount of information Ghostscript provides with this feature, and it repeats an analysis we have to do anyway to learn what images are present. --- src/ocrmypdf/_pipeline.py | 6 +- src/ocrmypdf/_sync.py | 6 +- src/ocrmypdf/exec/ghostscript.py | 49 --------- src/ocrmypdf/pdfinfo/ghosttext.py | 102 ------------------ src/ocrmypdf/pdfinfo/info.py | 58 +++------- tests/cache/manifest.jsonl | 1 + .../hocr.bin | Bin 0 -> 1903 bytes .../stderr.bin | Bin 0 -> 55 bytes .../stdout.bin | Bin .../txt.bin | Bin 0 -> 46 bytes tests/test_main.py | 4 +- tests/test_pdfinfo.py | 25 +---- 12 files changed, 23 insertions(+), 228 deletions(-) delete mode 100644 src/ocrmypdf/pdfinfo/ghosttext.py create mode 100644 tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/hocr.bin create mode 100644 tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stderr.bin create mode 100644 tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stdout.bin create mode 100644 tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/txt.bin diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index bb2f7e18..3be65faa 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -147,11 +147,9 @@ def triage(original_filename, input_file, output_file, options): return output_file -def get_pdfinfo(input_file, detailed_page_analysis=False, progbar=False): +def get_pdfinfo(input_file, progbar=False): try: - return PdfInfo( - input_file, detailed_page_analysis=detailed_page_analysis, progbar=progbar - ) + return PdfInfo(input_file, progbar=progbar) except pikepdf.PasswordError: raise EncryptedPdfError() except pikepdf.PdfError: diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index f54c32a6..9f5c65d8 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -326,11 +326,7 @@ def run_pipeline(options, api=False): ) # Gather pdfinfo and create context - pdfinfo = get_pdfinfo( - origin_pdf, - detailed_page_analysis=options.redo_ocr, - progbar=options.progress_bar, - ) + pdfinfo = get_pdfinfo(origin_pdf, progbar=options.progress_bar) context = PDFContext(options, work_folder, origin_pdf, pdfinfo) diff --git a/src/ocrmypdf/exec/ghostscript.py b/src/ocrmypdf/exec/ghostscript.py index 5c27488f..44b81682 100644 --- a/src/ocrmypdf/exec/ghostscript.py +++ b/src/ocrmypdf/exec/ghostscript.py @@ -83,55 +83,6 @@ def _gs_error_reported(stream) -> bool: return re.search(r'error', stream, flags=re.IGNORECASE) -def extract_text(input_file, pageno=1): - """Use the txtwrite device to get text layout information out - - For details on options of -dTextFormat see - https://www.ghostscript.com/doc/current/VectorDevices.htm#TXT - - Format is like - - - - - - :param pageno: number of page to extract, or all pages if None - :return: XML-ish text representation in bytes - """ - - if pageno is not None: - pages = ['-dFirstPage=%i' % pageno, '-dLastPage=%i' % pageno] - else: - pages = [] - - # Note due to bug https://bugs.ghostscript.com/show_bug.cgi?id=701971 - # Ghostscript <= 9.50 will truncate output unless we write to stdout, so - # don't write to a file. - args_gs = ( - [ - GS, - '-dQUIET', - '-dSAFER', - '-dBATCH', - '-dNOPAUSE', - '-sDEVICE=txtwrite', - '-dTextFormat=0', - ] - + pages - + ['-o', '-', fspath(input_file), "-sstdout=%stderr"] - ) - - try: - p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True) - except CalledProcessError as e: - raise SubprocessOutputError( - 'Ghostscript text extraction failed\n%s\n%s' - % (input_file, e.stderr.decode(errors='replace')) - ) - - return p.stdout - - def rasterize_pdf( input_file: os.PathLike, output_file: os.PathLike, diff --git a/src/ocrmypdf/pdfinfo/ghosttext.py b/src/ocrmypdf/pdfinfo/ghosttext.py deleted file mode 100644 index 07e72f19..00000000 --- a/src/ocrmypdf/pdfinfo/ghosttext.py +++ /dev/null @@ -1,102 +0,0 @@ -# © 2018 James R. Barlow: github.com/jbarlow83 -# -# This file is part of OCRmyPDF. -# -# OCRmyPDF is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# OCRmyPDF is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with OCRmyPDF. If not, see . - -import logging -import re -import xml.etree.ElementTree as ET - -from ..exec import ghostscript - -log = logging.getLogger(__name__) - -# Forgive me for I have sinned -# I am using regular expressions to parse XML. However the XML in this case, -# generated by Ghostscript, is self-consistent enough to be parseable. -regex_remove_char_tags = re.compile( - br""" - ] # anything single character but > - | \">\" # special case: trap ">" - )* - /> # terminate with '/>' -""", - re.VERBOSE, -) - - -def page_get_textblocks(infile, pageno, xmltext, height): - """Get text boxes out of Ghostscript txtwrite xml""" - - root = xmltext - if not hasattr(xmltext, 'findall'): - return [] - - def blocks(): - for span in root.findall('.//span'): - bbox_str = span.attrib['bbox'] - font_size = span.attrib['size'] - pts = [int(pt) for pt in bbox_str.split()] - pts[1] = pts[1] - int(float(font_size) + 0.5) - bbox_topdown = tuple(pts) - bb = bbox_topdown - bbox_bottomup = (bb[0], height - bb[3], bb[2], height - bb[1]) - yield bbox_bottomup - - def joined_blocks(): - prev = None - for bbox in blocks(): - if prev is None: - prev = bbox - if bbox[1] == prev[1] and bbox[3] == prev[3]: - gap = prev[2] - bbox[0] - height = abs(bbox[3] - bbox[1]) - if gap < height: - # Join boxes - prev = (prev[0], prev[1], bbox[2], bbox[3]) - continue - # yield previously joined bboxes and start anew - yield prev - prev = bbox - if prev is not None: - yield prev - - return [block for block in joined_blocks()] - - -def extract_text_xml(infile, pdf, pageno=None): - existing_text = ghostscript.extract_text(infile, pageno=None) - existing_text = regex_remove_char_tags.sub(b' ', existing_text) - - try: - root = ET.fromstringlist([b'\n', existing_text, b'\n']) - page_xml = root.findall('page') - except ET.ParseError as e: - log.error( - "An error occurred while attempting to retrieve existing text in " - "the input file. Will attempt to continue assuming that there is " - "no existing text in the file. The error was:" - ) - log.error(e) - page_xml = [None] * len(pdf.pages) - - page_count_difference = len(pdf.pages) - len(page_xml) - if page_count_difference != 0: - log.error("The number of pages in the input file is inconsistent.") - log.error(f"Expected {len(pdf.pages)}, txtwrite says {len(page_xml)}") - if page_count_difference > 0: - page_xml.extend([None] * page_count_difference) - return page_xml diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py index bb232ee5..bd7641c4 100644 --- a/src/ocrmypdf/pdfinfo/info.py +++ b/src/ocrmypdf/pdfinfo/info.py @@ -33,9 +33,7 @@ from pikepdf import PdfMatrix from ocrmypdf._concurrent import exec_progress_pool from ocrmypdf.exceptions import EncryptedPdfError -from ocrmypdf.exec import ghostscript from ocrmypdf.helpers import Resolution -from ocrmypdf.pdfinfo import ghosttext from ocrmypdf.pdfinfo.layout import get_page_analysis, get_text_boxes logger = logging.getLogger() @@ -557,7 +555,7 @@ def simplify_textboxes(miner, textbox_getter): yield TextboxInfo(box.bbox, visible, corrupt) -def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str): +def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike): pageinfo = {} pageinfo['pageno'] = pageno pageinfo['images'] = [] @@ -567,16 +565,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str): width_pt = mediabox[2] - mediabox[0] height_pt = mediabox[3] - mediabox[1] - if xmltext is not None: - bboxes = ghosttext.page_get_textblocks( - fspath(infile), pageno, xmltext=xmltext, height=height_pt - ) - pageinfo['bboxes'] = bboxes - else: - pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5') - miner = get_page_analysis(infile, pageno, pscript5_mode) - pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes)) - bboxes = (box.bbox for box in pageinfo['textboxes']) + pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5') + miner = get_page_analysis(infile, pageno, pscript5_mode) + pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes)) + bboxes = (box.bbox for box in pageinfo['textboxes']) pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt) @@ -624,12 +616,12 @@ def _pdf_pageinfo_sync_init(infile): def _pdf_pageinfo_sync(args): global worker_pdf # pylint: disable=global-statement - pageno, infile, xmltext, detailed_analysis = args - page = PageInfo(worker_pdf, pageno, infile, xmltext, detailed_analysis) + pageno, infile = args + page = PageInfo(worker_pdf, pageno, infile) return page -def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar): +def _pdf_pageinfo_concurrent(pdf, infile, progbar): pages = [None] * len(pdf.pages) def update_pageinfo(result, pbar): @@ -637,11 +629,7 @@ def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar) pages[page.pageno] = page pbar.update() - contexts = ( - (n, infile, pages_xml[n] if pages_xml else None, detailed_analysis) - for n in range(len(pdf.pages)) - ) - + contexts = ((n, infile) for n in range(len(pdf.pages))) if os.name == 'nt': # We can't parallelize on Windows, because Windows cannot fork. # We are trying to fork, then take advantage of the preloaded pikepdf.Pdf @@ -668,19 +656,12 @@ def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar) return pages -def _pdf_get_all_pageinfo(infile, detailed_analysis=False, progbar=False): +def _pdf_get_all_pageinfo(infile, progbar=False): pdf = pikepdf.open(infile) # Do not close in this function try: if pdf.is_encrypted: raise EncryptedPdfError() # Triggered by encryption with empty passwd - if detailed_analysis: - pages_xml = None - else: - pages_xml = ghosttext.extract_text_xml(infile, pdf, pageno=None) - - pages = _pdf_pageinfo_concurrent( - pdf, infile, pages_xml, detailed_analysis, progbar - ) + pages = _pdf_pageinfo_concurrent(pdf, infile, progbar) except Exception: pdf.close() raise @@ -689,11 +670,10 @@ def _pdf_get_all_pageinfo(infile, detailed_analysis=False, progbar=False): class PageInfo: - def __init__(self, pdf, pageno, infile, xmltext, detailed_analysis=False): + def __init__(self, pdf, pageno, infile): self._pageno = pageno self._infile = infile - self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile, xmltext) - self._detailed_analysis = detailed_analysis + self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile) @property def pageno(self): @@ -705,8 +685,6 @@ class PageInfo: @property def has_corrupt_text(self): - if not self._detailed_analysis: - raise NotImplementedError('Did not do detailed analysis') return any(tbox.is_corrupt for tbox in self._pageinfo['textboxes']) @property @@ -757,7 +735,7 @@ class PageInfo: if 'textboxes' not in self._pageinfo: if visible is not None and corrupt is not None: - raise NotImplementedError('Ghostscript textboxes cannot be classified') + raise NotImplementedError('Incomplete information on textboxes') return self._pageinfo['bboxes'] return ( @@ -792,13 +770,9 @@ class PageInfo: class PdfInfo: """Get summary information about a PDF""" - def __init__(self, infile, detailed_page_analysis=False, progbar=False): + def __init__(self, infile, progbar=False): self._infile = infile - if ghostscript.version() in ('9.52',): - detailed_page_analysis = True # txtwrite doesn't work in these versions - self._pages, pdf = _pdf_get_all_pageinfo( - infile, detailed_page_analysis, progbar=progbar - ) + self._pages, pdf = _pdf_get_all_pageinfo(infile, progbar=progbar) self._needs_rendering = pdf.root.get('/NeedsRendering', False) self._has_acroform = False if '/AcroForm' in pdf.root: diff --git a/tests/cache/manifest.jsonl b/tests/cache/manifest.jsonl index 05a0e86c..23e50166 100644 --- a/tests/cache/manifest.jsonl +++ b/tests/cache/manifest.jsonl @@ -69,3 +69,4 @@ {"tesseract_version": "tesseract 4.1.0 leptonica-1.78.0 libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]} {"tesseract_version": "tesseract 4.1.0 leptonica-1.78.0 libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]} {"tesseract_version": "tesseract 4.1.0 leptonica-1.78.0 libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.1.1 leptonica-1.79.0 libgif 5.2.1 : libjpeg 9d : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.1.0 : libopenjp2 2.3.1 Found AVX2 Found AVX Found FMA Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]} diff --git a/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/hocr.bin b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/hocr.bin new file mode 100644 index 0000000000000000000000000000000000000000..27e769f68d186d9b136b8d6835bafd0400cf1625 GIT binary patch literal 1903 zcmb7FTW^~%6n^ioIOBzAt$wV? z>J!;1JgNZpG0k6%tv? zhGz+xu4<&^lA$v>EcmaR`ZhH#`cCprE>?E~H=vmAQbY2cBudPAv0P2K57yY;poOEomCx2XHtR3r)6n@{q-ywhUK`WA|Y38BSmXcOu(H5JJk#p&*( zX5A`MgWR%_T=z|hNg<9Pp|l;Rt$x@WzZk~+mtD%7{uHf^N7a!PDf-r`YPq?|Efmw< z4dt~JIKl8E_Z_0%vYj(Aak{O)8AZaBO}&cpoPa9UPN))u9jnz=K2#tea!11(`rq z^{BL9Ce8Ii>;e;9FXY0uY@5O&w9jF;7WHoqZV+0&x+}RTc&F9@!e|;|4OJKl&Vv$)4B9t|)}s+XSUg(r^Uz|5F@GKF_S;fGVte22!T z46d2ISw6j2{xOB`2J;v@q5H5bv14YJSMQEo9N(OzVEtDz?^k9Pa_UDRxSsFmuVxla z8&j$2_o)p2%A^{YC-NzI433TK@0a>9s&HcrWE92^&fU<=`sMc{v;UL%@Ps3v7>pCU mha?8r-IiDEku=VE6EaLY6u~V@Rmr|c$h6_&28{n2+1Vd+pFgSq literal 0 HcmV?d00001 diff --git a/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stderr.bin b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stderr.bin new file mode 100644 index 0000000000000000000000000000000000000000..16b617e5fc5dc7bd2ed182f08ade8647ae1cf593 GIT binary patch literal 55 zcmWGZEiO(iN=z', - b'', - b'', - ] - must_not_match = [b'', b'', b'', b''] - - for s in must_match: - assert rx.match(s) - for s in must_not_match: - assert not rx.match(s) - - def test_vector(resources): filename = resources / 'vector.pdf' pdf = pdfinfo.PdfInfo(filename) @@ -184,16 +168,9 @@ def test_ocr_detection(resources): @pytest.mark.parametrize( 'testfile', ('truetype_font_nomapping.pdf', 'type3_font_nomapping.pdf') ) -@pytest.mark.xfail( - ghostscript.version() in ('9.52',), reason="gs 9.52 txtwrite doesn't work" -) def test_corrupt_font_detection(resources, testfile): filename = resources / testfile - with pytest.raises(NotImplementedError): - pdf = pdfinfo.PdfInfo(filename) - pdf[0].has_corrupt_text - - pdf = pdfinfo.PdfInfo(filename, detailed_page_analysis=True) + pdf = pdfinfo.PdfInfo(filename) assert pdf[0].has_corrupt_text