Remove Ghostscript-based text extraction

While Ghostscript's text extraction is faster than Python-based methods, we've outgrown the limited amount of information it provides, and it repeats an analysis we have to do anyway to learn what images are present.
2026-05-06 13:47:41 -04:00 · 2020-04-11 16:03:00 -07:00
parent 2c07515907
commit 991db17fde
12 changed files with 23 additions and 228 deletions
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -147,11 +147,9 @@ def triage(original_filename, input_file, output_file, options):
    return output_file


-def get_pdfinfo(input_file, detailed_page_analysis=False, progbar=False):
+def get_pdfinfo(input_file, progbar=False):
    try:
-        return PdfInfo(
-            input_file, detailed_page_analysis=detailed_page_analysis, progbar=progbar
-        )
+        return PdfInfo(input_file, progbar=progbar)
    except pikepdf.PasswordError:
        raise EncryptedPdfError()
    except pikepdf.PdfError:
--- a/src/ocrmypdf/_sync.py
+++ b/src/ocrmypdf/_sync.py
@@ -326,11 +326,7 @@ def run_pipeline(options, api=False):
        )

        # Gather pdfinfo and create context
-        pdfinfo = get_pdfinfo(
-            origin_pdf,
-            detailed_page_analysis=options.redo_ocr,
-            progbar=options.progress_bar,
-        )
+        pdfinfo = get_pdfinfo(origin_pdf, progbar=options.progress_bar)

        context = PDFContext(options, work_folder, origin_pdf, pdfinfo)

--- a/src/ocrmypdf/exec/ghostscript.py
+++ b/src/ocrmypdf/exec/ghostscript.py
@@ -83,55 +83,6 @@ def _gs_error_reported(stream) -> bool:
    return re.search(r'error', stream, flags=re.IGNORECASE)


-def extract_text(input_file, pageno=1):
-    """Use the txtwrite device to get text layout information out
-
-    For details on options of -dTextFormat see
-    https://www.ghostscript.com/doc/current/VectorDevices.htm#TXT
-
-    Format is like
-    <page>
-    <line>
-    <span bbox="left top right bottom" font="..." size="...">
-    <char bbox="...." c="X"/>
-
-    :param pageno: number of page to extract, or all pages if None
-    :return: XML-ish text representation in bytes
-    """
-
-    if pageno is not None:
-        pages = ['-dFirstPage=%i' % pageno, '-dLastPage=%i' % pageno]
-    else:
-        pages = []
-
-    # Note due to bug https://bugs.ghostscript.com/show_bug.cgi?id=701971
-    # Ghostscript <= 9.50 will truncate output unless we write to stdout, so
-    # don't write to a file.
-    args_gs = (
-        [
-            GS,
-            '-dQUIET',
-            '-dSAFER',
-            '-dBATCH',
-            '-dNOPAUSE',
-            '-sDEVICE=txtwrite',
-            '-dTextFormat=0',
-        ]
-        + pages
-        + ['-o', '-', fspath(input_file), "-sstdout=%stderr"]
-    )
-
-    try:
-        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
-    except CalledProcessError as e:
-        raise SubprocessOutputError(
-            'Ghostscript text extraction failed\n%s\n%s'
-            % (input_file, e.stderr.decode(errors='replace'))
-        )
-
-    return p.stdout
-
-
 def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
--- a/src/ocrmypdf/pdfinfo/ghosttext.py
+++ b/src/ocrmypdf/pdfinfo/ghosttext.py
@@ -1,102 +0,0 @@
-# © 2018 James R. Barlow: github.com/jbarlow83
-#
-# This file is part of OCRmyPDF.
-#
-# OCRmyPDF is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# OCRmyPDF is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
-
-import logging
-import re
-import xml.etree.ElementTree as ET
-
-from ..exec import ghostscript
-
-log = logging.getLogger(__name__)
-
-# Forgive me for I have sinned
-# I am using regular expressions to parse XML. However the XML in this case,
-# generated by Ghostscript, is self-consistent enough to be parseable.
-regex_remove_char_tags = re.compile(
-    br"""
-    <char\b
-    (?:   [^>]   # anything single character but >
-        | \">\"  # special case: trap ">"
-    )*
-    />           # terminate with '/>'
-""",
-    re.VERBOSE,
-)
-
-
-def page_get_textblocks(infile, pageno, xmltext, height):
-    """Get text boxes out of Ghostscript txtwrite xml"""
-
-    root = xmltext
-    if not hasattr(xmltext, 'findall'):
-        return []
-
-    def blocks():
-        for span in root.findall('.//span'):
-            bbox_str = span.attrib['bbox']
-            font_size = span.attrib['size']
-            pts = [int(pt) for pt in bbox_str.split()]
-            pts[1] = pts[1] - int(float(font_size) + 0.5)
-            bbox_topdown = tuple(pts)
-            bb = bbox_topdown
-            bbox_bottomup = (bb[0], height - bb[3], bb[2], height - bb[1])
-            yield bbox_bottomup
-
-    def joined_blocks():
-        prev = None
-        for bbox in blocks():
-            if prev is None:
-                prev = bbox
-            if bbox[1] == prev[1] and bbox[3] == prev[3]:
-                gap = prev[2] - bbox[0]
-                height = abs(bbox[3] - bbox[1])
-                if gap < height:
-                    # Join boxes
-                    prev = (prev[0], prev[1], bbox[2], bbox[3])
-                    continue
-            # yield previously joined bboxes and start anew
-            yield prev
-            prev = bbox
-        if prev is not None:
-            yield prev
-
-    return [block for block in joined_blocks()]
-
-
-def extract_text_xml(infile, pdf, pageno=None):
-    existing_text = ghostscript.extract_text(infile, pageno=None)
-    existing_text = regex_remove_char_tags.sub(b' ', existing_text)
-
-    try:
-        root = ET.fromstringlist([b'<document>\n', existing_text, b'</document>\n'])
-        page_xml = root.findall('page')
-    except ET.ParseError as e:
-        log.error(
-            "An error occurred while attempting to retrieve existing text in "
-            "the input file. Will attempt to continue assuming that there is "
-            "no existing text in the file. The error was:"
-        )
-        log.error(e)
-        page_xml = [None] * len(pdf.pages)
-
-    page_count_difference = len(pdf.pages) - len(page_xml)
-    if page_count_difference != 0:
-        log.error("The number of pages in the input file is inconsistent.")
-        log.error(f"Expected {len(pdf.pages)}, txtwrite says {len(page_xml)}")
-        if page_count_difference > 0:
-            page_xml.extend([None] * page_count_difference)
-    return page_xml
--- a/src/ocrmypdf/pdfinfo/info.py
+++ b/src/ocrmypdf/pdfinfo/info.py
@@ -33,9 +33,7 @@ from pikepdf import PdfMatrix

 from ocrmypdf._concurrent import exec_progress_pool
 from ocrmypdf.exceptions import EncryptedPdfError
-from ocrmypdf.exec import ghostscript
 from ocrmypdf.helpers import Resolution
-from ocrmypdf.pdfinfo import ghosttext
 from ocrmypdf.pdfinfo.layout import get_page_analysis, get_text_boxes

 logger = logging.getLogger()
@@ -557,7 +555,7 @@ def simplify_textboxes(miner, textbox_getter):
        yield TextboxInfo(box.bbox, visible, corrupt)


-def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str):
+def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike):
    pageinfo = {}
    pageinfo['pageno'] = pageno
    pageinfo['images'] = []
@@ -567,16 +565,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str):
    width_pt = mediabox[2] - mediabox[0]
    height_pt = mediabox[3] - mediabox[1]

-    if xmltext is not None:
-        bboxes = ghosttext.page_get_textblocks(
-            fspath(infile), pageno, xmltext=xmltext, height=height_pt
-        )
-        pageinfo['bboxes'] = bboxes
-    else:
-        pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5')
-        miner = get_page_analysis(infile, pageno, pscript5_mode)
-        pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
-        bboxes = (box.bbox for box in pageinfo['textboxes'])
+    pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5')
+    miner = get_page_analysis(infile, pageno, pscript5_mode)
+    pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
+    bboxes = (box.bbox for box in pageinfo['textboxes'])

    pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt)

@@ -624,12 +616,12 @@ def _pdf_pageinfo_sync_init(infile):

 def _pdf_pageinfo_sync(args):
    global worker_pdf  # pylint: disable=global-statement
-    pageno, infile, xmltext, detailed_analysis = args
-    page = PageInfo(worker_pdf, pageno, infile, xmltext, detailed_analysis)
+    pageno, infile = args
+    page = PageInfo(worker_pdf, pageno, infile)
    return page


-def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar):
+def _pdf_pageinfo_concurrent(pdf, infile, progbar):
    pages = [None] * len(pdf.pages)

    def update_pageinfo(result, pbar):
@@ -637,11 +629,7 @@ def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar)
        pages[page.pageno] = page
        pbar.update()

-    contexts = (
-        (n, infile, pages_xml[n] if pages_xml else None, detailed_analysis)
-        for n in range(len(pdf.pages))
-    )
-
+    contexts = ((n, infile) for n in range(len(pdf.pages)))
    if os.name == 'nt':
        # We can't parallelize on Windows, because Windows cannot fork.
        # We are trying to fork, then take advantage of the preloaded pikepdf.Pdf
@@ -668,19 +656,12 @@ def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar)
    return pages


-def _pdf_get_all_pageinfo(infile, detailed_analysis=False, progbar=False):
+def _pdf_get_all_pageinfo(infile, progbar=False):
    pdf = pikepdf.open(infile)  # Do not close in this function
    try:
        if pdf.is_encrypted:
            raise EncryptedPdfError()  # Triggered by encryption with empty passwd
-        if detailed_analysis:
-            pages_xml = None
-        else:
-            pages_xml = ghosttext.extract_text_xml(infile, pdf, pageno=None)
-
-        pages = _pdf_pageinfo_concurrent(
-            pdf, infile, pages_xml, detailed_analysis, progbar
-        )
+        pages = _pdf_pageinfo_concurrent(pdf, infile, progbar)
    except Exception:
        pdf.close()
        raise
@@ -689,11 +670,10 @@ def _pdf_get_all_pageinfo(infile, detailed_analysis=False, progbar=False):


 class PageInfo:
-    def __init__(self, pdf, pageno, infile, xmltext, detailed_analysis=False):
+    def __init__(self, pdf, pageno, infile):
        self._pageno = pageno
        self._infile = infile
-        self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile, xmltext)
-        self._detailed_analysis = detailed_analysis
+        self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile)

    @property
    def pageno(self):
@@ -705,8 +685,6 @@ class PageInfo:

    @property
    def has_corrupt_text(self):
-        if not self._detailed_analysis:
-            raise NotImplementedError('Did not do detailed analysis')
        return any(tbox.is_corrupt for tbox in self._pageinfo['textboxes'])

    @property
@@ -757,7 +735,7 @@ class PageInfo:

        if 'textboxes' not in self._pageinfo:
            if visible is not None and corrupt is not None:
-                raise NotImplementedError('Ghostscript textboxes cannot be classified')
+                raise NotImplementedError('Incomplete information on textboxes')
            return self._pageinfo['bboxes']

        return (
@@ -792,13 +770,9 @@ class PageInfo:
 class PdfInfo:
    """Get summary information about a PDF"""

-    def __init__(self, infile, detailed_page_analysis=False, progbar=False):
+    def __init__(self, infile, progbar=False):
        self._infile = infile
-        if ghostscript.version() in ('9.52',):
-            detailed_page_analysis = True  # txtwrite doesn't work in these versions
-        self._pages, pdf = _pdf_get_all_pageinfo(
-            infile, detailed_page_analysis, progbar=progbar
-        )
+        self._pages, pdf = _pdf_get_all_pageinfo(infile, progbar=progbar)
        self._needs_rendering = pdf.root.get('/NeedsRendering', False)
        self._has_acroform = False
        if '/AcroForm' in pdf.root:
--- a/tests/cache/manifest.jsonl
+++ b/tests/cache/manifest.jsonl
@@ -69,3 +69,4 @@
 {"tesseract_version": "tesseract 4.1.0  leptonica-1.78.0   libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1  Found AVX2  Found AVX  Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
 {"tesseract_version": "tesseract 4.1.0  leptonica-1.78.0   libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1  Found AVX2  Found AVX  Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
 {"tesseract_version": "tesseract 4.1.0  leptonica-1.78.0   libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1  Found AVX2  Found AVX  Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
+{"tesseract_version": "tesseract 4.1.1  leptonica-1.79.0   libgif 5.2.1 : libjpeg 9d : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.1.0 : libopenjp2 2.3.1  Found AVX2  Found AVX  Found FMA  Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
--- a/tests/cache/multipage/-leng__000002_ocr.png__000002_ocr_hocrhocrtxt/hocr.bin
+++ b/tests/cache/multipage/-leng__000002_ocr.png__000002_ocr_hocrhocrtxt/hocr.bin
--- a/tests/cache/multipage/-leng__000002_ocr.png__000002_ocr_hocrhocrtxt/stderr.bin
+++ b/tests/cache/multipage/-leng__000002_ocr.png__000002_ocr_hocrhocrtxt/stderr.bin
--- a/tests/cache/multipage/-leng__000002_ocr.png__000002_ocr_hocrhocrtxt/stdout.bin
+++ b/tests/cache/multipage/-leng__000002_ocr.png__000002_ocr_hocrhocrtxt/stdout.bin
--- a/tests/cache/multipage/-leng__000002_ocr.png__000002_ocr_hocrhocrtxt/txt.bin
+++ b/tests/cache/multipage/-leng__000002_ocr.png__000002_ocr_hocrhocrtxt/txt.bin
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -101,10 +101,10 @@ def test_skip_ocr(spoof_tesseract_cache, resources, outpdf):

 def test_redo_ocr(resources, outpdf):
    in_ = resources / 'graph_ocred.pdf'
-    before = PdfInfo(in_, detailed_page_analysis=True)
+    before = PdfInfo(in_)
    out = outpdf
    out = check_ocrmypdf(in_, out, '--redo-ocr')
-    after = PdfInfo(out, detailed_page_analysis=True)
+    after = PdfInfo(out)
    assert before[0].has_text and after[0].has_text
    assert (
        before[0].get_textareas() != after[0].get_textareas()
--- a/tests/test_pdfinfo.py
+++ b/tests/test_pdfinfo.py
@@ -151,22 +151,6 @@ def test_pickle(resources):
    pickle.dumps(pdf)


-def test_regex():
-    rx = pdfinfo.ghosttext.regex_remove_char_tags
-
-    must_match = [
-        b'<char bbox="0 108 0 108" c="/"/>',
-        b'<char bbox="0 108 0 108" c=">"/>',
-        b'<char bbox="0 108 0 108" c="X"/>',
-    ]
-    must_not_match = [b'<span stuff="c">', b'<span>', b'</span>', b'</page>']
-
-    for s in must_match:
-        assert rx.match(s)
-    for s in must_not_match:
-        assert not rx.match(s)
-
-
 def test_vector(resources):
    filename = resources / 'vector.pdf'
    pdf = pdfinfo.PdfInfo(filename)
@@ -184,16 +168,9 @@ def test_ocr_detection(resources):
@pytest.mark.parametrize(
    'testfile', ('truetype_font_nomapping.pdf', 'type3_font_nomapping.pdf')
 )
-@pytest.mark.xfail(
-    ghostscript.version() in ('9.52',), reason="gs 9.52 txtwrite doesn't work"
-)
 def test_corrupt_font_detection(resources, testfile):
    filename = resources / testfile
-    with pytest.raises(NotImplementedError):
-        pdf = pdfinfo.PdfInfo(filename)
-        pdf[0].has_corrupt_text
-
-    pdf = pdfinfo.PdfInfo(filename, detailed_page_analysis=True)
+    pdf = pdfinfo.PdfInfo(filename)
    assert pdf[0].has_corrupt_text