From 991db17fdeb212f524e3be499c542b904b3e97af Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Sat, 11 Apr 2020 16:03:00 -0700
Subject: [PATCH] Remove Ghostscript-based text extraction

Ghostscript's text extraction is faster than the Python-based methods,
but we've outgrown the limited amount of information it provides, and it
repeats an analysis we have to do anyway to learn what images are
present.
---
 src/ocrmypdf/_pipeline.py                     |   6 +-
 src/ocrmypdf/_sync.py                         |   6 +-
 src/ocrmypdf/exec/ghostscript.py              |  49 ---------
 src/ocrmypdf/pdfinfo/ghosttext.py             | 102 ------------------
 src/ocrmypdf/pdfinfo/info.py                  |  58 +++-------
 tests/cache/manifest.jsonl                    |   1 +
 .../hocr.bin                                  | Bin 0 -> 1903 bytes
 .../stderr.bin                                | Bin 0 -> 55 bytes
 .../stdout.bin                                | Bin
 .../txt.bin                                   | Bin 0 -> 46 bytes
 tests/test_main.py                            |   4 +-
 tests/test_pdfinfo.py                         |  25 +----
 12 files changed, 23 insertions(+), 228 deletions(-)
 delete mode 100644 src/ocrmypdf/pdfinfo/ghosttext.py
 create mode 100644 tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/hocr.bin
 create mode 100644 tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stderr.bin
 create mode 100644 tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stdout.bin
 create mode 100644 tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/txt.bin
diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
index bb2f7e18..3be65faa 100644
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -147,11 +147,9 @@ def triage(original_filename, input_file, output_file, options):
     return output_file
 
 
-def get_pdfinfo(input_file, detailed_page_analysis=False, progbar=False):
+def get_pdfinfo(input_file, progbar=False):
     try:
-        return PdfInfo(
-            input_file, detailed_page_analysis=detailed_page_analysis, progbar=progbar
-        )
+        return PdfInfo(input_file, progbar=progbar)
     except pikepdf.PasswordError:
         raise EncryptedPdfError()
     except pikepdf.PdfError:
diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py
index f54c32a6..9f5c65d8 100644
--- a/src/ocrmypdf/_sync.py
+++ b/src/ocrmypdf/_sync.py
@@ -326,11 +326,7 @@ def run_pipeline(options, api=False):
         )
 
         # Gather pdfinfo and create context
-        pdfinfo = get_pdfinfo(
-            origin_pdf,
-            detailed_page_analysis=options.redo_ocr,
-            progbar=options.progress_bar,
-        )
+        pdfinfo = get_pdfinfo(origin_pdf, progbar=options.progress_bar)
 
         context = PDFContext(options, work_folder, origin_pdf, pdfinfo)
 
diff --git a/src/ocrmypdf/exec/ghostscript.py b/src/ocrmypdf/exec/ghostscript.py
index 5c27488f..44b81682 100644
--- a/src/ocrmypdf/exec/ghostscript.py
+++ b/src/ocrmypdf/exec/ghostscript.py
@@ -83,55 +83,6 @@ def _gs_error_reported(stream) -> bool:
     return re.search(r'error', stream, flags=re.IGNORECASE)
 
 
-def extract_text(input_file, pageno=1):
-    """Use the txtwrite device to get text layout information out
-
-    For details on options of -dTextFormat see
-    https://www.ghostscript.com/doc/current/VectorDevices.htm#TXT
-
-    Format is like
-    <page>
-    <line>
-    <span bbox="left top right bottom" font="..." size="...">
-    <char bbox="...." c="X"/>
-
-    :param pageno: number of page to extract, or all pages if None
-    :return: XML-ish text representation in bytes
-    """
-
-    if pageno is not None:
-        pages = ['-dFirstPage=%i' % pageno, '-dLastPage=%i' % pageno]
-    else:
-        pages = []
-
-    # Note due to bug https://bugs.ghostscript.com/show_bug.cgi?id=701971
-    # Ghostscript <= 9.50 will truncate output unless we write to stdout, so
-    # don't write to a file.
-    args_gs = (
-        [
-            GS,
-            '-dQUIET',
-            '-dSAFER',
-            '-dBATCH',
-            '-dNOPAUSE',
-            '-sDEVICE=txtwrite',
-            '-dTextFormat=0',
-        ]
-        + pages
-        + ['-o', '-', fspath(input_file), "-sstdout=%stderr"]
-    )
-
-    try:
-        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
-    except CalledProcessError as e:
-        raise SubprocessOutputError(
-            'Ghostscript text extraction failed\n%s\n%s'
-            % (input_file, e.stderr.decode(errors='replace'))
-        )
-
-    return p.stdout
-
-
 def rasterize_pdf(
     input_file: os.PathLike,
     output_file: os.PathLike,
diff --git a/src/ocrmypdf/pdfinfo/ghosttext.py b/src/ocrmypdf/pdfinfo/ghosttext.py
deleted file mode 100644
index 07e72f19..00000000
--- a/src/ocrmypdf/pdfinfo/ghosttext.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# © 2018 James R. Barlow: github.com/jbarlow83
-#
-# This file is part of OCRmyPDF.
-#
-# OCRmyPDF is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# OCRmyPDF is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
-
-import logging
-import re
-import xml.etree.ElementTree as ET
-
-from ..exec import ghostscript
-
-log = logging.getLogger(__name__)
-
-# Forgive me for I have sinned
-# I am using regular expressions to parse XML. However the XML in this case,
-# generated by Ghostscript, is self-consistent enough to be parseable.
-regex_remove_char_tags = re.compile(
-    br"""
-    <char\b
-    (?:   [^>]   # anything single character but >
-        | \">\"  # special case: trap ">"
-    )*
-    />           # terminate with '/>'
-""",
-    re.VERBOSE,
-)
-
-
-def page_get_textblocks(infile, pageno, xmltext, height):
-    """Get text boxes out of Ghostscript txtwrite xml"""
-
-    root = xmltext
-    if not hasattr(xmltext, 'findall'):
-        return []
-
-    def blocks():
-        for span in root.findall('.//span'):
-            bbox_str = span.attrib['bbox']
-            font_size = span.attrib['size']
-            pts = [int(pt) for pt in bbox_str.split()]
-            pts[1] = pts[1] - int(float(font_size) + 0.5)
-            bbox_topdown = tuple(pts)
-            bb = bbox_topdown
-            bbox_bottomup = (bb[0], height - bb[3], bb[2], height - bb[1])
-            yield bbox_bottomup
-
-    def joined_blocks():
-        prev = None
-        for bbox in blocks():
-            if prev is None:
-                prev = bbox
-            if bbox[1] == prev[1] and bbox[3] == prev[3]:
-                gap = prev[2] - bbox[0]
-                height = abs(bbox[3] - bbox[1])
-                if gap < height:
-                    # Join boxes
-                    prev = (prev[0], prev[1], bbox[2], bbox[3])
-                    continue
-            # yield previously joined bboxes and start anew
-            yield prev
-            prev = bbox
-        if prev is not None:
-            yield prev
-
-    return [block for block in joined_blocks()]
-
-
-def extract_text_xml(infile, pdf, pageno=None):
-    existing_text = ghostscript.extract_text(infile, pageno=None)
-    existing_text = regex_remove_char_tags.sub(b' ', existing_text)
-
-    try:
-        root = ET.fromstringlist([b'<document>\n', existing_text, b'</document>\n'])
-        page_xml = root.findall('page')
-    except ET.ParseError as e:
-        log.error(
-            "An error occurred while attempting to retrieve existing text in "
-            "the input file. Will attempt to continue assuming that there is "
-            "no existing text in the file. The error was:"
-        )
-        log.error(e)
-        page_xml = [None] * len(pdf.pages)
-
-    page_count_difference = len(pdf.pages) - len(page_xml)
-    if page_count_difference != 0:
-        log.error("The number of pages in the input file is inconsistent.")
-        log.error(f"Expected {len(pdf.pages)}, txtwrite says {len(page_xml)}")
-        if page_count_difference > 0:
-            page_xml.extend([None] * page_count_difference)
-    return page_xml
diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py
index bb232ee5..bd7641c4 100644
--- a/src/ocrmypdf/pdfinfo/info.py
+++ b/src/ocrmypdf/pdfinfo/info.py
@@ -33,9 +33,7 @@ from pikepdf import PdfMatrix
 
 from ocrmypdf._concurrent import exec_progress_pool
 from ocrmypdf.exceptions import EncryptedPdfError
-from ocrmypdf.exec import ghostscript
 from ocrmypdf.helpers import Resolution
-from ocrmypdf.pdfinfo import ghosttext
 from ocrmypdf.pdfinfo.layout import get_page_analysis, get_text_boxes
 
 logger = logging.getLogger()
@@ -557,7 +555,7 @@ def simplify_textboxes(miner, textbox_getter):
         yield TextboxInfo(box.bbox, visible, corrupt)
 
 
-def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str):
+def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike):
     pageinfo = {}
     pageinfo['pageno'] = pageno
     pageinfo['images'] = []
@@ -567,16 +565,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str):
     width_pt = mediabox[2] - mediabox[0]
     height_pt = mediabox[3] - mediabox[1]
 
-    if xmltext is not None:
-        bboxes = ghosttext.page_get_textblocks(
-            fspath(infile), pageno, xmltext=xmltext, height=height_pt
-        )
-        pageinfo['bboxes'] = bboxes
-    else:
-        pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5')
-        miner = get_page_analysis(infile, pageno, pscript5_mode)
-        pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
-        bboxes = (box.bbox for box in pageinfo['textboxes'])
+    pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5')
+    miner = get_page_analysis(infile, pageno, pscript5_mode)
+    pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
+    bboxes = (box.bbox for box in pageinfo['textboxes'])
 
     pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt)
 
@@ -624,12 +616,12 @@ def _pdf_pageinfo_sync_init(infile):
 
 def _pdf_pageinfo_sync(args):
     global worker_pdf  # pylint: disable=global-statement
-    pageno, infile, xmltext, detailed_analysis = args
-    page = PageInfo(worker_pdf, pageno, infile, xmltext, detailed_analysis)
+    pageno, infile = args
+    page = PageInfo(worker_pdf, pageno, infile)
     return page
 
 
-def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar):
+def _pdf_pageinfo_concurrent(pdf, infile, progbar):
     pages = [None] * len(pdf.pages)
 
     def update_pageinfo(result, pbar):
@@ -637,11 +629,7 @@ def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar)
         pages[page.pageno] = page
         pbar.update()
 
-    contexts = (
-        (n, infile, pages_xml[n] if pages_xml else None, detailed_analysis)
-        for n in range(len(pdf.pages))
-    )
-
+    contexts = ((n, infile) for n in range(len(pdf.pages)))
     if os.name == 'nt':
         # We can't parallelize on Windows, because Windows cannot fork.
         # We are trying to fork, then take advantage of the preloaded pikepdf.Pdf
@@ -668,19 +656,12 @@ def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar)
     return pages
 
 
-def _pdf_get_all_pageinfo(infile, detailed_analysis=False, progbar=False):
+def _pdf_get_all_pageinfo(infile, progbar=False):
     pdf = pikepdf.open(infile)  # Do not close in this function
     try:
         if pdf.is_encrypted:
             raise EncryptedPdfError()  # Triggered by encryption with empty passwd
-        if detailed_analysis:
-            pages_xml = None
-        else:
-            pages_xml = ghosttext.extract_text_xml(infile, pdf, pageno=None)
-
-        pages = _pdf_pageinfo_concurrent(
-            pdf, infile, pages_xml, detailed_analysis, progbar
-        )
+        pages = _pdf_pageinfo_concurrent(pdf, infile, progbar)
     except Exception:
         pdf.close()
         raise
@@ -689,11 +670,10 @@ def _pdf_get_all_pageinfo(infile, detailed_analysis=False, progbar=False):
 
 
 class PageInfo:
-    def __init__(self, pdf, pageno, infile, xmltext, detailed_analysis=False):
+    def __init__(self, pdf, pageno, infile):
         self._pageno = pageno
         self._infile = infile
-        self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile, xmltext)
-        self._detailed_analysis = detailed_analysis
+        self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile)
 
     @property
     def pageno(self):
@@ -705,8 +685,6 @@ class PageInfo:
 
     @property
     def has_corrupt_text(self):
-        if not self._detailed_analysis:
-            raise NotImplementedError('Did not do detailed analysis')
         return any(tbox.is_corrupt for tbox in self._pageinfo['textboxes'])
 
     @property
@@ -757,7 +735,7 @@ class PageInfo:
 
         if 'textboxes' not in self._pageinfo:
             if visible is not None and corrupt is not None:
-                raise NotImplementedError('Ghostscript textboxes cannot be classified')
+                raise NotImplementedError('Incomplete information on textboxes')
             return self._pageinfo['bboxes']
 
         return (
@@ -792,13 +770,9 @@ class PageInfo:
 class PdfInfo:
     """Get summary information about a PDF"""
 
-    def __init__(self, infile, detailed_page_analysis=False, progbar=False):
+    def __init__(self, infile, progbar=False):
         self._infile = infile
-        if ghostscript.version() in ('9.52',):
-            detailed_page_analysis = True  # txtwrite doesn't work in these versions
-        self._pages, pdf = _pdf_get_all_pageinfo(
-            infile, detailed_page_analysis, progbar=progbar
-        )
+        self._pages, pdf = _pdf_get_all_pageinfo(infile, progbar=progbar)
         self._needs_rendering = pdf.root.get('/NeedsRendering', False)
         self._has_acroform = False
         if '/AcroForm' in pdf.root:
diff --git a/tests/cache/manifest.jsonl b/tests/cache/manifest.jsonl
index 05a0e86c..23e50166 100644
--- a/tests/cache/manifest.jsonl
+++ b/tests/cache/manifest.jsonl
@@ -69,3 +69,4 @@
 {"tesseract_version": "tesseract 4.1.0  leptonica-1.78.0   libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1  Found AVX2  Found AVX  Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
 {"tesseract_version": "tesseract 4.1.0  leptonica-1.78.0   libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1  Found AVX2  Found AVX  Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
 {"tesseract_version": "tesseract 4.1.0  leptonica-1.78.0   libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1  Found AVX2  Found AVX  Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
+{"tesseract_version": "tesseract 4.1.1  leptonica-1.79.0   libgif 5.2.1 : libjpeg 9d : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.1.0 : libopenjp2 2.3.1  Found AVX2  Found AVX  Found FMA  Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
diff --git a/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/hocr.bin b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/hocr.bin
new file mode 100644
index 0000000000000000000000000000000000000000..27e769f68d186d9b136b8d6835bafd0400cf1625
GIT binary patch
literal 1903
zcmb7FTW^~%6n^ioIOBz<V9cErn6!2`X<Mo()3%2xGT;(6Zo_Vq?$__wBrVA*Z6y*M
z4E`?1A5QXO*EVp=M^X1(YEa8IfOkb-)?JkvpI5IJk?|rsOP*aWFIL~KUcpA<`&XY|
zzJGTC#)6P9?gb&2t4sL$cJ<*sAONgJ)`?moAZrMD_0c#3JdBN$!*fEm+pV>At$wV?
z>J!;1JgNZp<wDM8Tcs=wWY?r9v=iwgFf@+iX|@6Q(({HX?YuMKzRJ!Z*>G0k6%tv?
zhGz+xu4<&^lA$v>EcmaR`ZhH#`cCprE>?E~H=vmAQbY2cBudPAv0<a&GQE-4*dd~E
zHA=@?o|=6zF2tRXyfyD<o01E`M^;GiENW2`9-UUOf#r3B{o`Ub84=e5t2n5lU?XR`
z8rZ0>P2K57yY;poOEomCx2XHtR3r)6n@{q-ywhUK`WA|Y38BSmXcOu(H5JJk#p&*(
zX5A`MgWR%_T=z|hNg<9Pp|l;Rt$x@WzZk~+mtD%7{uHf^N7a!PDf-r`YPq?|Efmw<
z4dt~JIKl8E_Z_0%vYj<Pu!gQO&LPkH9oYDz4nCRA=P)QOT@SXYBtvqdS$AO)oi}~)
zb4?#~8yb<?z7KH}fFDH>(Aak{O)8AZaBO}&cpoPa9UPN))u9jnz=K2#tea!11(`rq
z^{BL9Ce8Ii>;e;9FXY0uY@5O&w9jF;7WHoqZV+0&x+}RTc&F9@!e|;|4<oka?=7Oq
z6HQY#CQ&LSb5Z7@UnnZ3;KY%Ks}Q0noTYLEl$zPM<#M_Eif4P75QRQ<^S`K^Ln|jL
zm%5Jj@7azERQo?r#Yi>OJKl&Vv$)4B9t|)}s+XSUg(r^Uz|5F@GKF_S;fGVte22!T
z46d2ISw6j2{xOB`2J;v@q5H5bv14YJSMQEo9N(OzVEtDz?^k9Pa_UDRxSsFmuVxla
z8&j$2_o)p2%A^{YC-NzI433TK@0a>9s&HcrWE92^&fU<=`sMc{v;UL%@Ps3v7>pCU
mha?8r-IiDEku=VE6EaLY6u~V@Rmr|c$h6_&28{n2+1Vd+pFgSq

literal 0
HcmV?d00001

diff --git a/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stderr.bin b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stderr.bin
new file mode 100644
index 0000000000000000000000000000000000000000..16b617e5fc5dc7bd2ed182f08ade8647ae1cf593
GIT binary patch
literal 55
zcmWGZEiO(iN=z<M@GnTsQwYv4ElN&R@OKVUaLr54%u7`$Gto2DGgK(gEXh#tNi8VJ
K&&y0s<N^Q}oD!J;

literal 0
HcmV?d00001

diff --git a/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stdout.bin b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stdout.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/txt.bin b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/txt.bin
new file mode 100644
index 0000000000000000000000000000000000000000..21e1e99537e6a5922ac5a865ffee1fd8ce9380d0
GIT binary patch
literal 46
xcma#__xG=ei1hb2@K-3vtVqo%R!A&SaQF9Da0&3_%J+{|$W2X8gvfC5003Tn4#NNd

literal 0
HcmV?d00001

diff --git a/tests/test_main.py b/tests/test_main.py
index d4146ccf..9b0cd17e 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -101,10 +101,10 @@ def test_skip_ocr(spoof_tesseract_cache, resources, outpdf):
 
 def test_redo_ocr(resources, outpdf):
     in_ = resources / 'graph_ocred.pdf'
-    before = PdfInfo(in_, detailed_page_analysis=True)
+    before = PdfInfo(in_)
     out = outpdf
     out = check_ocrmypdf(in_, out, '--redo-ocr')
-    after = PdfInfo(out, detailed_page_analysis=True)
+    after = PdfInfo(out)
     assert before[0].has_text and after[0].has_text
     assert (
         before[0].get_textareas() != after[0].get_textareas()
diff --git a/tests/test_pdfinfo.py b/tests/test_pdfinfo.py
index 13fb8a8b..cfa90d94 100644
--- a/tests/test_pdfinfo.py
+++ b/tests/test_pdfinfo.py
@@ -151,22 +151,6 @@ def test_pickle(resources):
     pickle.dumps(pdf)
 
 
-def test_regex():
-    rx = pdfinfo.ghosttext.regex_remove_char_tags
-
-    must_match = [
-        b'<char bbox="0 108 0 108" c="/"/>',
-        b'<char bbox="0 108 0 108" c=">"/>',
-        b'<char bbox="0 108 0 108" c="X"/>',
-    ]
-    must_not_match = [b'<span stuff="c">', b'<span>', b'</span>', b'</page>']
-
-    for s in must_match:
-        assert rx.match(s)
-    for s in must_not_match:
-        assert not rx.match(s)
-
-
 def test_vector(resources):
     filename = resources / 'vector.pdf'
     pdf = pdfinfo.PdfInfo(filename)
@@ -184,16 +168,9 @@ def test_ocr_detection(resources):
 @pytest.mark.parametrize(
     'testfile', ('truetype_font_nomapping.pdf', 'type3_font_nomapping.pdf')
 )
-@pytest.mark.xfail(
-    ghostscript.version() in ('9.52',), reason="gs 9.52 txtwrite doesn't work"
-)
 def test_corrupt_font_detection(resources, testfile):
     filename = resources / testfile
-    with pytest.raises(NotImplementedError):
-        pdf = pdfinfo.PdfInfo(filename)
-        pdf[0].has_corrupt_text
-
-    pdf = pdfinfo.PdfInfo(filename, detailed_page_analysis=True)
+    pdf = pdfinfo.PdfInfo(filename)
     assert pdf[0].has_corrupt_text