Remove Ghostscript-based text extraction

While faster than Python-based methods, this feature provides only a
limited amount of information, which we have outgrown, and it repeats an
analysis we have to do anyway to learn what images are present.
This commit is contained in:
James R. Barlow
2020-04-11 16:03:00 -07:00
parent 2c07515907
commit 991db17fde
12 changed files with 23 additions and 228 deletions

View File

@@ -147,11 +147,9 @@ def triage(original_filename, input_file, output_file, options):
return output_file
def get_pdfinfo(input_file, detailed_page_analysis=False, progbar=False):
def get_pdfinfo(input_file, progbar=False):
try:
return PdfInfo(
input_file, detailed_page_analysis=detailed_page_analysis, progbar=progbar
)
return PdfInfo(input_file, progbar=progbar)
except pikepdf.PasswordError:
raise EncryptedPdfError()
except pikepdf.PdfError:

View File

@@ -326,11 +326,7 @@ def run_pipeline(options, api=False):
)
# Gather pdfinfo and create context
pdfinfo = get_pdfinfo(
origin_pdf,
detailed_page_analysis=options.redo_ocr,
progbar=options.progress_bar,
)
pdfinfo = get_pdfinfo(origin_pdf, progbar=options.progress_bar)
context = PDFContext(options, work_folder, origin_pdf, pdfinfo)

View File

@@ -83,55 +83,6 @@ def _gs_error_reported(stream) -> bool:
return re.search(r'error', stream, flags=re.IGNORECASE)
def extract_text(input_file, pageno=1):
    """Run Ghostscript's txtwrite device and return what it prints.

    The device is invoked with ``-dTextFormat=0``. For details on the
    options of -dTextFormat see
    https://www.ghostscript.com/doc/current/VectorDevices.htm#TXT

    The output is structured like::

        <page>
        <line>
        <span bbox="left top right bottom" font="..." size="...">
        <char bbox="...." c="X"/>

    :param input_file: path of the file handed to Ghostscript
    :param pageno: number of page to extract, or all pages if None
    :return: XML-ish text representation in bytes
    :raises SubprocessOutputError: if the Ghostscript call raises
        CalledProcessError; the message carries the decoded stderr
    """
    page_range = (
        []
        if pageno is None
        else ['-dFirstPage=%i' % pageno, '-dLastPage=%i' % pageno]
    )

    # Output is sent to stdout ('-o -') instead of a file on purpose: due to
    # bug https://bugs.ghostscript.com/show_bug.cgi?id=701971 Ghostscript
    # <= 9.50 will truncate output unless we write to stdout.
    command = [
        GS,
        '-dQUIET',
        '-dSAFER',
        '-dBATCH',
        '-dNOPAUSE',
        '-sDEVICE=txtwrite',
        '-dTextFormat=0',
    ]
    command.extend(page_range)
    command.extend(['-o', '-', fspath(input_file), "-sstdout=%stderr"])

    try:
        completed = run(command, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        raise SubprocessOutputError(
            'Ghostscript text extraction failed\n%s\n%s'
            % (input_file, e.stderr.decode(errors='replace'))
        )
    return completed.stdout
def rasterize_pdf(
input_file: os.PathLike,
output_file: os.PathLike,

View File

@@ -1,102 +0,0 @@
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import logging
import re
import xml.etree.ElementTree as ET
from ..exec import ghostscript
log = logging.getLogger(__name__)
# Forgive me for I have sinned
# I am using regular expressions to parse XML. However the XML in this case,
# generated by Ghostscript, is self-consistent enough to be parseable.
#
# The pattern (bytes, verbose mode) matches one complete self-closing
# <char ... /> element. The second alternative exists so that an attribute
# value consisting of the quoted character ">" does not end the match early.
regex_remove_char_tags = re.compile(
br"""
<char\b
(?: [^>] # anything single character but >
| \">\" # special case: trap ">"
)*
/> # terminate with '/>'
""",
re.VERBOSE,
)
def page_get_textblocks(infile, pageno, xmltext, height):
    """Get text boxes out of Ghostscript txtwrite xml.

    :param infile: not used by this function; kept for interface compatibility
    :param pageno: not used by this function; kept for interface compatibility
    :param xmltext: object providing ``findall`` (e.g. an ElementTree element)
        that contains the ``<span>`` elements; anything without ``findall``
        (such as None) produces an empty result
    :param height: page height, used to flip the top-down span coordinates
        into bottom-up coordinates
    :return: list of 4-tuples ``(x0, y0, x1, y1)`` in bottom-up coordinates,
        where consecutive spans with identical vertical extent have been
        merged into one box
    """
    if not hasattr(xmltext, 'findall'):
        return []

    def blocks():
        # Convert every <span> bbox to a bottom-up tuple.
        for span in xmltext.findall('.//span'):
            pts = [int(pt) for pt in span.attrib['bbox'].split()]
            # Move the second coordinate up by the font size, rounded to
            # the nearest integer.
            pts[1] -= int(float(span.attrib['size']) + 0.5)
            yield (pts[0], height - pts[3], pts[2], height - pts[1])

    def joined_blocks():
        prev = None
        for bbox in blocks():
            if prev is None:
                # First box only seeds the accumulator. Falling through here
                # would emit it once in the loop and once more in the final
                # flush below, duplicating the box.
                prev = bbox
                continue
            if bbox[1] == prev[1] and bbox[3] == prev[3]:
                # NOTE(review): gap is computed as prev.right - bbox.left, so
                # it is negative whenever bbox starts to the right of prev;
                # confirm this sign is intended before changing it.
                gap = prev[2] - bbox[0]
                line_height = abs(bbox[3] - bbox[1])
                if gap < line_height:
                    # Join boxes
                    prev = (prev[0], prev[1], bbox[2], bbox[3])
                    continue
            # yield previously joined bboxes and start anew
            yield prev
            prev = bbox
        if prev is not None:
            yield prev

    return list(joined_blocks())
def extract_text_xml(infile, pdf, pageno=None):
    """Collect the txtwrite ``<page>`` elements for a whole document.

    Ghostscript is always asked for every page; the ``pageno`` argument is
    accepted but not forwarded. ``<char>`` elements are replaced by a single
    space before the output is wrapped in a ``<document>`` root and parsed.

    :param infile: path passed on to ``ghostscript.extract_text``
    :param pdf: object whose ``pages`` length gives the expected page count
    :param pageno: ignored
    :return: list of ``<page>`` elements; if parsing fails the list holds
        one None per page of *pdf*, and if fewer pages were found than
        *pdf* has, the list is padded with None up to that count
    """
    raw_xml = ghostscript.extract_text(infile, pageno=None)
    stripped_xml = regex_remove_char_tags.sub(b' ', raw_xml)

    try:
        document = ET.fromstringlist(
            [b'<document>\n', stripped_xml, b'</document>\n']
        )
        page_xml = document.findall('page')
    except ET.ParseError as e:
        log.error(
            "An error occurred while attempting to retrieve existing text in "
            "the input file. Will attempt to continue assuming that there is "
            "no existing text in the file. The error was:"
        )
        log.error(e)
        page_xml = [None] * len(pdf.pages)

    missing = len(pdf.pages) - len(page_xml)
    if missing:
        log.error("The number of pages in the input file is inconsistent.")
        log.error(f"Expected {len(pdf.pages)}, txtwrite says {len(page_xml)}")
        if missing > 0:
            # Pad so that every page index of the PDF has an entry.
            page_xml += [None] * missing
    return page_xml

View File

@@ -33,9 +33,7 @@ from pikepdf import PdfMatrix
from ocrmypdf._concurrent import exec_progress_pool
from ocrmypdf.exceptions import EncryptedPdfError
from ocrmypdf.exec import ghostscript
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo import ghosttext
from ocrmypdf.pdfinfo.layout import get_page_analysis, get_text_boxes
logger = logging.getLogger()
@@ -557,7 +555,7 @@ def simplify_textboxes(miner, textbox_getter):
yield TextboxInfo(box.bbox, visible, corrupt)
def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str):
def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike):
pageinfo = {}
pageinfo['pageno'] = pageno
pageinfo['images'] = []
@@ -567,16 +565,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str):
width_pt = mediabox[2] - mediabox[0]
height_pt = mediabox[3] - mediabox[1]
if xmltext is not None:
bboxes = ghosttext.page_get_textblocks(
fspath(infile), pageno, xmltext=xmltext, height=height_pt
)
pageinfo['bboxes'] = bboxes
else:
pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5')
miner = get_page_analysis(infile, pageno, pscript5_mode)
pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
bboxes = (box.bbox for box in pageinfo['textboxes'])
pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5')
miner = get_page_analysis(infile, pageno, pscript5_mode)
pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
bboxes = (box.bbox for box in pageinfo['textboxes'])
pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt)
@@ -624,12 +616,12 @@ def _pdf_pageinfo_sync_init(infile):
def _pdf_pageinfo_sync(args):
global worker_pdf # pylint: disable=global-statement
pageno, infile, xmltext, detailed_analysis = args
page = PageInfo(worker_pdf, pageno, infile, xmltext, detailed_analysis)
pageno, infile = args
page = PageInfo(worker_pdf, pageno, infile)
return page
def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar):
def _pdf_pageinfo_concurrent(pdf, infile, progbar):
pages = [None] * len(pdf.pages)
def update_pageinfo(result, pbar):
@@ -637,11 +629,7 @@ def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar)
pages[page.pageno] = page
pbar.update()
contexts = (
(n, infile, pages_xml[n] if pages_xml else None, detailed_analysis)
for n in range(len(pdf.pages))
)
contexts = ((n, infile) for n in range(len(pdf.pages)))
if os.name == 'nt':
# We can't parallelize on Windows, because Windows cannot fork.
# We are trying to fork, then take advantage of the preloaded pikepdf.Pdf
@@ -668,19 +656,12 @@ def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar)
return pages
def _pdf_get_all_pageinfo(infile, detailed_analysis=False, progbar=False):
def _pdf_get_all_pageinfo(infile, progbar=False):
pdf = pikepdf.open(infile) # Do not close in this function
try:
if pdf.is_encrypted:
raise EncryptedPdfError() # Triggered by encryption with empty passwd
if detailed_analysis:
pages_xml = None
else:
pages_xml = ghosttext.extract_text_xml(infile, pdf, pageno=None)
pages = _pdf_pageinfo_concurrent(
pdf, infile, pages_xml, detailed_analysis, progbar
)
pages = _pdf_pageinfo_concurrent(pdf, infile, progbar)
except Exception:
pdf.close()
raise
@@ -689,11 +670,10 @@ def _pdf_get_all_pageinfo(infile, detailed_analysis=False, progbar=False):
class PageInfo:
def __init__(self, pdf, pageno, infile, xmltext, detailed_analysis=False):
def __init__(self, pdf, pageno, infile):
self._pageno = pageno
self._infile = infile
self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile, xmltext)
self._detailed_analysis = detailed_analysis
self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile)
@property
def pageno(self):
@@ -705,8 +685,6 @@ class PageInfo:
@property
def has_corrupt_text(self):
if not self._detailed_analysis:
raise NotImplementedError('Did not do detailed analysis')
return any(tbox.is_corrupt for tbox in self._pageinfo['textboxes'])
@property
@@ -757,7 +735,7 @@ class PageInfo:
if 'textboxes' not in self._pageinfo:
if visible is not None and corrupt is not None:
raise NotImplementedError('Ghostscript textboxes cannot be classified')
raise NotImplementedError('Incomplete information on textboxes')
return self._pageinfo['bboxes']
return (
@@ -792,13 +770,9 @@ class PageInfo:
class PdfInfo:
"""Get summary information about a PDF"""
def __init__(self, infile, detailed_page_analysis=False, progbar=False):
def __init__(self, infile, progbar=False):
self._infile = infile
if ghostscript.version() in ('9.52',):
detailed_page_analysis = True # txtwrite doesn't work in these versions
self._pages, pdf = _pdf_get_all_pageinfo(
infile, detailed_page_analysis, progbar=progbar
)
self._pages, pdf = _pdf_get_all_pageinfo(infile, progbar=progbar)
self._needs_rendering = pdf.root.get('/NeedsRendering', False)
self._has_acroform = False
if '/AcroForm' in pdf.root:

View File

@@ -69,3 +69,4 @@
{"tesseract_version": "tesseract 4.1.0 leptonica-1.78.0 libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "tesseract 4.1.0 leptonica-1.78.0 libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "tesseract 4.1.0 leptonica-1.78.0 libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "tesseract 4.1.1 leptonica-1.79.0 libgif 5.2.1 : libjpeg 9d : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.1.0 : libopenjp2 2.3.1 Found AVX2 Found AVX Found FMA Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

@@ -101,10 +101,10 @@ def test_skip_ocr(spoof_tesseract_cache, resources, outpdf):
def test_redo_ocr(resources, outpdf):
in_ = resources / 'graph_ocred.pdf'
before = PdfInfo(in_, detailed_page_analysis=True)
before = PdfInfo(in_)
out = outpdf
out = check_ocrmypdf(in_, out, '--redo-ocr')
after = PdfInfo(out, detailed_page_analysis=True)
after = PdfInfo(out)
assert before[0].has_text and after[0].has_text
assert (
before[0].get_textareas() != after[0].get_textareas()

View File

@@ -151,22 +151,6 @@ def test_pickle(resources):
pickle.dumps(pdf)
def test_regex():
    """The <char> stripping pattern matches <char/> tags and nothing else."""
    pattern = pdfinfo.ghosttext.regex_remove_char_tags

    char_tags = (
        b'<char bbox="0 108 0 108" c="/"/>',
        b'<char bbox="0 108 0 108" c=">"/>',
        b'<char bbox="0 108 0 108" c="X"/>',
    )
    for tag in char_tags:
        assert pattern.match(tag)

    other_tags = (b'<span stuff="c">', b'<span>', b'</span>', b'</page>')
    for tag in other_tags:
        assert pattern.match(tag) is None
def test_vector(resources):
filename = resources / 'vector.pdf'
pdf = pdfinfo.PdfInfo(filename)
@@ -184,16 +168,9 @@ def test_ocr_detection(resources):
@pytest.mark.parametrize(
'testfile', ('truetype_font_nomapping.pdf', 'type3_font_nomapping.pdf')
)
@pytest.mark.xfail(
ghostscript.version() in ('9.52',), reason="gs 9.52 txtwrite doesn't work"
)
def test_corrupt_font_detection(resources, testfile):
filename = resources / testfile
with pytest.raises(NotImplementedError):
pdf = pdfinfo.PdfInfo(filename)
pdf[0].has_corrupt_text
pdf = pdfinfo.PdfInfo(filename, detailed_page_analysis=True)
pdf = pdfinfo.PdfInfo(filename)
assert pdf[0].has_corrupt_text