diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index bb2f7e18..3be65faa 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -147,11 +147,9 @@ def triage(original_filename, input_file, output_file, options): return output_file -def get_pdfinfo(input_file, detailed_page_analysis=False, progbar=False): +def get_pdfinfo(input_file, progbar=False): try: - return PdfInfo( - input_file, detailed_page_analysis=detailed_page_analysis, progbar=progbar - ) + return PdfInfo(input_file, progbar=progbar) except pikepdf.PasswordError: raise EncryptedPdfError() except pikepdf.PdfError: diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index f54c32a6..9f5c65d8 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -326,11 +326,7 @@ def run_pipeline(options, api=False): ) # Gather pdfinfo and create context - pdfinfo = get_pdfinfo( - origin_pdf, - detailed_page_analysis=options.redo_ocr, - progbar=options.progress_bar, - ) + pdfinfo = get_pdfinfo(origin_pdf, progbar=options.progress_bar) context = PDFContext(options, work_folder, origin_pdf, pdfinfo) diff --git a/src/ocrmypdf/exec/ghostscript.py b/src/ocrmypdf/exec/ghostscript.py index 5c27488f..44b81682 100644 --- a/src/ocrmypdf/exec/ghostscript.py +++ b/src/ocrmypdf/exec/ghostscript.py @@ -83,55 +83,6 @@ def _gs_error_reported(stream) -> bool: return re.search(r'error', stream, flags=re.IGNORECASE) -def extract_text(input_file, pageno=1): - """Use the txtwrite device to get text layout information out - - For details on options of -dTextFormat see - https://www.ghostscript.com/doc/current/VectorDevices.htm#TXT - - Format is like - - - - - - :param pageno: number of page to extract, or all pages if None - :return: XML-ish text representation in bytes - """ - - if pageno is not None: - pages = ['-dFirstPage=%i' % pageno, '-dLastPage=%i' % pageno] - else: - pages = [] - - # Note due to bug https://bugs.ghostscript.com/show_bug.cgi?id=701971 - # Ghostscript <= 9.50 will truncate output unless we write to stdout, so - # don't write to a file. - args_gs = ( - [ - GS, - '-dQUIET', - '-dSAFER', - '-dBATCH', - '-dNOPAUSE', - '-sDEVICE=txtwrite', - '-dTextFormat=0', - ] - + pages - + ['-o', '-', fspath(input_file), "-sstdout=%stderr"] - ) - - try: - p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True) - except CalledProcessError as e: - raise SubprocessOutputError( - 'Ghostscript text extraction failed\n%s\n%s' - % (input_file, e.stderr.decode(errors='replace')) - ) - - return p.stdout - - def rasterize_pdf( input_file: os.PathLike, output_file: os.PathLike, diff --git a/src/ocrmypdf/pdfinfo/ghosttext.py b/src/ocrmypdf/pdfinfo/ghosttext.py deleted file mode 100644 index 07e72f19..00000000 --- a/src/ocrmypdf/pdfinfo/ghosttext.py +++ /dev/null @@ -1,102 +0,0 @@ -# © 2018 James R. Barlow: github.com/jbarlow83 -# -# This file is part of OCRmyPDF. -# -# OCRmyPDF is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# OCRmyPDF is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with OCRmyPDF. If not, see . - -import logging -import re -import xml.etree.ElementTree as ET - -from ..exec import ghostscript - -log = logging.getLogger(__name__) - -# Forgive me for I have sinned -# I am using regular expressions to parse XML. However the XML in this case, -# generated by Ghostscript, is self-consistent enough to be parseable. -regex_remove_char_tags = re.compile( - br""" - ] # anything single character but > - | \">\" # special case: trap ">" - )* - /> # terminate with '/>' -""", - re.VERBOSE, -) - - -def page_get_textblocks(infile, pageno, xmltext, height): - """Get text boxes out of Ghostscript txtwrite xml""" - - root = xmltext - if not hasattr(xmltext, 'findall'): - return [] - - def blocks(): - for span in root.findall('.//span'): - bbox_str = span.attrib['bbox'] - font_size = span.attrib['size'] - pts = [int(pt) for pt in bbox_str.split()] - pts[1] = pts[1] - int(float(font_size) + 0.5) - bbox_topdown = tuple(pts) - bb = bbox_topdown - bbox_bottomup = (bb[0], height - bb[3], bb[2], height - bb[1]) - yield bbox_bottomup - - def joined_blocks(): - prev = None - for bbox in blocks(): - if prev is None: - prev = bbox - if bbox[1] == prev[1] and bbox[3] == prev[3]: - gap = prev[2] - bbox[0] - height = abs(bbox[3] - bbox[1]) - if gap < height: - # Join boxes - prev = (prev[0], prev[1], bbox[2], bbox[3]) - continue - # yield previously joined bboxes and start anew - yield prev - prev = bbox - if prev is not None: - yield prev - - return [block for block in joined_blocks()] - - -def extract_text_xml(infile, pdf, pageno=None): - existing_text = ghostscript.extract_text(infile, pageno=None) - existing_text = regex_remove_char_tags.sub(b' ', existing_text) - - try: - root = ET.fromstringlist([b'\n', existing_text, b'\n']) - page_xml = root.findall('page') - except ET.ParseError as e: - log.error( - "An error occurred while attempting to retrieve existing text in " - "the input file. Will attempt to continue assuming that there is " - "no existing text in the file. The error was:" - ) - log.error(e) - page_xml = [None] * len(pdf.pages) - - page_count_difference = len(pdf.pages) - len(page_xml) - if page_count_difference != 0: - log.error("The number of pages in the input file is inconsistent.") - log.error(f"Expected {len(pdf.pages)}, txtwrite says {len(page_xml)}") - if page_count_difference > 0: - page_xml.extend([None] * page_count_difference) - return page_xml diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py index bb232ee5..bd7641c4 100644 --- a/src/ocrmypdf/pdfinfo/info.py +++ b/src/ocrmypdf/pdfinfo/info.py @@ -33,9 +33,7 @@ from pikepdf import PdfMatrix from ocrmypdf._concurrent import exec_progress_pool from ocrmypdf.exceptions import EncryptedPdfError -from ocrmypdf.exec import ghostscript from ocrmypdf.helpers import Resolution -from ocrmypdf.pdfinfo import ghosttext from ocrmypdf.pdfinfo.layout import get_page_analysis, get_text_boxes logger = logging.getLogger() @@ -557,7 +555,7 @@ def simplify_textboxes(miner, textbox_getter): yield TextboxInfo(box.bbox, visible, corrupt) -def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str): +def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike): pageinfo = {} pageinfo['pageno'] = pageno pageinfo['images'] = [] @@ -567,16 +565,10 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str): width_pt = mediabox[2] - mediabox[0] height_pt = mediabox[3] - mediabox[1] - if xmltext is not None: - bboxes = ghosttext.page_get_textblocks( - fspath(infile), pageno, xmltext=xmltext, height=height_pt - ) - pageinfo['bboxes'] = bboxes - else: - pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5') - miner = get_page_analysis(infile, pageno, pscript5_mode) - pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes)) - bboxes = (box.bbox for box in pageinfo['textboxes']) + pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5') + miner = get_page_analysis(infile, pageno, pscript5_mode) + pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes)) + bboxes = (box.bbox for box in pageinfo['textboxes']) pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt) @@ -624,12 +616,12 @@ def _pdf_pageinfo_sync_init(infile): def _pdf_pageinfo_sync(args): global worker_pdf # pylint: disable=global-statement - pageno, infile, xmltext, detailed_analysis = args - page = PageInfo(worker_pdf, pageno, infile, xmltext, detailed_analysis) + pageno, infile = args + page = PageInfo(worker_pdf, pageno, infile) return page -def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar): +def _pdf_pageinfo_concurrent(pdf, infile, progbar): pages = [None] * len(pdf.pages) def update_pageinfo(result, pbar): @@ -637,11 +629,7 @@ def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar) pages[page.pageno] = page pbar.update() - contexts = ( - (n, infile, pages_xml[n] if pages_xml else None, detailed_analysis) - for n in range(len(pdf.pages)) - ) - + contexts = ((n, infile) for n in range(len(pdf.pages))) if os.name == 'nt': # We can't parallelize on Windows, because Windows cannot fork. # We are trying to fork, then take advantage of the preloaded pikepdf.Pdf @@ -668,19 +656,12 @@ def _pdf_pageinfo_concurrent(pdf, infile, pages_xml, detailed_analysis, progbar) return pages -def _pdf_get_all_pageinfo(infile, detailed_analysis=False, progbar=False): +def _pdf_get_all_pageinfo(infile, progbar=False): pdf = pikepdf.open(infile) # Do not close in this function try: if pdf.is_encrypted: raise EncryptedPdfError() # Triggered by encryption with empty passwd - if detailed_analysis: - pages_xml = None - else: - pages_xml = ghosttext.extract_text_xml(infile, pdf, pageno=None) - - pages = _pdf_pageinfo_concurrent( - pdf, infile, pages_xml, detailed_analysis, progbar - ) + pages = _pdf_pageinfo_concurrent(pdf, infile, progbar) except Exception: pdf.close() raise @@ -689,11 +670,10 @@ def _pdf_get_all_pageinfo(infile, detailed_analysis=False, progbar=False): class PageInfo: - def __init__(self, pdf, pageno, infile, xmltext, detailed_analysis=False): + def __init__(self, pdf, pageno, infile): self._pageno = pageno self._infile = infile - self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile, xmltext) - self._detailed_analysis = detailed_analysis + self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile) @property def pageno(self): @@ -705,8 +685,6 @@ class PageInfo: @property def has_corrupt_text(self): - if not self._detailed_analysis: - raise NotImplementedError('Did not do detailed analysis') return any(tbox.is_corrupt for tbox in self._pageinfo['textboxes']) @property @@ -757,7 +735,7 @@ class PageInfo: if 'textboxes' not in self._pageinfo: if visible is not None and corrupt is not None: - raise NotImplementedError('Ghostscript textboxes cannot be classified') + raise NotImplementedError('Incomplete information on textboxes') return self._pageinfo['bboxes'] return ( @@ -792,13 +770,9 @@ class PageInfo: class PdfInfo: """Get summary information about a PDF""" - def __init__(self, infile, detailed_page_analysis=False, progbar=False): + def __init__(self, infile, progbar=False): self._infile = infile - if ghostscript.version() in ('9.52',): - detailed_page_analysis = True # txtwrite doesn't work in these versions - self._pages, pdf = _pdf_get_all_pageinfo( - infile, detailed_page_analysis, progbar=progbar - ) + self._pages, pdf = _pdf_get_all_pageinfo(infile, progbar=progbar) self._needs_rendering = pdf.root.get('/NeedsRendering', False) self._has_acroform = False if '/AcroForm' in pdf.root: diff --git a/tests/cache/manifest.jsonl b/tests/cache/manifest.jsonl index 05a0e86c..23e50166 100644 --- a/tests/cache/manifest.jsonl +++ b/tests/cache/manifest.jsonl @@ -69,3 +69,4 @@ {"tesseract_version": "tesseract 4.1.0 leptonica-1.78.0 libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]} {"tesseract_version": "tesseract 4.1.0 leptonica-1.78.0 libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]} {"tesseract_version": "tesseract 4.1.0 leptonica-1.78.0 libgif 5.1.4 : libjpeg 9c : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.0.3 : libopenjp2 2.3.1 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.5", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.1.1 leptonica-1.79.0 libgif 5.2.1 : libjpeg 9d : libpng 1.6.37 : libtiff 4.1.0 : zlib 1.2.11 : libwebp 1.1.0 : libopenjp2 2.3.1 Found AVX2 Found AVX Found FMA Found SSE ", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]} diff --git a/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/hocr.bin b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/hocr.bin new file mode 100644 index 00000000..27e769f6 Binary files /dev/null and b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/hocr.bin differ diff --git a/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stderr.bin b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stderr.bin new file mode 100644 index 00000000..16b617e5 Binary files /dev/null and b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stderr.bin differ diff --git a/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stdout.bin b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/stdout.bin new file mode 100644 index 00000000..e69de29b diff --git a/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/txt.bin b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/txt.bin new file mode 100644 index 00000000..21e1e995 Binary files /dev/null and b/tests/cache/multipage/__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt/txt.bin differ diff --git a/tests/test_main.py b/tests/test_main.py index d4146ccf..9b0cd17e 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -101,10 +101,10 @@ def test_skip_ocr(spoof_tesseract_cache, resources, outpdf): def test_redo_ocr(resources, outpdf): in_ = resources / 'graph_ocred.pdf' - before = PdfInfo(in_, detailed_page_analysis=True) + before = PdfInfo(in_) out = outpdf out = check_ocrmypdf(in_, out, '--redo-ocr') - after = PdfInfo(out, detailed_page_analysis=True) + after = PdfInfo(out) assert before[0].has_text and after[0].has_text assert ( before[0].get_textareas() != after[0].get_textareas() diff --git a/tests/test_pdfinfo.py b/tests/test_pdfinfo.py index 13fb8a8b..cfa90d94 100644 --- a/tests/test_pdfinfo.py +++ b/tests/test_pdfinfo.py @@ -151,22 +151,6 @@ def test_pickle(resources): pickle.dumps(pdf) -def test_regex(): - rx = pdfinfo.ghosttext.regex_remove_char_tags - - must_match = [ - b'', - b'', - b'', - ] - must_not_match = [b'', b'', b'', b''] - - for s in must_match: - assert rx.match(s) - for s in must_not_match: - assert not rx.match(s) - - def test_vector(resources): filename = resources / 'vector.pdf' pdf = pdfinfo.PdfInfo(filename) @@ -184,16 +168,9 @@ def test_ocr_detection(resources): @pytest.mark.parametrize( 'testfile', ('truetype_font_nomapping.pdf', 'type3_font_nomapping.pdf') ) -@pytest.mark.xfail( - ghostscript.version() in ('9.52',), reason="gs 9.52 txtwrite doesn't work" -) def test_corrupt_font_detection(resources, testfile): filename = resources / testfile - with pytest.raises(NotImplementedError): - pdf = pdfinfo.PdfInfo(filename) - pdf[0].has_corrupt_text - - pdf = pdfinfo.PdfInfo(filename, detailed_page_analysis=True) + pdf = pdfinfo.PdfInfo(filename) assert pdf[0].has_corrupt_text