From cfd4f8a850aa86e3e6b37b1f45b17efc54fc2e9d Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Wed, 19 Sep 2018 20:29:18 -0700 Subject: [PATCH] Improve error handling for improvements to Ghostscript text extraction --- src/ocrmypdf/pdfinfo.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/ocrmypdf/pdfinfo.py b/src/ocrmypdf/pdfinfo.py index b653d9ce..2719441f 100644 --- a/src/ocrmypdf/pdfinfo.py +++ b/src/ocrmypdf/pdfinfo.py @@ -22,6 +22,7 @@ import re from collections import namedtuple from pathlib import Path from enum import Enum +from unittest.mock import Mock from .exec import ghostscript from .helpers import fspath @@ -596,7 +597,7 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext): return pageinfo -def _pdf_get_all_pageinfo(infile): +def _pdf_get_all_pageinfo(infile, log=Mock()): import xml.etree.ElementTree as ET pdf = pikepdf.open(infile) @@ -605,12 +606,24 @@ def _pdf_get_all_pageinfo(infile): existing_text = regex_remove_char_tags.sub(b' ', existing_text) try: - root = ET.fromstringlist([b'\n', existing_text, b'\n']) + root = ET.fromstringlist([ + b'\n', existing_text, b'\n' + ]) page_xml = root.findall('page') except ET.ParseError as e: - # Need to log here + log.error( + "An error occurred while attempting to retrieve existing text in " + "the input file. Will attempt to continue assuming that there is " + "no existing text in the file. The error was:") + log.error(e) page_xml = [None] * len(pdf.pages) + page_count_difference = len(pdf.pages) - len(page_xml) + if page_count_difference != 0: + log.error("The number of pages in the input file is inconsistent.") + if page_count_difference > 0: + page_xml.extend([None] * page_count_difference) + pages = [] for n in range(len(pdf.pages)): page = PageInfo(pdf, n, infile, page_xml[n]) @@ -700,9 +713,9 @@ class PdfInfo: """Get summary information about a PDF """ - def __init__(self, infile): + def __init__(self, infile, log=None): self._infile = infile - self._pages, pdf = _pdf_get_all_pageinfo(infile) + self._pages, pdf = _pdf_get_all_pageinfo(infile, log) self._needs_rendering = pdf.root.get('/NeedsRendering', False) @property