From bb258fc99c3d3676977d708dded660d3802bc283 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 24 Dec 2020 01:47:53 -0800 Subject: [PATCH] pdfinfo: Refactor pageinfo dictionary into a class --- src/ocrmypdf/_pipeline.py | 2 +- src/ocrmypdf/pdfinfo/info.py | 97 +++++++++++++++++++++--------------- tests/test_main.py | 4 +- 3 files changed, 60 insertions(+), 43 deletions(-) diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index e0989f30..a546e802 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -150,7 +150,7 @@ def get_pdfinfo( progbar=False, max_workers=None, check_pages=None, -): +) -> PdfInfo: try: return PdfInfo( input_file, diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py index c21aa746..207518ae 100644 --- a/src/ocrmypdf/pdfinfo/info.py +++ b/src/ocrmypdf/pdfinfo/info.py @@ -556,12 +556,27 @@ def simplify_textboxes(miner, textbox_getter): yield TextboxInfo(box.bbox, visible, corrupt) +class RawPageInfo: + def __init__(self, pageno): + self.pageno: int = pageno + self.images: List = [] + self.textboxes: List = [] + self.has_text: Optional[bool] = False + self.userunit: Decimal = Decimal(1.0) + self.width_inches: Optional[Decimal] = None + self.height_inches: Optional[Decimal] = None + self.rotate: int = 0 + self.has_vector = False + self.has_text = False + self.dpi: Optional[Decimal] = None + self.width_pixels: Optional[int] = None + self.height_pixels: Optional[int] = None + + def _pdf_get_pageinfo( pdf, pageno: int, infile: PathLike, check_pages, detailed_analysis: bool ): - pageinfo: Dict[str, Any] = {} - pageinfo['pageno'] = pageno - pageinfo['images'] = [] + pageinfo = RawPageInfo(pageno) page = pdf.pages[pageno] mediabox = [Decimal(d) for d in page.MediaBox.as_list()] @@ -573,53 +588,53 @@ def _pdf_get_pageinfo( if check_this_page and detailed_analysis: pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5') miner = get_page_analysis(infile, pageno, pscript5_mode) - pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes)) - bboxes = (box.bbox for box in pageinfo['textboxes']) + pageinfo.textboxes = list(simplify_textboxes(miner, get_text_boxes)) + bboxes = (box.bbox for box in pageinfo.textboxes) - pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt) + pageinfo.has_text = _page_has_text(bboxes, width_pt, height_pt) else: - pageinfo['textboxes'] = [] - pageinfo['has_text'] = None # i.e. "no information" + pageinfo.textboxes = [] + pageinfo.has_text = None # i.e. "no information" userunit = page.get('/UserUnit', Decimal(1.0)) if not isinstance(userunit, Decimal): userunit = Decimal(userunit) - pageinfo['userunit'] = userunit - pageinfo['width_inches'] = width_pt * userunit / Decimal(72.0) - pageinfo['height_inches'] = height_pt * userunit / Decimal(72.0) + pageinfo.userunit = userunit + pageinfo.width_inches = width_pt * userunit / Decimal(72.0) + pageinfo.height_inches = height_pt * userunit / Decimal(72.0) try: - pageinfo['rotate'] = int(page['/Rotate']) + pageinfo.rotate = int(page['/Rotate']) except KeyError: - pageinfo['rotate'] = 0 + pageinfo.rotate = 0 userunit_shorthand = (userunit, 0, 0, userunit, 0, 0) if check_this_page: - pageinfo['has_vector'] = False - pageinfo['has_text'] = False - pageinfo['images'] = [] + pageinfo.has_vector = False + pageinfo.has_text = False + pageinfo.images = [] for ci in _process_content_streams( pdf=pdf, container=page, shorthand=userunit_shorthand ): if isinstance(ci, VectorMarker): - pageinfo['has_vector'] = True + pageinfo.has_vector = True elif isinstance(ci, TextMarker): - pageinfo['has_text'] = True + pageinfo.has_text = True elif isinstance(ci, ImageInfo): - pageinfo['images'].append(ci) + pageinfo.images.append(ci) else: raise NotImplementedError() else: - pageinfo['has_vector'] = None # i.e. "no information" - pageinfo['has_text'] = None - pageinfo['images'] = None + pageinfo.has_vector = None # i.e. "no information" + pageinfo.has_text = None + pageinfo.images = None - if pageinfo['images']: - dpi = Resolution(0.0, 0.0).take_max(image.dpi for image in pageinfo['images']) - pageinfo['dpi'] = dpi - pageinfo['width_pixels'] = int(round(dpi.x * float(pageinfo['width_inches']))) - pageinfo['height_pixels'] = int(round(dpi.y * float(pageinfo['height_inches']))) + if pageinfo.images: + dpi = Resolution(0.0, 0.0).take_max(image.dpi for image in pageinfo.images) + pageinfo.dpi = dpi + pageinfo.width_pixels = int(round(dpi.x * float(pageinfo.width_inches))) + pageinfo.height_pixels = int(round(dpi.y * float(pageinfo.height_inches))) return pageinfo @@ -707,25 +722,25 @@ class PageInfo: @property def has_text(self) -> bool: - return self._pageinfo['has_text'] + return self._pageinfo.has_text @property def has_corrupt_text(self) -> bool: if not self._detailed_analysis: raise NotImplementedError('Did not do detailed analysis') - return any(tbox.is_corrupt for tbox in self._pageinfo['textboxes']) + return any(tbox.is_corrupt for tbox in self._pageinfo.textboxes) @property def has_vector(self) -> bool: - return self._pageinfo['has_vector'] + return self._pageinfo.has_vector @property def width_inches(self) -> Decimal: - return self._pageinfo['width_inches'] + return self._pageinfo.width_inches @property def height_inches(self) -> Decimal: - return self._pageinfo['height_inches'] + return self._pageinfo.height_inches @property def width_pixels(self) -> int: @@ -737,18 +752,18 @@ class PageInfo: @property def rotation(self) -> int: - return self._pageinfo.get('rotate', None) + return self._pageinfo.rotate @rotation.setter def rotation(self, value): if value in (0, 90, 180, 270, 360, -90, -180, -270): - self._pageinfo['rotate'] = value + self._pageinfo.rotate = value else: raise ValueError("rotation must be a cardinal angle") @property def images(self): - return self._pageinfo['images'] + return self._pageinfo.images def get_textareas( self, visible: Optional[bool] = None, corrupt: Optional[bool] = None @@ -763,24 +778,26 @@ class PageInfo: result = False return result - if 'textboxes' not in self._pageinfo: + if not self._pageinfo.textboxes: if visible is not None and corrupt is not None: raise NotImplementedError('Incomplete information on textboxes') - return self._pageinfo['bboxes'] + return self._pageinfo.textboxes return ( obj.bbox - for obj in self._pageinfo['textboxes'] + for obj in self._pageinfo.textboxes if predicate(obj, visible, corrupt) ) @property def dpi(self) -> Resolution: - return self._pageinfo.get('dpi', Resolution(0.0, 0.0)) + if self._pageinfo.dpi is None: + return Resolution(0.0, 0.0) + return self._pageinfo.dpi @property def userunit(self) -> Decimal: - return self._pageinfo.get('userunit', None) + return self._pageinfo.userunit @property def min_version(self) -> str: diff --git a/tests/test_main.py b/tests/test_main.py index 47a9cb40..33d45fba 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -92,10 +92,10 @@ def test_skip_ocr(resources, outpdf): def test_redo_ocr(resources, outpdf): in_ = resources / 'graph_ocred.pdf' - before = PdfInfo(in_) + before = PdfInfo(in_, detailed_analysis=True) out = outpdf out = check_ocrmypdf(in_, out, '--redo-ocr') - after = PdfInfo(out) + after = PdfInfo(out, detailed_analysis=True) assert before[0].has_text and after[0].has_text assert ( before[0].get_textareas() != after[0].get_textareas()