From 5ba56adb5338db331d1356ddd69759a4e791871e Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 24 Oct 2020 02:45:21 -0700 Subject: [PATCH] Fix page rotation issue (again) Commit 1327ab3 introduced a fix for a regression, which was reported in #581, #634. It appears that the actual cause of this issue was default parameters to rasterize_pdf_page in pluggy not working as expected, causing a default rotation=0 even when a rotation was needed. As such the OCR image was generated with the wrong orientation, causing the initial regression and fix in commit 1327ab3. Now that the real problem is identified, it's apparent that the logic prior to 1327ab3 was found and we can revert to 1327ab3 since it fixes all known cases including #658. This reverts 1327ab3 except for retaining improves to rotation output. --- src/ocrmypdf/_graft.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/ocrmypdf/_graft.py b/src/ocrmypdf/_graft.py index b5c6928f..33d18e13 100644 --- a/src/ocrmypdf/_graft.py +++ b/src/ocrmypdf/_graft.py @@ -109,6 +109,7 @@ class OcrGrafter: if textpdf and not self.font: self.font, self.font_key = self._find_font(textpdf) + emplaced_page = False content_rotation = self.pdfinfo[pageno].rotation path_image = Path(image).resolve() if image else None if path_image is not None and path_image != self.path_base: @@ -122,24 +123,21 @@ class OcrGrafter: local_image_page = self.pdf_base.pages[-1] self.pdf_base.pages[pageno].emplace(local_image_page) del self.pdf_base.pages[-1] - # The pdf_image_page will always be created with any /Rotate applied - # applied already - content_rotation = 0 + emplaced_page = True - if content_rotation != 0: - # Text can be misaligned on a /Rotate'd page. - # That is because we rasterize pages with /Rotate applied, - # so that the OCR image text is upright and comes back upright. - text_misaligned = (autorotate_correction - content_rotation) % 360 - log.debug( - f"Text rotation: (autorotate, content) -> text misalignment = " - f"({autorotate_correction}, {content_rotation}) -> {text_misaligned}" - ) - else: - text_misaligned = 0 + # Calculate if the text is misaligned compared to the content + if emplaced_page: + content_rotation = autorotate_correction + text_rotation = autorotate_correction + text_misaligned = (text_rotation - content_rotation) % 360 + log.debug( + f"Text rotation: (text, autorotate, content) -> text misalignment = " + f"({text_rotation}, {autorotate_correction}, {content_rotation}) -> {text_misaligned}" + ) if textpdf and self.font: - # Graft the text layer onto this page, whether new or old + # Graft the text layer onto this page, whether new or old, possibly + # rotating the text layer by the amount is misaligned. strip_old = self.context.options.redo_ocr self._graft_text_layer( page_num=pageno + 1, @@ -151,14 +149,14 @@ class OcrGrafter: strip_old_text=strip_old, ) - # Correct the page rotation + # Correct the overall page rotation if needed, now that the text and content + # are aligned page_rotation = (content_rotation - autorotate_correction) % 360 self.pdf_base.pages[pageno].Rotate = page_rotation log.debug( f"Page rotation: (content, auto) -> page = " f"({content_rotation}, {autorotate_correction}) -> {page_rotation}" ) - if self.emplacements % MAX_REPLACE_PAGES == 0: self.save_and_reload()