Fix page rotation issue (again)

Commit 1327ab3 introduced a fix for a regression, which was reported
in #581, #634. It appears that the actual cause of this issue was
default parameters to rasterize_pdf_page in pluggy not working as
expected, causing a default rotation=0 even when a rotation was needed.
As such the OCR image was generated with the wrong orientation,
causing the initial regression and fix in commit 1327ab3.

Now that the real problem is identified, it's apparent that the logic
prior to 1327ab3 was found and we can revert to 1327ab3 since it fixes
all known cases including #658.

This reverts 1327ab3 except for retaining improves to rotation output.
This commit is contained in:
James R. Barlow
2020-10-24 02:45:21 -07:00
parent ca735278e0
commit 5ba56adb53

View File

@@ -109,6 +109,7 @@ class OcrGrafter:
if textpdf and not self.font:
self.font, self.font_key = self._find_font(textpdf)
emplaced_page = False
content_rotation = self.pdfinfo[pageno].rotation
path_image = Path(image).resolve() if image else None
if path_image is not None and path_image != self.path_base:
@@ -122,24 +123,21 @@ class OcrGrafter:
local_image_page = self.pdf_base.pages[-1]
self.pdf_base.pages[pageno].emplace(local_image_page)
del self.pdf_base.pages[-1]
# The pdf_image_page will always be created with any /Rotate applied
# applied already
content_rotation = 0
emplaced_page = True
if content_rotation != 0:
# Text can be misaligned on a /Rotate'd page.
# That is because we rasterize pages with /Rotate applied,
# so that the OCR image text is upright and comes back upright.
text_misaligned = (autorotate_correction - content_rotation) % 360
log.debug(
f"Text rotation: (autorotate, content) -> text misalignment = "
f"({autorotate_correction}, {content_rotation}) -> {text_misaligned}"
)
else:
text_misaligned = 0
# Calculate if the text is misaligned compared to the content
if emplaced_page:
content_rotation = autorotate_correction
text_rotation = autorotate_correction
text_misaligned = (text_rotation - content_rotation) % 360
log.debug(
f"Text rotation: (text, autorotate, content) -> text misalignment = "
f"({text_rotation}, {autorotate_correction}, {content_rotation}) -> {text_misaligned}"
)
if textpdf and self.font:
# Graft the text layer onto this page, whether new or old
# Graft the text layer onto this page, whether new or old, possibly
# rotating the text layer by the amount is misaligned.
strip_old = self.context.options.redo_ocr
self._graft_text_layer(
page_num=pageno + 1,
@@ -151,14 +149,14 @@ class OcrGrafter:
strip_old_text=strip_old,
)
# Correct the page rotation
# Correct the overall page rotation if needed, now that the text and content
# are aligned
page_rotation = (content_rotation - autorotate_correction) % 360
self.pdf_base.pages[pageno].Rotate = page_rotation
log.debug(
f"Page rotation: (content, auto) -> page = "
f"({content_rotation}, {autorotate_correction}) -> {page_rotation}"
)
if self.emplacements % MAX_REPLACE_PAGES == 0:
self.save_and_reload()