From e760622a5c4ea69cb90e9e147daaa62e4ed03f8a Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 7 May 2020 02:03:42 -0700 Subject: [PATCH] graft: refactor --- src/ocrmypdf/_graft.py | 198 +++++++++++++++++++++-------------------- src/ocrmypdf/_sync.py | 7 +- 2 files changed, 108 insertions(+), 97 deletions(-) diff --git a/src/ocrmypdf/_graft.py b/src/ocrmypdf/_graft.py index 667067b2..768b94ca 100644 --- a/src/ocrmypdf/_graft.py +++ b/src/ocrmypdf/_graft.py @@ -89,94 +89,6 @@ def strip_invisible_text(pdf, page): page.Contents = pikepdf.Stream(pdf, content_stream) -def _graft_text_layer( - *, pdf_base, page_num, text, font, font_key, procset, rotation, strip_old_text -): - """Insert the text layer from text page 0 on to pdf_base at page_num""" - - log.debug("Grafting") - if Path(text).stat().st_size == 0: - return - - # This is a pointer indicating a specific page in the base file - pdf_text = pikepdf.open(text) - pdf_text_contents = pdf_text.pages[0].Contents.read_bytes() - - base_page = pdf_base.pages.p(page_num) - - # The text page always will be oriented up by this stage but the original - # content may have a rotation applied. Wrap the text stream with a rotation - # so it will be oriented the same way as the rest of the page content. - # (Previous versions OCRmyPDF rotated the content layer to match the text.) - mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)] - wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1] - - mediabox = [float(base_page.MediaBox[v]) for v in range(4)] - wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1] - - translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2) - untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2) - corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1]) - # -rotation because the input is a clockwise angle and this formula - # uses CCW - rotation = -rotation % 360 - rotate = pikepdf.PdfMatrix().rotated(rotation) - - # Because of rounding of DPI, we might get a text layer that is not - # identically sized to the target page. Scale to adjust. Normally this - # is within 0.998. - if rotation in (90, 270): - wt, ht = ht, wt - scale_x = wp / wt - scale_y = hp / ht - - # log.debug('%r', scale_x, scale_y) - scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y) - - # Translate the text so it is centered at (0, 0), rotate it there, adjust - # for a size different between initial and text PDF, then untranslate, and - # finally move the lower left corner to match the mediabox - ctm = translate @ rotate @ scale @ untranslate @ corner - - pdf_text_contents = b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n' - - new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents) - - if strip_old_text: - strip_invisible_text(pdf_base, base_page) - - base_page.page_contents_add(new_text_layer, prepend=True) - - _update_page_resources( - page=base_page, font=font, font_key=font_key, procset=procset - ) - pdf_text.close() - - -def _find_font(text, pdf_base): - """Copy a font from the filename text into pdf_base""" - - font, font_key = None, None - possible_font_names = ('/f-0-0', '/F1') - try: - with pikepdf.open(text) as pdf_text: - try: - pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {}) - except (AttributeError, IndexError, KeyError): - return None, None - for f in possible_font_names: - pdf_text_font = pdf_text_fonts.get(f, None) - if pdf_text_font is not None: - font_key = f - break - if pdf_text_font: - font = pdf_base.copy_foreign(pdf_text_font) - return font, font_key - except (FileNotFoundError, pikepdf.PdfError): - # PdfError occurs if a 0-length file is written e.g. due to OCR timeout - return None, None - - class OcrGrafter: def __init__(self, context): self.context = context @@ -195,10 +107,11 @@ class OcrGrafter: self.emplacements = 1 self.interim_count = 0 - def graft_page(self, page_result): - pageno, image, text, _sidecar, autorotate_correction = page_result - if text and not self.font: - self.font, self.font_key = _find_font(text, self.pdf_base) + def graft_page( + self, *, pageno: int, image: Path, textpdf: Path, autorotate_correction: int + ): + if textpdf and not self.font: + self.font, self.font_key = self._find_font(textpdf) emplaced_page = False content_rotation = self.pdfinfo[pageno].rotation @@ -226,13 +139,12 @@ class OcrGrafter: f"{text_misaligned}, {content_rotation}" ) - if text and self.font: + if textpdf and self.font: # Graft the text layer onto this page, whether new or old strip_old = self.context.options.redo_ocr - _graft_text_layer( - pdf_base=self.pdf_base, + self._graft_text_layer( page_num=pageno + 1, - text=text, + textpdf=textpdf, font=self.font, font_key=self.font_key, rotation=text_misaligned, @@ -283,3 +195,97 @@ class OcrGrafter: self.pdf_base.save(self.output_file) self.pdf_base.close() return self.output_file + + def _find_font(self, text): + """Copy a font from the filename text into pdf_base""" + + font, font_key = None, None + possible_font_names = ('/f-0-0', '/F1') + try: + with pikepdf.open(text) as pdf_text: + try: + pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {}) + except (AttributeError, IndexError, KeyError): + return None, None + for f in possible_font_names: + pdf_text_font = pdf_text_fonts.get(f, None) + if pdf_text_font is not None: + font_key = f + break + if pdf_text_font: + font = self.pdf_base.copy_foreign(pdf_text_font) + return font, font_key + except (FileNotFoundError, pikepdf.PdfError): + # PdfError occurs if a 0-length file is written e.g. due to OCR timeout + return None, None + + def _graft_text_layer( + self, + *, + page_num: int, + textpdf: Path, + font: pikepdf.Object, + font_key: pikepdf.Object, + procset: pikepdf.Object, + rotation: int, + strip_old_text: bool, + ): + """Insert the text layer from text page 0 on to pdf_base at page_num""" + + log.debug("Grafting") + if Path(textpdf).stat().st_size == 0: + return + + # This is a pointer indicating a specific page in the base file + pdf_text = pikepdf.open(textpdf) + pdf_text_contents = pdf_text.pages[0].Contents.read_bytes() + + base_page = self.pdf_base.pages.p(page_num) + + # The text page always will be oriented up by this stage but the original + # content may have a rotation applied. Wrap the text stream with a rotation + # so it will be oriented the same way as the rest of the page content. + # (Previous versions OCRmyPDF rotated the content layer to match the text.) + mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)] + wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1] + + mediabox = [float(base_page.MediaBox[v]) for v in range(4)] + wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1] + + translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2) + untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2) + corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1]) + # -rotation because the input is a clockwise angle and this formula + # uses CCW + rotation = -rotation % 360 + rotate = pikepdf.PdfMatrix().rotated(rotation) + + # Because of rounding of DPI, we might get a text layer that is not + # identically sized to the target page. Scale to adjust. Normally this + # is within 0.998. + if rotation in (90, 270): + wt, ht = ht, wt + scale_x = wp / wt + scale_y = hp / ht + + # log.debug('%r', scale_x, scale_y) + scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y) + + # Translate the text so it is centered at (0, 0), rotate it there, adjust + # for a size different between initial and text PDF, then untranslate, and + # finally move the lower left corner to match the mediabox + ctm = translate @ rotate @ scale @ untranslate @ corner + + pdf_text_contents = b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n' + + new_text_layer = pikepdf.Stream(self.pdf_base, pdf_text_contents) + + if strip_old_text: + strip_invisible_text(self.pdf_base, base_page) + + base_page.page_contents_add(new_text_layer, prepend=True) + + _update_page_resources( + page=base_page, font=font, font_key=font_key, procset=procset + ) + pdf_text.close() diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index fc5a4ea4..710bda5c 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -243,7 +243,12 @@ def exec_concurrent(context): def update_page(result, pbar): sidecars[result.pageno] = result.text pbar.update() - ocrgraft.graft_page(result) + ocrgraft.graft_page( + pageno=result.pageno, + image=result.pdf_page_from_image, + textpdf=result.ocr, + autorotate_correction=result.orientation_correction, + ) pbar.update() exec_progress_pool(