From e97f89de3beb6b993513cde011ccfa7ec6a2a3e3 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 2 Dec 2023 08:55:01 -0800 Subject: [PATCH] Refactor font so glyphless isn't as hard coded --- src/ocrmypdf/hocrtransform/_canvas.py | 204 ++++++++++++++------------ src/ocrmypdf/hocrtransform/_hocr.py | 35 +++-- 2 files changed, 135 insertions(+), 104 deletions(-) diff --git a/src/ocrmypdf/hocrtransform/_canvas.py b/src/ocrmypdf/hocrtransform/_canvas.py index 11a549ec..caeecae1 100644 --- a/src/ocrmypdf/hocrtransform/_canvas.py +++ b/src/ocrmypdf/hocrtransform/_canvas.py @@ -5,6 +5,7 @@ from __future__ import annotations import logging import unicodedata +import zlib from contextlib import contextmanager from dataclasses import dataclass from enum import Enum @@ -26,93 +27,119 @@ from .color import Color log = logging.getLogger(__name__) -GLYPHLESS_FONT_NAME = 'pdf.ttf' - -GLYPHLESS_FONT = (package_files('ocrmypdf.data') / GLYPHLESS_FONT_NAME).read_bytes() -CHAR_ASPECT = 2 - class TextDirection(Enum): - LTR = 1 - RTL = 2 + LTR = 1 # Left to right: the default + RTL = 2 # Right to left: Arabic, Hebrew, Persian -def register_glyphlessfont(pdf: Pdf): - """Register the glyphless font. +class Font: + def text_width(self, text: str, fontsize: float) -> int: + """Estimate the width of a text string when rendered with the given font.""" + raise NotImplementedError - Create several data structures in the Pdf to describe the font. While it create - the data, a reference should be set in at least one page's /Resources dictionary - to retain the font in the output PDF and ensure it is usable on that page. - """ - PLACEHOLDER = Name.Placeholder + def register(self, pdf: Pdf): + """Register the font. - basefont = pdf.make_indirect( - Dictionary( - BaseFont=Name.GlyphLessFont, - DescendantFonts=[PLACEHOLDER], - Encoding=Name("/Identity-H"), - Subtype=Name.Type0, - ToUnicode=PLACEHOLDER, - Type=Name.Font, + Create several data structures in the Pdf to describe the font. 
While it creates + the data, a reference should be set in at least one page's /Resources dictionary + to retain the font in the output PDF and ensure it is usable on that page. + """ + raise NotImplementedError + + +class GlyphlessFont(Font): + CID_TO_GID_DATA = zlib.compress(b"\x00\x01" * 65536) + GLYPHLESS_FONT_NAME = 'pdf.ttf' + GLYPHLESS_FONT = (package_files('ocrmypdf.data') / GLYPHLESS_FONT_NAME).read_bytes() + CHAR_ASPECT = 2 + + def __init__(self): + pass + + def text_width(self, text: str, fontsize: float) -> int: + """Estimate the width of a text string when rendered with the given font.""" + # NFKC: split ligatures, combine diacritics + return len(unicodedata.normalize("NFKC", text)) * (fontsize / self.CHAR_ASPECT) + + def register(self, pdf: Pdf): + """Register the glyphless font. + + Create several data structures in the Pdf to describe the font. While it creates + the data, a reference should be set in at least one page's /Resources dictionary + to retain the font in the output PDF and ensure it is usable on that page. 
+ """ + PLACEHOLDER = Name.Placeholder + + basefont = pdf.make_indirect( + Dictionary( + BaseFont=Name.GlyphLessFont, + DescendantFonts=[PLACEHOLDER], + Encoding=Name("/Identity-H"), + Subtype=Name.Type0, + ToUnicode=PLACEHOLDER, + Type=Name.Font, + ) ) - ) - cid_font_type2 = pdf.make_indirect( - Dictionary( - BaseFont=Name.GlyphLessFont, - CIDToGIDMap=PLACEHOLDER, - CIDSystemInfo=Dictionary( - Ordering="Identity", - Registry="Adobe", - Supplement=0, - ), - FontDescriptor=PLACEHOLDER, - Subtype=Name.CIDFontType2, - Type=Name.Font, - DW=1000 // CHAR_ASPECT, + cid_font_type2 = pdf.make_indirect( + Dictionary( + BaseFont=Name.GlyphLessFont, + CIDToGIDMap=PLACEHOLDER, + CIDSystemInfo=Dictionary( + Ordering="Identity", + Registry="Adobe", + Supplement=0, + ), + FontDescriptor=PLACEHOLDER, + Subtype=Name.CIDFontType2, + Type=Name.Font, + DW=1000 // self.CHAR_ASPECT, + ) ) - ) - basefont.DescendantFonts = [cid_font_type2] - cid_font_type2.CIDToGIDMap = pdf.make_stream(b"\x00\x01" * 65536) - basefont.ToUnicode = pdf.make_stream( - b"/CIDInit /ProcSet findresource begin\n" - b"12 dict begin\n" - b"begincmap\n" - b"/CIDSystemInfo\n" - b"<<\n" - b" /Registry (Adobe)\n" - b" /Ordering (UCS)\n" - b" /Supplement 0\n" - b">> def\n" - b"/CMapName /Adobe-Identify-UCS def\n" - b"/CMapType 2 def\n" - b"1 begincodespacerange\n" - b"<0000> \n" - b"endcodespacerange\n" - b"1 beginbfrange\n" - b"<0000> <0000>\n" - b"endbfrange\n" - b"endcmap\n" - b"CMapName currentdict /CMap defineresource pop\n" - b"end\n" - b"end\n" - ) - font_descriptor = pdf.make_indirect( - Dictionary( - Ascent=1000, - CapHeight=1000, - Descent=-1, - Flags=5, # Fixed pitch and symbolic - FontBBox=[0, 0, 1000 // CHAR_ASPECT, 1000], - FontFile2=PLACEHOLDER, - FontName=Name.GlyphLessFont, - ItalicAngle=0, - StemV=80, - Type=Name.FontDescriptor, + basefont.DescendantFonts = [cid_font_type2] + cid_font_type2.CIDToGIDMap = pdf.make_stream( + self.CID_TO_GID_DATA, Filter=Name.FlateDecode ) - ) - font_descriptor.FontFile2 = 
pdf.make_stream(GLYPHLESS_FONT) - cid_font_type2.FontDescriptor = font_descriptor - return basefont + basefont.ToUnicode = pdf.make_stream( + b"/CIDInit /ProcSet findresource begin\n" + b"12 dict begin\n" + b"begincmap\n" + b"/CIDSystemInfo\n" + b"<<\n" + b" /Registry (Adobe)\n" + b" /Ordering (UCS)\n" + b" /Supplement 0\n" + b">> def\n" + b"/CMapName /Adobe-Identify-UCS def\n" + b"/CMapType 2 def\n" + b"1 begincodespacerange\n" + b"<0000> \n" + b"endcodespacerange\n" + b"1 beginbfrange\n" + b"<0000> <0000>\n" + b"endbfrange\n" + b"endcmap\n" + b"CMapName currentdict /CMap defineresource pop\n" + b"end\n" + b"end\n" + ) + font_descriptor = pdf.make_indirect( + Dictionary( + Ascent=1000, + CapHeight=1000, + Descent=-1, + Flags=5, # Fixed pitch and symbolic + FontBBox=[0, 0, 1000 // self.CHAR_ASPECT, 1000], + FontFile2=PLACEHOLDER, + FontName=Name.GlyphLessFont, + ItalicAngle=0, + StemV=80, + Type=Name.FontDescriptor, + ) + ) + font_descriptor.FontFile2 = pdf.make_stream(self.GLYPHLESS_FONT) + cid_font_type2.FontDescriptor = font_descriptor + return basefont class ContentStreamBuilder: @@ -326,8 +353,7 @@ class _PikepdfCanvasAccessor: def draw_image(self, image: Path | str | Image.Image, x, y, width, height): """Draw image at (x,y) with width w and height h.""" - with self.save_state(): - self.cm(Matrix(width, 0, 0, height, x, y)) + with self.save_state(cm=Matrix(width, 0, 0, height, x, y)): if isinstance(image, (Path, str)): image = Image.open(image) image.load() @@ -395,23 +421,22 @@ class PikepdfCanvas: self.page_size = page_size self._pdf = Pdf.new() self._page = self._pdf.add_blank_page(page_size=page_size) + self._page.Resources = Dictionary(Font=Dictionary(), XObject=Dictionary()) self._cs = ContentStreamBuilder() self._images: list[LoadedImage] = [] self._accessor = _PikepdfCanvasAccessor(self._cs, self._images) self._stack_depth = 0 - self._font_name = Name("/f-0-0") self.do.push() + def add_font(self, resource_name: Name, font: Font): + """Add a font to the 
page.""" + self._page.Resources.Font[resource_name] = font.register(self._pdf) + @property def do(self) -> _PikepdfCanvasAccessor: """Do operations on the current graphics state.""" return self._accessor - def string_width(self, s: str, fontname, fontsize): - """Estimate the width of a text string when rendered with the given font.""" - # NFKC: split ligatures, combine diacritics - return len(unicodedata.normalize("NFKC", s)) * (fontsize / CHAR_ASPECT) - def _save_image(self, li: LoadedImage): return self._pdf.make_stream( li.image.tobytes(), @@ -434,9 +459,6 @@ class PikepdfCanvas: "rendering may be incorrect" ) self._page.Contents = self._pdf.make_stream(self._cs.build()) - self._page.MediaBox = [0, 0, *self.page_size] - self._page.Resources = Dictionary(Font=Dictionary(), XObject=Dictionary()) - self._page.Resources.Font[self._font_name] = register_glyphlessfont(self._pdf) for li in self._images: self._page.Resources.XObject[li.name] = self._save_image(li) self._pdf.save(output_file) @@ -450,8 +472,8 @@ class PikepdfText: self._cs.begin_text() self._direction = direction - def font(self, font, size): - self._cs.set_text_font(Name("/f-0-0"), size) + def font(self, font: Name, size: float): + self._cs.set_text_font(font, size) return self def render_mode(self, mode): diff --git a/src/ocrmypdf/hocrtransform/_hocr.py b/src/ocrmypdf/hocrtransform/_hocr.py index 2c1e7c44..b75e7b0c 100644 --- a/src/ocrmypdf/hocrtransform/_hocr.py +++ b/src/ocrmypdf/hocrtransform/_hocr.py @@ -17,10 +17,15 @@ from math import atan, cos, pi from pathlib import Path from xml.etree import ElementTree -from pikepdf import Matrix, Rectangle +from pikepdf import Matrix, Name, Rectangle +from ocrmypdf.hocrtransform._canvas import ( + Font, + GlyphlessFont, + PikepdfText, + TextDirection, +) from ocrmypdf.hocrtransform._canvas import PikepdfCanvas as Canvas -from ocrmypdf.hocrtransform._canvas import PikepdfText, TextDirection from ocrmypdf.hocrtransform.color import ( BLACK, BLUE, @@ -70,10 
+75,20 @@ class HocrTransform: re.VERBOSE, ) - def __init__(self, *, hocr_filename: str | Path, dpi: float, debug: bool = False): + def __init__( + self, + *, + hocr_filename: str | Path, + dpi: float, + debug: bool = False, + fontname: Name = Name("/f-0-0"), + font: Font = GlyphlessFont(), + ): """Initialize the HocrTransform object.""" self.dpi = dpi self.hocr = ElementTree.parse(os.fspath(hocr_filename)) + self._fontname = fontname + self._font = font # if the hOCR file has a namespace, ElementTree requires its use to # find elements @@ -147,7 +162,6 @@ class HocrTransform: *, out_filename: Path, image_filename: Path | None = None, - fontname: str = "Helvetica", invisible_text: bool = True, ) -> None: """Creates a PDF file with an image superimposed on top of the text. @@ -162,7 +176,6 @@ class HocrTransform: out_filename: Path of PDF to write. image_filename: Image to use for this file. If omitted, the OCR text is shown. - fontname: Name of font to use. invisible_text: If True, text is rendered invisible so that is selectable but never drawn. If False, text is visible and may be seen if the image is skipped or deleted in Acrobat. @@ -170,6 +183,7 @@ class HocrTransform: # create the PDF file # page size in points (1/72 in.) canvas = Canvas(page_size=(self.width, self.height)) + canvas.add_font(self._fontname, self._font) page_matrix = ( Matrix() .translated(0, self.height) @@ -195,7 +209,6 @@ class HocrTransform: canvas, line, "ocrx_word", - fontname, invisible_text, direction, inject_word_breaks, @@ -208,7 +221,6 @@ class HocrTransform: canvas, root, "ocrx_word", - fontname, invisible_text, TextDirection.LTR, True, @@ -256,7 +268,6 @@ class HocrTransform: canvas: Canvas, line: Element | None, elemclass: str, - fontname: str, invisible_text: bool, text_direction: TextDirection, inject_word_breaks: bool, @@ -301,7 +312,7 @@ class HocrTransform: # on a sloped baseline and the edge of the bounding box. 
line_box_height = abs(line_box.height) / cos(angle) fontsize = line_box_height + intercept - text.font(fontname, fontsize) + text.font(self._fontname, fontsize) if invisible_text or True: text.render_mode(3) # Invisible (indicates OCR text) @@ -314,7 +325,6 @@ class HocrTransform: for elem, next_elem in pairwise(elements + [None]): self._do_line_word( canvas, - fontname, line_matrix, text, fontsize, @@ -328,7 +338,6 @@ class HocrTransform: def _do_line_word( self, canvas: Canvas, - fontname, line_matrix: Matrix, text: PikepdfText, fontsize: float, @@ -348,7 +357,7 @@ class HocrTransform: if hocr_box is None: return box = line_matrix.inverse().transform(hocr_box) - font_width = canvas.string_width(elemtxt, fontname, fontsize) + font_width = self._font.text_width(elemtxt, fontsize) # Debug sketches self._debug_draw_word_triangle(canvas, box) @@ -379,7 +388,7 @@ class HocrTransform: space_box = Rectangle(next_box.urx, box.lly, box.llx, next_box.ury) self._debug_draw_space_bbox(canvas, space_box) text.text_transform(Matrix(1, 0, 0, 1, space_box.llx, 0)) - space_width = canvas.string_width(' ', fontname, fontsize) + space_width = self._font.text_width(' ', fontsize) if space_width > 0: text.horiz_scale(100 * space_box.width / space_width) text.show(' ')