From e97f89de3beb6b993513cde011ccfa7ec6a2a3e3 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <james@purplerock.ca>
Date: Sat, 2 Dec 2023 08:55:01 -0800
Subject: [PATCH] Refactor font so glyphless isn't as hard coded

---
 src/ocrmypdf/hocrtransform/_canvas.py | 204 ++++++++++++++------------
 src/ocrmypdf/hocrtransform/_hocr.py   |  35 +++--
 2 files changed, 135 insertions(+), 104 deletions(-)

diff --git a/src/ocrmypdf/hocrtransform/_canvas.py b/src/ocrmypdf/hocrtransform/_canvas.py
index 11a549ec..caeecae1 100644
--- a/src/ocrmypdf/hocrtransform/_canvas.py
+++ b/src/ocrmypdf/hocrtransform/_canvas.py
@@ -5,6 +5,7 @@ from __future__ import annotations
 
 import logging
 import unicodedata
+import zlib
 from contextlib import contextmanager
 from dataclasses import dataclass
 from enum import Enum
@@ -26,93 +27,119 @@ from .color import Color
 
 log = logging.getLogger(__name__)
 
-GLYPHLESS_FONT_NAME = 'pdf.ttf'
-
-GLYPHLESS_FONT = (package_files('ocrmypdf.data') / GLYPHLESS_FONT_NAME).read_bytes()
-CHAR_ASPECT = 2
-
 
 class TextDirection(Enum):
-    LTR = 1
-    RTL = 2
+    LTR = 1  # Left to right: the default
+    RTL = 2  # Right to left: Arabic, Hebrew, Persian
 
 
-def register_glyphlessfont(pdf: Pdf):
-    """Register the glyphless font.
+class Font:
+    def text_width(self, text: str, fontsize: float) -> int:
+        """Estimate the width of a text string when rendered with the given font."""
+        raise NotImplementedError
 
-    Create several data structures in the Pdf to describe the font. While it create
-    the data, a reference should be set in at least one page's /Resources dictionary
-    to retain the font in the output PDF and ensure it is usable on that page.
-    """
-    PLACEHOLDER = Name.Placeholder
+    def register(self, pdf: Pdf):
+        """Register the font.
 
-    basefont = pdf.make_indirect(
-        Dictionary(
-            BaseFont=Name.GlyphLessFont,
-            DescendantFonts=[PLACEHOLDER],
-            Encoding=Name("/Identity-H"),
-            Subtype=Name.Type0,
-            ToUnicode=PLACEHOLDER,
-            Type=Name.Font,
+        Create several data structures in the Pdf to describe the font. While it create
+        the data, a reference should be set in at least one page's /Resources dictionary
+        to retain the font in the output PDF and ensure it is usable on that page.
+        """
+        raise NotImplementedError
+
+
+class GlyphlessFont(Font):
+    CID_TO_GID_DATA = zlib.compress(b"\x00\x01" * 65536)
+    GLYPHLESS_FONT_NAME = 'pdf.ttf'
+    GLYPHLESS_FONT = (package_files('ocrmypdf.data') / GLYPHLESS_FONT_NAME).read_bytes()
+    CHAR_ASPECT = 2
+
+    def __init__(self):
+        pass
+
+    def text_width(self, text: str, fontsize: float) -> int:
+        """Estimate the width of a text string when rendered with the given font."""
+        # NFKC: split ligatures, combine diacritics
+        return len(unicodedata.normalize("NFKC", text)) * (fontsize / self.CHAR_ASPECT)
+
+    def register(self, pdf: Pdf):
+        """Register the glyphless font.
+
+        Create several data structures in the Pdf to describe the font. While it create
+        the data, a reference should be set in at least one page's /Resources dictionary
+        to retain the font in the output PDF and ensure it is usable on that page.
+        """
+        PLACEHOLDER = Name.Placeholder
+
+        basefont = pdf.make_indirect(
+            Dictionary(
+                BaseFont=Name.GlyphLessFont,
+                DescendantFonts=[PLACEHOLDER],
+                Encoding=Name("/Identity-H"),
+                Subtype=Name.Type0,
+                ToUnicode=PLACEHOLDER,
+                Type=Name.Font,
+            )
         )
-    )
-    cid_font_type2 = pdf.make_indirect(
-        Dictionary(
-            BaseFont=Name.GlyphLessFont,
-            CIDToGIDMap=PLACEHOLDER,
-            CIDSystemInfo=Dictionary(
-                Ordering="Identity",
-                Registry="Adobe",
-                Supplement=0,
-            ),
-            FontDescriptor=PLACEHOLDER,
-            Subtype=Name.CIDFontType2,
-            Type=Name.Font,
-            DW=1000 // CHAR_ASPECT,
+        cid_font_type2 = pdf.make_indirect(
+            Dictionary(
+                BaseFont=Name.GlyphLessFont,
+                CIDToGIDMap=PLACEHOLDER,
+                CIDSystemInfo=Dictionary(
+                    Ordering="Identity",
+                    Registry="Adobe",
+                    Supplement=0,
+                ),
+                FontDescriptor=PLACEHOLDER,
+                Subtype=Name.CIDFontType2,
+                Type=Name.Font,
+                DW=1000 // self.CHAR_ASPECT,
+            )
         )
-    )
-    basefont.DescendantFonts = [cid_font_type2]
-    cid_font_type2.CIDToGIDMap = pdf.make_stream(b"\x00\x01" * 65536)
-    basefont.ToUnicode = pdf.make_stream(
-        b"/CIDInit /ProcSet findresource begin\n"
-        b"12 dict begin\n"
-        b"begincmap\n"
-        b"/CIDSystemInfo\n"
-        b"<<\n"
-        b"  /Registry (Adobe)\n"
-        b"  /Ordering (UCS)\n"
-        b"  /Supplement 0\n"
-        b">> def\n"
-        b"/CMapName /Adobe-Identify-UCS def\n"
-        b"/CMapType 2 def\n"
-        b"1 begincodespacerange\n"
-        b"<0000> <FFFF>\n"
-        b"endcodespacerange\n"
-        b"1 beginbfrange\n"
-        b"<0000> <FFFF> <0000>\n"
-        b"endbfrange\n"
-        b"endcmap\n"
-        b"CMapName currentdict /CMap defineresource pop\n"
-        b"end\n"
-        b"end\n"
-    )
-    font_descriptor = pdf.make_indirect(
-        Dictionary(
-            Ascent=1000,
-            CapHeight=1000,
-            Descent=-1,
-            Flags=5,  # Fixed pitch and symbolic
-            FontBBox=[0, 0, 1000 // CHAR_ASPECT, 1000],
-            FontFile2=PLACEHOLDER,
-            FontName=Name.GlyphLessFont,
-            ItalicAngle=0,
-            StemV=80,
-            Type=Name.FontDescriptor,
+        basefont.DescendantFonts = [cid_font_type2]
+        cid_font_type2.CIDToGIDMap = pdf.make_stream(
+            self.CID_TO_GID_DATA, Filter=Name.FlateDecode
         )
-    )
-    font_descriptor.FontFile2 = pdf.make_stream(GLYPHLESS_FONT)
-    cid_font_type2.FontDescriptor = font_descriptor
-    return basefont
+        basefont.ToUnicode = pdf.make_stream(
+            b"/CIDInit /ProcSet findresource begin\n"
+            b"12 dict begin\n"
+            b"begincmap\n"
+            b"/CIDSystemInfo\n"
+            b"<<\n"
+            b"  /Registry (Adobe)\n"
+            b"  /Ordering (UCS)\n"
+            b"  /Supplement 0\n"
+            b">> def\n"
+            b"/CMapName /Adobe-Identify-UCS def\n"
+            b"/CMapType 2 def\n"
+            b"1 begincodespacerange\n"
+            b"<0000> <FFFF>\n"
+            b"endcodespacerange\n"
+            b"1 beginbfrange\n"
+            b"<0000> <FFFF> <0000>\n"
+            b"endbfrange\n"
+            b"endcmap\n"
+            b"CMapName currentdict /CMap defineresource pop\n"
+            b"end\n"
+            b"end\n"
+        )
+        font_descriptor = pdf.make_indirect(
+            Dictionary(
+                Ascent=1000,
+                CapHeight=1000,
+                Descent=-1,
+                Flags=5,  # Fixed pitch and symbolic
+                FontBBox=[0, 0, 1000 // self.CHAR_ASPECT, 1000],
+                FontFile2=PLACEHOLDER,
+                FontName=Name.GlyphLessFont,
+                ItalicAngle=0,
+                StemV=80,
+                Type=Name.FontDescriptor,
+            )
+        )
+        font_descriptor.FontFile2 = pdf.make_stream(self.GLYPHLESS_FONT)
+        cid_font_type2.FontDescriptor = font_descriptor
+        return basefont
 
 
 class ContentStreamBuilder:
@@ -326,8 +353,7 @@ class _PikepdfCanvasAccessor:
 
     def draw_image(self, image: Path | str | Image.Image, x, y, width, height):
         """Draw image at (x,y) with width w and height h."""
-        with self.save_state():
-            self.cm(Matrix(width, 0, 0, height, x, y))
+        with self.save_state(cm=Matrix(width, 0, 0, height, x, y)):
             if isinstance(image, (Path, str)):
                 image = Image.open(image)
             image.load()
@@ -395,23 +421,22 @@ class PikepdfCanvas:
         self.page_size = page_size
         self._pdf = Pdf.new()
         self._page = self._pdf.add_blank_page(page_size=page_size)
+        self._page.Resources = Dictionary(Font=Dictionary(), XObject=Dictionary())
         self._cs = ContentStreamBuilder()
         self._images: list[LoadedImage] = []
         self._accessor = _PikepdfCanvasAccessor(self._cs, self._images)
         self._stack_depth = 0
-        self._font_name = Name("/f-0-0")
         self.do.push()
 
+    def add_font(self, resource_name: Name, font: Font):
+        """Add a font to the page."""
+        self._page.Resources.Font[resource_name] = font.register(self._pdf)
+
     @property
     def do(self) -> _PikepdfCanvasAccessor:
         """Do operations on the current graphics state."""
         return self._accessor
 
-    def string_width(self, s: str, fontname, fontsize):
-        """Estimate the width of a text string when rendered with the given font."""
-        # NFKC: split ligatures, combine diacritics
-        return len(unicodedata.normalize("NFKC", s)) * (fontsize / CHAR_ASPECT)
-
     def _save_image(self, li: LoadedImage):
         return self._pdf.make_stream(
             li.image.tobytes(),
@@ -434,9 +459,6 @@ class PikepdfCanvas:
                 "rendering may be incorrect"
             )
         self._page.Contents = self._pdf.make_stream(self._cs.build())
-        self._page.MediaBox = [0, 0, *self.page_size]
-        self._page.Resources = Dictionary(Font=Dictionary(), XObject=Dictionary())
-        self._page.Resources.Font[self._font_name] = register_glyphlessfont(self._pdf)
         for li in self._images:
             self._page.Resources.XObject[li.name] = self._save_image(li)
         self._pdf.save(output_file)
@@ -450,8 +472,8 @@ class PikepdfText:
         self._cs.begin_text()
         self._direction = direction
 
-    def font(self, font, size):
-        self._cs.set_text_font(Name("/f-0-0"), size)
+    def font(self, font: Name, size: float):
+        self._cs.set_text_font(font, size)
         return self
 
     def render_mode(self, mode):
diff --git a/src/ocrmypdf/hocrtransform/_hocr.py b/src/ocrmypdf/hocrtransform/_hocr.py
index 2c1e7c44..b75e7b0c 100644
--- a/src/ocrmypdf/hocrtransform/_hocr.py
+++ b/src/ocrmypdf/hocrtransform/_hocr.py
@@ -17,10 +17,15 @@ from math import atan, cos, pi
 from pathlib import Path
 from xml.etree import ElementTree
 
-from pikepdf import Matrix, Rectangle
+from pikepdf import Matrix, Name, Rectangle
 
+from ocrmypdf.hocrtransform._canvas import (
+    Font,
+    GlyphlessFont,
+    PikepdfText,
+    TextDirection,
+)
 from ocrmypdf.hocrtransform._canvas import PikepdfCanvas as Canvas
-from ocrmypdf.hocrtransform._canvas import PikepdfText, TextDirection
 from ocrmypdf.hocrtransform.color import (
     BLACK,
     BLUE,
@@ -70,10 +75,20 @@ class HocrTransform:
         re.VERBOSE,
     )
 
-    def __init__(self, *, hocr_filename: str | Path, dpi: float, debug: bool = False):
+    def __init__(
+        self,
+        *,
+        hocr_filename: str | Path,
+        dpi: float,
+        debug: bool = False,
+        fontname: Name = Name("/f-0-0"),
+        font: Font = GlyphlessFont(),
+    ):
         """Initialize the HocrTransform object."""
         self.dpi = dpi
         self.hocr = ElementTree.parse(os.fspath(hocr_filename))
+        self._fontname = fontname
+        self._font = font
 
         # if the hOCR file has a namespace, ElementTree requires its use to
         # find elements
@@ -147,7 +162,6 @@ class HocrTransform:
         *,
         out_filename: Path,
         image_filename: Path | None = None,
-        fontname: str = "Helvetica",
         invisible_text: bool = True,
     ) -> None:
         """Creates a PDF file with an image superimposed on top of the text.
@@ -162,7 +176,6 @@ class HocrTransform:
             out_filename: Path of PDF to write.
             image_filename: Image to use for this file. If omitted, the OCR text
                 is shown.
-            fontname: Name of font to use.
             invisible_text: If True, text is rendered invisible so that is
                 selectable but never drawn. If False, text is visible and may
                 be seen if the image is skipped or deleted in Acrobat.
@@ -170,6 +183,7 @@ class HocrTransform:
         # create the PDF file
         # page size in points (1/72 in.)
         canvas = Canvas(page_size=(self.width, self.height))
+        canvas.add_font(self._fontname, self._font)
         page_matrix = (
             Matrix()
             .translated(0, self.height)
@@ -195,7 +209,6 @@ class HocrTransform:
                         canvas,
                         line,
                         "ocrx_word",
-                        fontname,
                         invisible_text,
                         direction,
                         inject_word_breaks,
@@ -208,7 +221,6 @@ class HocrTransform:
                     canvas,
                     root,
                     "ocrx_word",
-                    fontname,
                     invisible_text,
                     TextDirection.LTR,
                     True,
@@ -256,7 +268,6 @@ class HocrTransform:
         canvas: Canvas,
         line: Element | None,
         elemclass: str,
-        fontname: str,
         invisible_text: bool,
         text_direction: TextDirection,
         inject_word_breaks: bool,
@@ -301,7 +312,7 @@ class HocrTransform:
             # on a sloped baseline and the edge of the bounding box.
             line_box_height = abs(line_box.height) / cos(angle)
             fontsize = line_box_height + intercept
-            text.font(fontname, fontsize)
+            text.font(self._fontname, fontsize)
             if invisible_text or True:
                 text.render_mode(3)  # Invisible (indicates OCR text)
 
@@ -314,7 +325,6 @@ class HocrTransform:
             for elem, next_elem in pairwise(elements + [None]):
                 self._do_line_word(
                     canvas,
-                    fontname,
                     line_matrix,
                     text,
                     fontsize,
@@ -328,7 +338,6 @@ class HocrTransform:
     def _do_line_word(
         self,
         canvas: Canvas,
-        fontname,
         line_matrix: Matrix,
         text: PikepdfText,
         fontsize: float,
@@ -348,7 +357,7 @@ class HocrTransform:
         if hocr_box is None:
             return
         box = line_matrix.inverse().transform(hocr_box)
-        font_width = canvas.string_width(elemtxt, fontname, fontsize)
+        font_width = self._font.text_width(elemtxt, fontsize)
 
         # Debug sketches
         self._debug_draw_word_triangle(canvas, box)
@@ -379,7 +388,7 @@ class HocrTransform:
             space_box = Rectangle(next_box.urx, box.lly, box.llx, next_box.ury)
         self._debug_draw_space_bbox(canvas, space_box)
         text.text_transform(Matrix(1, 0, 0, 1, space_box.llx, 0))
-        space_width = canvas.string_width(' ', fontname, fontsize)
+        space_width = self._font.text_width(' ', fontsize)
         if space_width > 0:
             text.horiz_scale(100 * space_box.width / space_width)
             text.show(' ')