Refactor font so glyphless isn't as hard coded

2026-05-18 19:47:48 -04:00 · 2023-12-02 08:55:01 -08:00
parent 11d3e32f1e
commit e97f89de3b
2 changed files with 135 additions and 104 deletions
--- a/src/ocrmypdf/hocrtransform/_canvas.py
+++ b/src/ocrmypdf/hocrtransform/_canvas.py
@@ -5,6 +5,7 @@ from __future__ import annotations

 import logging
 import unicodedata
+import zlib
 from contextlib import contextmanager
 from dataclasses import dataclass
 from enum import Enum
@@ -26,93 +27,119 @@ from .color import Color

 log = logging.getLogger(__name__)

-GLYPHLESS_FONT_NAME = 'pdf.ttf'
-
-GLYPHLESS_FONT = (package_files('ocrmypdf.data') / GLYPHLESS_FONT_NAME).read_bytes()
-CHAR_ASPECT = 2
-

 class TextDirection(Enum):
-    LTR = 1
-    RTL = 2
+    LTR = 1  # Left to right: the default
+    RTL = 2  # Right to left: Arabic, Hebrew, Persian


-def register_glyphlessfont(pdf: Pdf):
-    """Register the glyphless font.
+class Font:
+    def text_width(self, text: str, fontsize: float) -> float:
+        """Estimate the width of a text string when rendered with the given font."""
+        raise NotImplementedError

-    Create several data structures in the Pdf to describe the font. While it create
-    the data, a reference should be set in at least one page's /Resources dictionary
-    to retain the font in the output PDF and ensure it is usable on that page.
-    """
-    PLACEHOLDER = Name.Placeholder
+    def register(self, pdf: Pdf):
+        """Register the font.

-    basefont = pdf.make_indirect(
-        Dictionary(
-            BaseFont=Name.GlyphLessFont,
-            DescendantFonts=[PLACEHOLDER],
-            Encoding=Name("/Identity-H"),
-            Subtype=Name.Type0,
-            ToUnicode=PLACEHOLDER,
-            Type=Name.Font,
+        Create several data structures in the Pdf to describe the font. While it creates
+        the data, a reference should be set in at least one page's /Resources dictionary
+        to retain the font in the output PDF and ensure it is usable on that page.
+        """
+        raise NotImplementedError
+
+
+class GlyphlessFont(Font):
+    CID_TO_GID_DATA = zlib.compress(b"\x00\x01" * 65536)
+    GLYPHLESS_FONT_NAME = 'pdf.ttf'
+    GLYPHLESS_FONT = (package_files('ocrmypdf.data') / GLYPHLESS_FONT_NAME).read_bytes()
+    CHAR_ASPECT = 2
+
+    def __init__(self):
+        pass
+
+    def text_width(self, text: str, fontsize: float) -> float:
+        """Estimate the width of a text string when rendered with the given font."""
+        # NFKC: split ligatures, combine diacritics
+        return len(unicodedata.normalize("NFKC", text)) * (fontsize / self.CHAR_ASPECT)
+
+    def register(self, pdf: Pdf):
+        """Register the glyphless font.
+
+        Create several data structures in the Pdf to describe the font. While it creates
+        the data, a reference should be set in at least one page's /Resources dictionary
+        to retain the font in the output PDF and ensure it is usable on that page.
+        """
+        PLACEHOLDER = Name.Placeholder
+
+        basefont = pdf.make_indirect(
+            Dictionary(
+                BaseFont=Name.GlyphLessFont,
+                DescendantFonts=[PLACEHOLDER],
+                Encoding=Name("/Identity-H"),
+                Subtype=Name.Type0,
+                ToUnicode=PLACEHOLDER,
+                Type=Name.Font,
+            )
        )
-    )
-    cid_font_type2 = pdf.make_indirect(
-        Dictionary(
-            BaseFont=Name.GlyphLessFont,
-            CIDToGIDMap=PLACEHOLDER,
-            CIDSystemInfo=Dictionary(
-                Ordering="Identity",
-                Registry="Adobe",
-                Supplement=0,
-            ),
-            FontDescriptor=PLACEHOLDER,
-            Subtype=Name.CIDFontType2,
-            Type=Name.Font,
-            DW=1000 // CHAR_ASPECT,
+        cid_font_type2 = pdf.make_indirect(
+            Dictionary(
+                BaseFont=Name.GlyphLessFont,
+                CIDToGIDMap=PLACEHOLDER,
+                CIDSystemInfo=Dictionary(
+                    Ordering="Identity",
+                    Registry="Adobe",
+                    Supplement=0,
+                ),
+                FontDescriptor=PLACEHOLDER,
+                Subtype=Name.CIDFontType2,
+                Type=Name.Font,
+                DW=1000 // self.CHAR_ASPECT,
+            )
        )
-    )
-    basefont.DescendantFonts = [cid_font_type2]
-    cid_font_type2.CIDToGIDMap = pdf.make_stream(b"\x00\x01" * 65536)
-    basefont.ToUnicode = pdf.make_stream(
-        b"/CIDInit /ProcSet findresource begin\n"
-        b"12 dict begin\n"
-        b"begincmap\n"
-        b"/CIDSystemInfo\n"
-        b"<<\n"
-        b"  /Registry (Adobe)\n"
-        b"  /Ordering (UCS)\n"
-        b"  /Supplement 0\n"
-        b">> def\n"
-        b"/CMapName /Adobe-Identify-UCS def\n"
-        b"/CMapType 2 def\n"
-        b"1 begincodespacerange\n"
-        b"<0000> <FFFF>\n"
-        b"endcodespacerange\n"
-        b"1 beginbfrange\n"
-        b"<0000> <FFFF> <0000>\n"
-        b"endbfrange\n"
-        b"endcmap\n"
-        b"CMapName currentdict /CMap defineresource pop\n"
-        b"end\n"
-        b"end\n"
-    )
-    font_descriptor = pdf.make_indirect(
-        Dictionary(
-            Ascent=1000,
-            CapHeight=1000,
-            Descent=-1,
-            Flags=5,  # Fixed pitch and symbolic
-            FontBBox=[0, 0, 1000 // CHAR_ASPECT, 1000],
-            FontFile2=PLACEHOLDER,
-            FontName=Name.GlyphLessFont,
-            ItalicAngle=0,
-            StemV=80,
-            Type=Name.FontDescriptor,
+        basefont.DescendantFonts = [cid_font_type2]
+        cid_font_type2.CIDToGIDMap = pdf.make_stream(
+            self.CID_TO_GID_DATA, Filter=Name.FlateDecode
        )
-    )
-    font_descriptor.FontFile2 = pdf.make_stream(GLYPHLESS_FONT)
-    cid_font_type2.FontDescriptor = font_descriptor
-    return basefont
+        basefont.ToUnicode = pdf.make_stream(
+            b"/CIDInit /ProcSet findresource begin\n"
+            b"12 dict begin\n"
+            b"begincmap\n"
+            b"/CIDSystemInfo\n"
+            b"<<\n"
+            b"  /Registry (Adobe)\n"
+            b"  /Ordering (UCS)\n"
+            b"  /Supplement 0\n"
+            b">> def\n"
+            b"/CMapName /Adobe-Identify-UCS def\n"
+            b"/CMapType 2 def\n"
+            b"1 begincodespacerange\n"
+            b"<0000> <FFFF>\n"
+            b"endcodespacerange\n"
+            b"1 beginbfrange\n"
+            b"<0000> <FFFF> <0000>\n"
+            b"endbfrange\n"
+            b"endcmap\n"
+            b"CMapName currentdict /CMap defineresource pop\n"
+            b"end\n"
+            b"end\n"
+        )
+        font_descriptor = pdf.make_indirect(
+            Dictionary(
+                Ascent=1000,
+                CapHeight=1000,
+                Descent=-1,
+                Flags=5,  # Fixed pitch and symbolic
+                FontBBox=[0, 0, 1000 // self.CHAR_ASPECT, 1000],
+                FontFile2=PLACEHOLDER,
+                FontName=Name.GlyphLessFont,
+                ItalicAngle=0,
+                StemV=80,
+                Type=Name.FontDescriptor,
+            )
+        )
+        font_descriptor.FontFile2 = pdf.make_stream(self.GLYPHLESS_FONT)
+        cid_font_type2.FontDescriptor = font_descriptor
+        return basefont


 class ContentStreamBuilder:
@@ -326,8 +353,7 @@ class _PikepdfCanvasAccessor:

    def draw_image(self, image: Path | str | Image.Image, x, y, width, height):
        """Draw image at (x,y) with width w and height h."""
-        with self.save_state():
-            self.cm(Matrix(width, 0, 0, height, x, y))
+        with self.save_state(cm=Matrix(width, 0, 0, height, x, y)):
            if isinstance(image, (Path, str)):
                image = Image.open(image)
            image.load()
@@ -395,23 +421,22 @@ class PikepdfCanvas:
        self.page_size = page_size
        self._pdf = Pdf.new()
        self._page = self._pdf.add_blank_page(page_size=page_size)
+        self._page.Resources = Dictionary(Font=Dictionary(), XObject=Dictionary())
        self._cs = ContentStreamBuilder()
        self._images: list[LoadedImage] = []
        self._accessor = _PikepdfCanvasAccessor(self._cs, self._images)
        self._stack_depth = 0
-        self._font_name = Name("/f-0-0")
        self.do.push()

+    def add_font(self, resource_name: Name, font: Font):
+        """Register font with the PDF and map it to resource_name in page resources."""
+        self._page.Resources.Font[resource_name] = font.register(self._pdf)
+
    @property
    def do(self) -> _PikepdfCanvasAccessor:
        """Do operations on the current graphics state."""
        return self._accessor

-    def string_width(self, s: str, fontname, fontsize):
-        """Estimate the width of a text string when rendered with the given font."""
-        # NFKC: split ligatures, combine diacritics
-        return len(unicodedata.normalize("NFKC", s)) * (fontsize / CHAR_ASPECT)
-
    def _save_image(self, li: LoadedImage):
        return self._pdf.make_stream(
            li.image.tobytes(),
@@ -434,9 +459,6 @@ class PikepdfCanvas:
                "rendering may be incorrect"
            )
        self._page.Contents = self._pdf.make_stream(self._cs.build())
-        self._page.MediaBox = [0, 0, *self.page_size]
-        self._page.Resources = Dictionary(Font=Dictionary(), XObject=Dictionary())
-        self._page.Resources.Font[self._font_name] = register_glyphlessfont(self._pdf)
        for li in self._images:
            self._page.Resources.XObject[li.name] = self._save_image(li)
        self._pdf.save(output_file)
@@ -450,8 +472,8 @@ class PikepdfText:
        self._cs.begin_text()
        self._direction = direction

-    def font(self, font, size):
-        self._cs.set_text_font(Name("/f-0-0"), size)
+    def font(self, font: Name, size: float):
+        self._cs.set_text_font(font, size)
        return self

    def render_mode(self, mode):
--- a/src/ocrmypdf/hocrtransform/_hocr.py
+++ b/src/ocrmypdf/hocrtransform/_hocr.py
@@ -17,10 +17,15 @@ from math import atan, cos, pi
 from pathlib import Path
 from xml.etree import ElementTree

-from pikepdf import Matrix, Rectangle
+from pikepdf import Matrix, Name, Rectangle

+from ocrmypdf.hocrtransform._canvas import (
+    Font,
+    GlyphlessFont,
+    PikepdfText,
+    TextDirection,
+)
 from ocrmypdf.hocrtransform._canvas import PikepdfCanvas as Canvas
-from ocrmypdf.hocrtransform._canvas import PikepdfText, TextDirection
 from ocrmypdf.hocrtransform.color import (
    BLACK,
    BLUE,
@@ -70,10 +75,20 @@ class HocrTransform:
        re.VERBOSE,
    )

-    def __init__(self, *, hocr_filename: str | Path, dpi: float, debug: bool = False):
+    def __init__(
+        self,
+        *,
+        hocr_filename: str | Path,
+        dpi: float,
+        debug: bool = False,
+        fontname: Name = Name("/f-0-0"),
+        font: Font = GlyphlessFont(),
+    ):
        """Initialize the HocrTransform object."""
        self.dpi = dpi
        self.hocr = ElementTree.parse(os.fspath(hocr_filename))
+        self._fontname = fontname
+        self._font = font

        # if the hOCR file has a namespace, ElementTree requires its use to
        # find elements
@@ -147,7 +162,6 @@ class HocrTransform:
        *,
        out_filename: Path,
        image_filename: Path | None = None,
-        fontname: str = "Helvetica",
        invisible_text: bool = True,
    ) -> None:
        """Creates a PDF file with an image superimposed on top of the text.
@@ -162,7 +176,6 @@ class HocrTransform:
            out_filename: Path of PDF to write.
            image_filename: Image to use for this file. If omitted, the OCR text
                is shown.
-            fontname: Name of font to use.
            invisible_text: If True, text is rendered invisible so that is
                selectable but never drawn. If False, text is visible and may
                be seen if the image is skipped or deleted in Acrobat.
@@ -170,6 +183,7 @@ class HocrTransform:
        # create the PDF file
        # page size in points (1/72 in.)
        canvas = Canvas(page_size=(self.width, self.height))
+        canvas.add_font(self._fontname, self._font)
        page_matrix = (
            Matrix()
            .translated(0, self.height)
@@ -195,7 +209,6 @@ class HocrTransform:
                        canvas,
                        line,
                        "ocrx_word",
-                        fontname,
                        invisible_text,
                        direction,
                        inject_word_breaks,
@@ -208,7 +221,6 @@ class HocrTransform:
                    canvas,
                    root,
                    "ocrx_word",
-                    fontname,
                    invisible_text,
                    TextDirection.LTR,
                    True,
@@ -256,7 +268,6 @@ class HocrTransform:
        canvas: Canvas,
        line: Element | None,
        elemclass: str,
-        fontname: str,
        invisible_text: bool,
        text_direction: TextDirection,
        inject_word_breaks: bool,
@@ -301,7 +312,7 @@ class HocrTransform:
            # on a sloped baseline and the edge of the bounding box.
            line_box_height = abs(line_box.height) / cos(angle)
            fontsize = line_box_height + intercept
-            text.font(fontname, fontsize)
+            text.font(self._fontname, fontsize)
            if invisible_text or True:
                text.render_mode(3)  # Invisible (indicates OCR text)

@@ -314,7 +325,6 @@ class HocrTransform:
            for elem, next_elem in pairwise(elements + [None]):
                self._do_line_word(
                    canvas,
-                    fontname,
                    line_matrix,
                    text,
                    fontsize,
@@ -328,7 +338,6 @@ class HocrTransform:
    def _do_line_word(
        self,
        canvas: Canvas,
-        fontname,
        line_matrix: Matrix,
        text: PikepdfText,
        fontsize: float,
@@ -348,7 +357,7 @@ class HocrTransform:
        if hocr_box is None:
            return
        box = line_matrix.inverse().transform(hocr_box)
-        font_width = canvas.string_width(elemtxt, fontname, fontsize)
+        font_width = self._font.text_width(elemtxt, fontsize)

        # Debug sketches
        self._debug_draw_word_triangle(canvas, box)
@@ -379,7 +388,7 @@ class HocrTransform:
            space_box = Rectangle(next_box.urx, box.lly, box.llx, next_box.ury)
        self._debug_draw_space_bbox(canvas, space_box)
        text.text_transform(Matrix(1, 0, 0, 1, space_box.llx, 0))
-        space_width = canvas.string_width(' ', fontname, fontsize)
+        space_width = self._font.text_width(' ', fontsize)
        if space_width > 0:
            text.horiz_scale(100 * space_box.width / space_width)
            text.show(' ')