Refactor font so glyphless isn't as hard coded

This commit is contained in:
James R. Barlow
2023-12-02 08:55:01 -08:00
parent 11d3e32f1e
commit e97f89de3b
2 changed files with 135 additions and 104 deletions

View File

@@ -5,6 +5,7 @@ from __future__ import annotations
import logging
import unicodedata
import zlib
from contextlib import contextmanager
from dataclasses import dataclass
from enum import Enum
@@ -26,93 +27,119 @@ from .color import Color
log = logging.getLogger(__name__)
GLYPHLESS_FONT_NAME = 'pdf.ttf'
GLYPHLESS_FONT = (package_files('ocrmypdf.data') / GLYPHLESS_FONT_NAME).read_bytes()
CHAR_ASPECT = 2
class TextDirection(Enum):
    """Direction in which a run of text is written."""

    LTR = 1  # Left to right: the default
    RTL = 2  # Right to left: Arabic, Hebrew, Persian
class Font:
    """Interface for a font that can be measured and embedded in a PDF.

    Both methods raise ``NotImplementedError`` here; concrete fonts such as
    ``GlyphlessFont`` override them.
    """

    def text_width(self, text: str, fontsize: float) -> float:
        """Estimate the width of a text string when rendered with the given font.

        Args:
            text: The string to measure.
            fontsize: The font size the string would be rendered at.

        Returns:
            The estimated width, in the same units as ``fontsize``.
        """
        raise NotImplementedError

    def register(self, pdf: Pdf):
        """Register the font.

        Create several data structures in the Pdf to describe the font. While it
        creates the data, a reference should be set in at least one page's
        /Resources dictionary to retain the font in the output PDF and ensure it
        is usable on that page.

        Args:
            pdf: The Pdf in which the font objects are created.

        Returns:
            The font object to reference from a page's /Resources dictionary.
        """
        raise NotImplementedError


class GlyphlessFont(Font):
    """The bundled glyphless font, whose glyphs all share one fixed width."""

    # 65536 two-byte entries, each 0x0001, compressed once when the class is
    # created so that every registration reuses the same bytes.
    CID_TO_GID_DATA = zlib.compress(b"\x00\x01" * 65536)
    GLYPHLESS_FONT_NAME = 'pdf.ttf'
    GLYPHLESS_FONT = (package_files('ocrmypdf.data') / GLYPHLESS_FONT_NAME).read_bytes()
    # Every character is treated as (fontsize / CHAR_ASPECT) wide.
    CHAR_ASPECT = 2

    def text_width(self, text: str, fontsize: float) -> float:
        """Estimate the width of a text string when rendered with the given font.

        The estimate is the NFKC-normalized character count multiplied by the
        fixed per-character width ``fontsize / CHAR_ASPECT``.
        """
        # NFKC: split ligatures, combine diacritics
        return len(unicodedata.normalize("NFKC", text)) * (fontsize / self.CHAR_ASPECT)

    def register(self, pdf: Pdf):
        """Register the glyphless font.

        Create several data structures in the Pdf to describe the font. While it
        creates the data, a reference should be set in at least one page's
        /Resources dictionary to retain the font in the output PDF and ensure it
        is usable on that page.

        Args:
            pdf: The Pdf in which the font objects are created.

        Returns:
            The indirect Type0 font dictionary.
        """
        # The dictionaries refer to one another, so each is created with
        # placeholder entries that are filled in once their targets exist.
        PLACEHOLDER = Name.Placeholder

        basefont = pdf.make_indirect(
            Dictionary(
                BaseFont=Name.GlyphLessFont,
                DescendantFonts=[PLACEHOLDER],
                Encoding=Name("/Identity-H"),
                Subtype=Name.Type0,
                ToUnicode=PLACEHOLDER,
                Type=Name.Font,
            )
        )
        cid_font_type2 = pdf.make_indirect(
            Dictionary(
                BaseFont=Name.GlyphLessFont,
                CIDToGIDMap=PLACEHOLDER,
                CIDSystemInfo=Dictionary(
                    Ordering="Identity",
                    Registry="Adobe",
                    Supplement=0,
                ),
                FontDescriptor=PLACEHOLDER,
                Subtype=Name.CIDFontType2,
                Type=Name.Font,
                DW=1000 // self.CHAR_ASPECT,
            )
        )
        basefont.DescendantFonts = [cid_font_type2]
        # The map is stored pre-compressed, so the stream must declare the filter.
        cid_font_type2.CIDToGIDMap = pdf.make_stream(
            self.CID_TO_GID_DATA, Filter=Name.FlateDecode
        )
        basefont.ToUnicode = pdf.make_stream(
            b"/CIDInit /ProcSet findresource begin\n"
            b"12 dict begin\n"
            b"begincmap\n"
            b"/CIDSystemInfo\n"
            b"<<\n"
            b"  /Registry (Adobe)\n"
            b"  /Ordering (UCS)\n"
            b"  /Supplement 0\n"
            b">> def\n"
            b"/CMapName /Adobe-Identify-UCS def\n"
            b"/CMapType 2 def\n"
            b"1 begincodespacerange\n"
            b"<0000> <FFFF>\n"
            b"endcodespacerange\n"
            b"1 beginbfrange\n"
            b"<0000> <FFFF> <0000>\n"
            b"endbfrange\n"
            b"endcmap\n"
            b"CMapName currentdict /CMap defineresource pop\n"
            b"end\n"
            b"end\n"
        )
        font_descriptor = pdf.make_indirect(
            Dictionary(
                Ascent=1000,
                CapHeight=1000,
                Descent=-1,
                Flags=5,  # Fixed pitch and symbolic
                FontBBox=[0, 0, 1000 // self.CHAR_ASPECT, 1000],
                FontFile2=PLACEHOLDER,
                FontName=Name.GlyphLessFont,
                ItalicAngle=0,
                StemV=80,
                Type=Name.FontDescriptor,
            )
        )
        font_descriptor.FontFile2 = pdf.make_stream(self.GLYPHLESS_FONT)
        cid_font_type2.FontDescriptor = font_descriptor
        return basefont


def register_glyphlessfont(pdf: Pdf):
    """Register the glyphless font in ``pdf`` and return its font dictionary.

    Kept so that callers of the function-style API continue to work; it
    delegates to ``GlyphlessFont().register(pdf)``.
    """
    return GlyphlessFont().register(pdf)
class ContentStreamBuilder:
@@ -326,8 +353,7 @@ class _PikepdfCanvasAccessor:
def draw_image(self, image: Path | str | Image.Image, x, y, width, height):
"""Draw image at (x,y) with width w and height h."""
with self.save_state():
self.cm(Matrix(width, 0, 0, height, x, y))
with self.save_state(cm=Matrix(width, 0, 0, height, x, y)):
if isinstance(image, (Path, str)):
image = Image.open(image)
image.load()
@@ -395,23 +421,22 @@ class PikepdfCanvas:
self.page_size = page_size
self._pdf = Pdf.new()
self._page = self._pdf.add_blank_page(page_size=page_size)
self._page.Resources = Dictionary(Font=Dictionary(), XObject=Dictionary())
self._cs = ContentStreamBuilder()
self._images: list[LoadedImage] = []
self._accessor = _PikepdfCanvasAccessor(self._cs, self._images)
self._stack_depth = 0
self._font_name = Name("/f-0-0")
self.do.push()
def add_font(self, resource_name: Name, font: Font):
    """Register a font with the PDF and add it to the page's font resources."""
    # Register first so the font object exists before the page refers to it.
    registered = font.register(self._pdf)
    page_fonts = self._page.Resources.Font
    page_fonts[resource_name] = registered
@property
def do(self) -> _PikepdfCanvasAccessor:
    """Return the accessor used to operate on the current graphics state."""
    accessor = self._accessor
    return accessor
def string_width(self, s: str, fontname, fontsize):
    """Approximate how wide ``s`` is when rendered at ``fontsize``.

    ``fontname`` is accepted but not used in the calculation: every character
    is counted as ``fontsize / CHAR_ASPECT`` wide.
    """
    # NFKC: split ligatures, combine diacritics
    normalized = unicodedata.normalize("NFKC", s)
    per_char_width = fontsize / CHAR_ASPECT
    return len(normalized) * per_char_width
def _save_image(self, li: LoadedImage):
return self._pdf.make_stream(
li.image.tobytes(),
@@ -434,9 +459,6 @@ class PikepdfCanvas:
"rendering may be incorrect"
)
self._page.Contents = self._pdf.make_stream(self._cs.build())
self._page.MediaBox = [0, 0, *self.page_size]
self._page.Resources = Dictionary(Font=Dictionary(), XObject=Dictionary())
self._page.Resources.Font[self._font_name] = register_glyphlessfont(self._pdf)
for li in self._images:
self._page.Resources.XObject[li.name] = self._save_image(li)
self._pdf.save(output_file)
@@ -450,8 +472,8 @@ class PikepdfText:
self._cs.begin_text()
self._direction = direction
def font(self, font: Name, size: float):
    """Select the font and size used for subsequent text.

    Args:
        font: Resource name of the font to select. It is passed through
            unchanged rather than being replaced by a fixed name.
        size: Font size to set.

    Returns:
        This object, so that calls can be chained.
    """
    self._cs.set_text_font(font, size)
    return self
def render_mode(self, mode):

View File

@@ -17,10 +17,15 @@ from math import atan, cos, pi
from pathlib import Path
from xml.etree import ElementTree
from pikepdf import Matrix, Rectangle
from pikepdf import Matrix, Name, Rectangle
from ocrmypdf.hocrtransform._canvas import (
Font,
GlyphlessFont,
PikepdfText,
TextDirection,
)
from ocrmypdf.hocrtransform._canvas import PikepdfCanvas as Canvas
from ocrmypdf.hocrtransform._canvas import PikepdfText, TextDirection
from ocrmypdf.hocrtransform.color import (
BLACK,
BLUE,
@@ -70,10 +75,20 @@ class HocrTransform:
re.VERBOSE,
)
def __init__(self, *, hocr_filename: str | Path, dpi: float, debug: bool = False):
def __init__(
self,
*,
hocr_filename: str | Path,
dpi: float,
debug: bool = False,
fontname: Name = Name("/f-0-0"),
font: Font = GlyphlessFont(),
):
"""Initialize the HocrTransform object."""
self.dpi = dpi
self.hocr = ElementTree.parse(os.fspath(hocr_filename))
self._fontname = fontname
self._font = font
# if the hOCR file has a namespace, ElementTree requires its use to
# find elements
@@ -147,7 +162,6 @@ class HocrTransform:
*,
out_filename: Path,
image_filename: Path | None = None,
fontname: str = "Helvetica",
invisible_text: bool = True,
) -> None:
"""Creates a PDF file with an image superimposed on top of the text.
@@ -162,7 +176,6 @@ class HocrTransform:
out_filename: Path of PDF to write.
image_filename: Image to use for this file. If omitted, the OCR text
is shown.
fontname: Name of font to use.
invisible_text: If True, text is rendered invisible so that is
selectable but never drawn. If False, text is visible and may
be seen if the image is skipped or deleted in Acrobat.
@@ -170,6 +183,7 @@ class HocrTransform:
# create the PDF file
# page size in points (1/72 in.)
canvas = Canvas(page_size=(self.width, self.height))
canvas.add_font(self._fontname, self._font)
page_matrix = (
Matrix()
.translated(0, self.height)
@@ -195,7 +209,6 @@ class HocrTransform:
canvas,
line,
"ocrx_word",
fontname,
invisible_text,
direction,
inject_word_breaks,
@@ -208,7 +221,6 @@ class HocrTransform:
canvas,
root,
"ocrx_word",
fontname,
invisible_text,
TextDirection.LTR,
True,
@@ -256,7 +268,6 @@ class HocrTransform:
canvas: Canvas,
line: Element | None,
elemclass: str,
fontname: str,
invisible_text: bool,
text_direction: TextDirection,
inject_word_breaks: bool,
@@ -301,7 +312,7 @@ class HocrTransform:
# on a sloped baseline and the edge of the bounding box.
line_box_height = abs(line_box.height) / cos(angle)
fontsize = line_box_height + intercept
text.font(fontname, fontsize)
text.font(self._fontname, fontsize)
if invisible_text or True:
text.render_mode(3) # Invisible (indicates OCR text)
@@ -314,7 +325,6 @@ class HocrTransform:
for elem, next_elem in pairwise(elements + [None]):
self._do_line_word(
canvas,
fontname,
line_matrix,
text,
fontsize,
@@ -328,7 +338,6 @@ class HocrTransform:
def _do_line_word(
self,
canvas: Canvas,
fontname,
line_matrix: Matrix,
text: PikepdfText,
fontsize: float,
@@ -348,7 +357,7 @@ class HocrTransform:
if hocr_box is None:
return
box = line_matrix.inverse().transform(hocr_box)
font_width = canvas.string_width(elemtxt, fontname, fontsize)
font_width = self._font.text_width(elemtxt, fontsize)
# Debug sketches
self._debug_draw_word_triangle(canvas, box)
@@ -379,7 +388,7 @@ class HocrTransform:
space_box = Rectangle(next_box.urx, box.lly, box.llx, next_box.ury)
self._debug_draw_space_bbox(canvas, space_box)
text.text_transform(Matrix(1, 0, 0, 1, space_box.llx, 0))
space_width = canvas.string_width(' ', fontname, fontsize)
space_width = self._font.text_width(' ', fontsize)
if space_width > 0:
text.horiz_scale(100 * space_box.width / space_width)
text.show(' ')