Files
OCRmyPDF/scripts/generate_glyphless_font.py
James R. Barlow 64726f97b3 Add font infrastructure and glyphless font
- Add font module with FontManager, FontProvider, MultiFontManager,
  and SystemFontProvider for multilingual font support
- Add NotoSans-Regular.ttf for Latin text rendering
- Replace pdf.ttf with Occulta.ttf glyphless font
- Add script to generate new Occulta glyphless font
- System font discovery for CJK, Arabic, Devanagari scripts
2026-01-06 13:44:54 -08:00

232 lines
6.5 KiB
Python

#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2024 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Generate the Occulta glyphless font for OCRmyPDF.
Occulta (Latin for "hidden") is a glyphless font designed for invisible text layers
in searchable PDFs. It has proper Unicode cmap coverage using format 13 (many-to-one)
for efficient mapping of all BMP codepoints to a small set of width-specific glyphs.
Features:
- Full BMP coverage (U+0000 to U+FFFF)
- Width-aware glyphs for proper text selection:
- Zero-width for combining marks and invisible characters
- Regular width (500 units) for Latin, Greek, Cyrillic, Arabic, Hebrew, etc.
- Double width (1000 units) for CJK and fullwidth characters
- Uses cmap format 13 (many-to-one) for ~12KB size vs ~780KB with format 12
- Compatible with fpdf2 and other modern PDF libraries
Usage:
python scripts/generate_glyphless_font.py
Output:
src/ocrmypdf/data/Occulta.ttf
"""
from __future__ import annotations
import unicodedata
from pathlib import Path
from fontTools.fontBuilder import FontBuilder
from fontTools.ttLib import TTFont
from fontTools.ttLib.tables._c_m_a_p import CmapSubtable
from fontTools.ttLib.tables._g_l_y_f import Glyph
# Output path relative to this script
OUTPUT_PATH = Path(__file__).parent.parent / "src" / "ocrmypdf" / "data" / "Occulta.ttf"
# Font metrics (units per em = 1000)
UNITS_PER_EM = 1000
ASCENT = 800
DESCENT = -200
# Glyph definitions: (name, advance_width, left_side_bearing)
GLYPHS = [
(".notdef", 500, 0), # Required, used for unmapped characters
("space", 500, 0), # U+0020 SPACE
("nbspace", 500, 0), # U+00A0 NO-BREAK SPACE
("blank0", 0, 0), # Zero-width (combining marks, ZWNJ, ZWJ, BOM)
("blank1", 500, 0), # Regular width (most scripts)
("blank2", 1000, 0), # Double width (CJK, fullwidth)
]
# Explicit zero-width character codepoints
ZERO_WIDTH_CHARS = frozenset(
[
0x200B, # ZERO WIDTH SPACE
0x200C, # ZERO WIDTH NON-JOINER
0x200D, # ZERO WIDTH JOINER
0xFEFF, # ZERO WIDTH NO-BREAK SPACE (BOM)
0x200E, # LEFT-TO-RIGHT MARK
0x200F, # RIGHT-TO-LEFT MARK
0x202A, # LEFT-TO-RIGHT EMBEDDING
0x202B, # RIGHT-TO-LEFT EMBEDDING
0x202C, # POP DIRECTIONAL FORMATTING
0x202D, # LEFT-TO-RIGHT OVERRIDE
0x202E, # RIGHT-TO-LEFT OVERRIDE
0x2060, # WORD JOINER
0x2061, # FUNCTION APPLICATION
0x2062, # INVISIBLE TIMES
0x2063, # INVISIBLE SEPARATOR
0x2064, # INVISIBLE PLUS
]
)
def classify_codepoint(codepoint: int) -> str:
"""Classify a Unicode codepoint into one of our glyph categories.
Args:
codepoint: Unicode codepoint (0x0000 to 0xFFFF)
Returns:
Glyph name to map this codepoint to
"""
# Special cases first
if codepoint == 0x0020:
return "space"
if codepoint == 0x00A0:
return "nbspace"
if codepoint in ZERO_WIDTH_CHARS:
return "blank0"
# Use Unicode properties for the rest
char = chr(codepoint)
try:
category = unicodedata.category(char)
east_asian_width = unicodedata.east_asian_width(char)
# Combining marks are zero-width
if category.startswith("M"):
return "blank0"
# Wide and Fullwidth characters are double-width
if east_asian_width in ("W", "F"):
return "blank2"
# Everything else is regular width
return "blank1"
except (ValueError, TypeError):
# Fallback for any edge cases
return "blank1"
def build_cmap() -> dict[int, str]:
"""Build the Unicode to glyph name mapping for the entire BMP.
Returns:
Dictionary mapping codepoints to glyph names
"""
return {cp: classify_codepoint(cp) for cp in range(0x10000)}
def create_font() -> TTFont:
"""Create the Occulta glyphless font.
Returns:
TTFont object ready to be saved
"""
glyph_names = [g[0] for g in GLYPHS]
# Start building the font
fb = FontBuilder(UNITS_PER_EM, isTTF=True)
fb.setupGlyphOrder(glyph_names)
# Create empty (invisible) glyphs
glyphs = {}
for name, _, _ in GLYPHS:
glyph = Glyph()
glyph.numberOfContours = 0
glyphs[name] = glyph
fb.setupGlyf(glyphs)
# Set up horizontal metrics
metrics = {name: (width, lsb) for name, width, lsb in GLYPHS}
fb.setupHorizontalMetrics(metrics)
# Minimal cmap to satisfy FontBuilder (we'll replace it later)
fb.setupCharacterMap({0x0020: "space", 0x00A0: "nbspace"})
# Set up other required tables
fb.setupHorizontalHeader(ascent=ASCENT, descent=DESCENT)
fb.setupOS2(
sTypoAscender=ASCENT,
sTypoDescender=DESCENT,
sTypoLineGap=0,
usWinAscent=UNITS_PER_EM,
usWinDescent=abs(DESCENT),
sxHeight=500,
sCapHeight=700,
)
import time
# Use current time for font timestamps
now = int(time.time())
fb.setupHead(unitsPerEm=UNITS_PER_EM, created=now, modified=now)
fb.setupPost()
fb.setupNameTable(
{
"familyName": "Occulta",
"styleName": "Regular",
"uniqueFontIdentifier": "OCRmyPDF;Occulta-Regular;2026",
"fullName": "Occulta Regular",
"version": "Version 2.0",
"psName": "Occulta-Regular",
}
)
# Build the font
font = fb.font
# Now replace the cmap with format 13 for efficient many-to-one mapping
char_to_glyph = build_cmap()
cmap13 = CmapSubtable.newSubtable(13)
cmap13.platformID = 3 # Windows
cmap13.platEncID = 10 # Unicode full repertoire
cmap13.language = 0
cmap13.cmap = char_to_glyph
font["cmap"].tables = [cmap13]
return font
def main() -> None:
"""Generate the Occulta font and save it."""
print("Generating Occulta glyphless font...")
font = create_font()
# Create output directory if needed
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
# Save the font
font.save(str(OUTPUT_PATH))
font.close()
# Report statistics
size = OUTPUT_PATH.stat().st_size
print(f"Saved to: {OUTPUT_PATH}")
print(f"Size: {size:,} bytes")
# Verify cmap
font = TTFont(str(OUTPUT_PATH))
for table in font["cmap"].tables:
print(
f"cmap: Platform {table.platformID}, "
f"Encoding {table.platEncID}, "
f"Format {table.format}, "
f"{len(table.cmap)} mappings"
)
font.close()
print("Done!")
if __name__ == "__main__":
main()