mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-18 03:26:17 -04:00
Restructure hocrtransform submodule to avoid having everything in __init__
This commit is contained in:
@@ -1,7 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-FileCopyrightText: 2010 Jonathan Brinley
|
||||
# SPDX-FileCopyrightText: 2013-2014 Julien Pfefferkorn
|
||||
# SPDX-FileCopyrightText: 2022 James R. Barlow
|
||||
# SPDX-FileCopyrightText: 2023 James R. Barlow
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""Transform .hocr and page image to text PDF."""
|
||||
@@ -9,432 +6,18 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
from dataclasses import dataclass
|
||||
from itertools import pairwise
|
||||
from math import atan, cos, pi
|
||||
from pathlib import Path
|
||||
from typing import Any, NamedTuple
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from pikepdf import Matrix, Rectangle
|
||||
|
||||
from ocrmypdf.hocrtransform._canvas import PikepdfCanvas as Canvas
|
||||
from ocrmypdf.hocrtransform._canvas import PikepdfText
|
||||
from ocrmypdf.hocrtransform.color import (
|
||||
BLACK,
|
||||
BLUE,
|
||||
CYAN,
|
||||
DARKGREEN,
|
||||
GREEN,
|
||||
MAGENTA,
|
||||
RED,
|
||||
from ocrmypdf.hocrtransform._hocr import (
|
||||
DebugRenderOptions,
|
||||
HocrTransform,
|
||||
HocrTransformError,
|
||||
)
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
INCH = 72.0
|
||||
|
||||
Element = ElementTree.Element
|
||||
|
||||
|
||||
@dataclass
|
||||
class DebugRenderOptions:
|
||||
"""A class for managing rendering options."""
|
||||
|
||||
render_paragraph_bbox: bool
|
||||
render_baseline: bool
|
||||
render_triangle: bool
|
||||
render_line_bbox: bool
|
||||
render_word_bbox: bool
|
||||
render_space_bbox: bool
|
||||
|
||||
|
||||
class HocrTransformError(Exception):
|
||||
"""Error while applying hOCR transform."""
|
||||
|
||||
|
||||
class HocrTransform:
|
||||
"""A class for converting documents from the hOCR format.
|
||||
|
||||
For details of the hOCR format, see:
|
||||
http://kba.cloud/hocr-spec/.
|
||||
"""
|
||||
|
||||
box_pattern = re.compile(r'bbox (\d+) (\d+) (\d+) (\d+)')
|
||||
baseline_pattern = re.compile(
|
||||
r'''
|
||||
baseline \s+
|
||||
([\-\+]?\d*\.?\d*) \s+ # +/- decimal float
|
||||
([\-\+]?\d+) # +/- int''',
|
||||
re.VERBOSE,
|
||||
)
|
||||
|
||||
def __init__(self, *, hocr_filename: str | Path, dpi: float):
|
||||
"""Initialize the HocrTransform object."""
|
||||
self.dpi = dpi
|
||||
self.hocr = ElementTree.parse(os.fspath(hocr_filename))
|
||||
|
||||
# if the hOCR file has a namespace, ElementTree requires its use to
|
||||
# find elements
|
||||
matches = re.match(r'({.*})html', self.hocr.getroot().tag)
|
||||
self.xmlns = ''
|
||||
if matches:
|
||||
self.xmlns = matches.group(1)
|
||||
|
||||
self.width, self.height = None, None
|
||||
for div in self.hocr.findall(self._child_xpath('div', 'ocr_page')):
|
||||
coords = self.element_coordinates(div)
|
||||
self.width = (coords.urx - coords.llx) / (self.dpi / INCH)
|
||||
self.height = (coords.ury - coords.lly) / (self.dpi / INCH)
|
||||
# there shouldn't be more than one, and if there is, we don't want
|
||||
# it
|
||||
break
|
||||
if self.width is None or self.height is None:
|
||||
raise HocrTransformError("hocr file is missing page dimensions")
|
||||
self.render_options = DebugRenderOptions(
|
||||
render_baseline=False,
|
||||
render_triangle=False,
|
||||
render_line_bbox=False,
|
||||
render_word_bbox=True,
|
||||
render_paragraph_bbox=False,
|
||||
render_space_bbox=True,
|
||||
)
|
||||
|
||||
def _get_element_text(self, element: Element):
|
||||
"""Return the textual content of the element and its children."""
|
||||
text = ''
|
||||
if element.text is not None:
|
||||
text += element.text
|
||||
for child in element:
|
||||
text += self._get_element_text(child)
|
||||
if element.tail is not None:
|
||||
text += element.tail
|
||||
return text
|
||||
|
||||
@classmethod
|
||||
def element_coordinates(cls, element: Element) -> Rectangle | None:
|
||||
"""Get coordinates of the bounding box around an element."""
|
||||
if 'title' in element.attrib:
|
||||
matches = cls.box_pattern.search(element.attrib['title'])
|
||||
if matches:
|
||||
return Rectangle(
|
||||
float(matches.group(1)), # llx = left
|
||||
float(matches.group(2)), # lly = top
|
||||
float(matches.group(3)), # urx = right
|
||||
float(matches.group(4)), # ury = bottom
|
||||
)
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def baseline(cls, element: Element) -> tuple[float, float]:
|
||||
"""Get baseline's slope and intercept."""
|
||||
if 'title' in element.attrib:
|
||||
matches = cls.baseline_pattern.search(element.attrib['title'])
|
||||
if matches:
|
||||
return float(matches.group(1)), int(matches.group(2))
|
||||
return (0.0, 0.0)
|
||||
|
||||
def _child_xpath(self, html_tag: str, html_class: str | None = None) -> str:
|
||||
xpath = f".//{self.xmlns}{html_tag}"
|
||||
if html_class:
|
||||
xpath += f"[@class='{html_class}']"
|
||||
return xpath
|
||||
|
||||
@classmethod
|
||||
def normalize_text(cls, s: str) -> str:
|
||||
"""Normalize the given text using the NFKC normalization form."""
|
||||
return unicodedata.normalize("NFKC", s)
|
||||
|
||||
def to_pdf(
|
||||
self,
|
||||
*,
|
||||
out_filename: Path,
|
||||
image_filename: Path | None = None,
|
||||
fontname: str = "Helvetica",
|
||||
invisible_text: bool = False,
|
||||
) -> None:
|
||||
"""Creates a PDF file with an image superimposed on top of the text.
|
||||
|
||||
Text is positioned according to the bounding box of the lines in
|
||||
the hOCR file.
|
||||
The image need not be identical to the image used to create the hOCR
|
||||
file.
|
||||
It can have a lower resolution, different color mode, etc.
|
||||
|
||||
Arguments:
|
||||
out_filename: Path of PDF to write.
|
||||
image_filename: Image to use for this file. If omitted, the OCR text
|
||||
is shown.
|
||||
fontname: Name of font to use.
|
||||
invisible_text: If True, text is rendered invisible so that is
|
||||
selectable but never drawn. If False, text is visible and may
|
||||
be seen if the image is skipped or deleted in Acrobat.
|
||||
"""
|
||||
# create the PDF file
|
||||
# page size in points (1/72 in.)
|
||||
canvas = Canvas(
|
||||
out_filename,
|
||||
page_size=(self.width, self.height),
|
||||
)
|
||||
canvas.push()
|
||||
page_matrix = (
|
||||
Matrix()
|
||||
.translated(0, self.height)
|
||||
.scaled(1, -1)
|
||||
.scaled(INCH / self.dpi, INCH / self.dpi)
|
||||
)
|
||||
canvas.cm(page_matrix)
|
||||
log.debug(page_matrix)
|
||||
|
||||
self._debug_draw_paragraph_boxes(canvas)
|
||||
|
||||
found_lines = False
|
||||
for line in (
|
||||
element
|
||||
for element in self.hocr.iterfind(self._child_xpath('span'))
|
||||
if 'class' in element.attrib
|
||||
and element.attrib['class'] in {'ocr_header', 'ocr_line', 'ocr_textfloat'}
|
||||
):
|
||||
found_lines = True
|
||||
self._do_line(
|
||||
canvas,
|
||||
line,
|
||||
"ocrx_word",
|
||||
fontname,
|
||||
invisible_text,
|
||||
)
|
||||
|
||||
if not found_lines:
|
||||
# Tesseract did not report any lines (just words)
|
||||
root = self.hocr.find(self._child_xpath('div', 'ocr_page'))
|
||||
self._do_line(
|
||||
canvas,
|
||||
root,
|
||||
"ocrx_word",
|
||||
fontname,
|
||||
invisible_text,
|
||||
)
|
||||
canvas.pop()
|
||||
# put the image on the page, scaled to fill the page
|
||||
if image_filename is not None:
|
||||
canvas.draw_image(
|
||||
image_filename, 0, 0, width=self.width, height=self.height
|
||||
)
|
||||
|
||||
# finish up the page and save it
|
||||
canvas.save()
|
||||
|
||||
@classmethod
|
||||
def polyval(cls, poly, x): # pragma: no cover
|
||||
"""Calculate the value of a polynomial at a point."""
|
||||
return x * poly[0] + poly[1]
|
||||
|
||||
def _do_line(
|
||||
self,
|
||||
canvas: Canvas,
|
||||
line: Element | None,
|
||||
elemclass: str,
|
||||
fontname: str,
|
||||
invisible_text: bool,
|
||||
):
|
||||
"""Render the text for a given line.
|
||||
|
||||
The canvas's coordinate system must be configured so that hOCR pixel
|
||||
coordinates are mapped to PDF coordinates.
|
||||
"""
|
||||
if line is None:
|
||||
return
|
||||
line_box = self.element_coordinates(line)
|
||||
assert line_box.ury > line_box.lly # lly is top, ury is bottom
|
||||
|
||||
self._debug_draw_line_bbox(canvas, line_box)
|
||||
|
||||
# Baseline is a polynomial (usually straight line) that describes the
|
||||
# text baseline relative to the bottom left corner of the line bounding
|
||||
# box.
|
||||
bottom_left_corner = line_box.llx, line_box.ury
|
||||
slope, intercept = self.baseline(line)
|
||||
if abs(slope) < 0.005:
|
||||
slope = 0.0
|
||||
angle = atan(slope)
|
||||
|
||||
# Setup a new coordinate system on the line box's intercept and rotated by
|
||||
# its slope.
|
||||
canvas.push()
|
||||
line_matrix = (
|
||||
Matrix()
|
||||
.translated(*bottom_left_corner)
|
||||
.translated(0, intercept)
|
||||
.rotated(angle / pi * 180)
|
||||
)
|
||||
canvas.cm(line_matrix)
|
||||
log.debug(line_matrix)
|
||||
text = canvas.begin_text()
|
||||
|
||||
# Don't allow the font to break out of the bounding box. Division by
|
||||
# cos_a accounts for extra clearance between the glyph's vertical axis
|
||||
# on a sloped baseline and the edge of the bounding box.
|
||||
line_box_height = abs(line_box.height) / cos(angle)
|
||||
fontsize = line_box_height + intercept
|
||||
text.set_font(fontname, fontsize)
|
||||
if invisible_text or True:
|
||||
text.set_render_mode(3) # Invisible (indicates OCR text)
|
||||
|
||||
self._debug_draw_baseline(canvas, line_matrix.inverse().transform(line_box), 0)
|
||||
|
||||
canvas.set_fill_color(BLACK) # text in black
|
||||
elements = line.findall(self._child_xpath('span', elemclass))
|
||||
for elem, next_elem in pairwise(elements + [None]):
|
||||
self._do_line_word(
|
||||
canvas,
|
||||
fontname,
|
||||
line_matrix,
|
||||
text,
|
||||
fontsize,
|
||||
elem,
|
||||
next_elem,
|
||||
)
|
||||
canvas.draw_text(text)
|
||||
canvas.pop()
|
||||
|
||||
def _do_line_word(
|
||||
self,
|
||||
canvas: Canvas,
|
||||
fontname,
|
||||
line_matrix: Matrix,
|
||||
text: PikepdfText,
|
||||
fontsize: float,
|
||||
elem: Element,
|
||||
next_elem: Element | None,
|
||||
):
|
||||
"""Render the text for a single word."""
|
||||
elemtxt = self.normalize_text(self._get_element_text(elem).strip())
|
||||
if elemtxt == '':
|
||||
return
|
||||
|
||||
hocr_box = self.element_coordinates(elem)
|
||||
if hocr_box is None:
|
||||
return
|
||||
box = line_matrix.inverse().transform(hocr_box)
|
||||
font_width = canvas.string_width(elemtxt, fontname, fontsize)
|
||||
|
||||
# Debug sketches
|
||||
self._debug_draw_word_triangle(canvas, box)
|
||||
self._debug_draw_word_bbox(canvas, box)
|
||||
|
||||
# If this word is 0 units wide, our best bet seems to be to suppress this text
|
||||
if font_width > 0:
|
||||
text.set_text_transform(Matrix(1, 0, 0, 1, box.llx, 0))
|
||||
text.set_horiz_scale(100 * box.width / font_width)
|
||||
text.show(elemtxt)
|
||||
|
||||
# Get coordinates of the next word (if there is one)
|
||||
hocr_next_box = (
|
||||
self.element_coordinates(next_elem) if next_elem is not None else None
|
||||
)
|
||||
if hocr_next_box is not None:
|
||||
# Render a space this word and the next word. The explicit space helps
|
||||
# PDF viewers identify the word break, and horizontally scaling it to
|
||||
# occupy the space the between the words helps the PDF viewer
|
||||
# avoid combiningthewordstogether.
|
||||
next_box = line_matrix.inverse().transform(hocr_next_box)
|
||||
space_box = Rectangle(box.urx, box.lly, next_box.llx, next_box.ury)
|
||||
self._debug_draw_space_bbox(canvas, space_box)
|
||||
text.set_text_transform(Matrix(1, 0, 0, 1, space_box.llx, 0))
|
||||
space_width = canvas.string_width(' ', fontname, fontsize)
|
||||
space_box_width = space_box.urx - space_box.llx
|
||||
text.set_horiz_scale(100 * space_box_width / space_width)
|
||||
text.show(' ')
|
||||
|
||||
def _debug_draw_paragraph_boxes(self, canvas: Canvas, color=CYAN):
|
||||
"""Draw boxes around paragraphs in the document."""
|
||||
if not self.render_options.render_paragraph_bbox: # pragma: no cover
|
||||
return
|
||||
for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
|
||||
elemtxt = self._get_element_text(elem).rstrip()
|
||||
if len(elemtxt) == 0:
|
||||
continue
|
||||
ocr_par = self.element_coordinates(elem)
|
||||
# draw box around paragraph
|
||||
canvas.set_stroke_color(color)
|
||||
canvas.set_line_width(0.1) # no line for bounding box
|
||||
canvas.rect(ocr_par.llx, ocr_par.lly, ocr_par.width, ocr_par.height, fill=0)
|
||||
|
||||
def _debug_draw_line_bbox(self, canvas: Canvas, line_box, color=BLUE):
|
||||
"""Render the bounding box of a text line."""
|
||||
if not self.render_options.render_line_bbox: # pragma: no cover
|
||||
return
|
||||
canvas.push()
|
||||
canvas.set_stroke_color(color)
|
||||
canvas.set_line_width(0.15)
|
||||
canvas.rect(
|
||||
line_box.llx,
|
||||
line_box.lly,
|
||||
line_box.width,
|
||||
line_box.height,
|
||||
fill=0,
|
||||
)
|
||||
canvas.pop()
|
||||
|
||||
def _debug_draw_word_triangle(self, canvas: Canvas, box, color=RED, line_width=0.1):
|
||||
"""Render a triangle that conveys word height and drawing direction."""
|
||||
if not self.render_options.render_triangle: # pragma: no cover
|
||||
return
|
||||
canvas.push()
|
||||
canvas.set_stroke_color(color)
|
||||
canvas.set_line_width(line_width)
|
||||
# Draw a triangle that conveys word height and drawing direction
|
||||
canvas.line(box.llx, box.lly, box.urx, box.lly) # across bottom
|
||||
canvas.line(box.urx, box.lly, box.llx, box.ury) # diagonal
|
||||
canvas.line(box.llx, box.lly, box.llx, box.ury) # rise
|
||||
canvas.pop()
|
||||
|
||||
def _debug_draw_word_bbox(self, canvas: Canvas, box, color=GREEN, line_width=0.1):
|
||||
"""Render a box depicting the word."""
|
||||
if not self.render_options.render_word_bbox: # pragma: no cover
|
||||
return
|
||||
canvas.push()
|
||||
canvas.set_dashes()
|
||||
canvas.set_stroke_color(color)
|
||||
canvas.set_line_width(line_width)
|
||||
canvas.rect(box.llx, box.lly, box.width, box.height, fill=0)
|
||||
canvas.pop()
|
||||
|
||||
def _debug_draw_space_bbox(
|
||||
self, canvas: Canvas, box, color=DARKGREEN, line_width=0.1
|
||||
):
|
||||
"""Render a box depicting the space between two words."""
|
||||
if not self.render_options.render_space_bbox: # pragma: no cover
|
||||
return
|
||||
canvas.push()
|
||||
canvas.set_dashes()
|
||||
canvas.set_fill_color(color)
|
||||
canvas.set_line_width(line_width)
|
||||
canvas.rect(box.llx, box.lly, box.width, box.height, fill=1)
|
||||
canvas.pop()
|
||||
|
||||
def _debug_draw_baseline(
|
||||
self, canvas, line_box, baseline_lly, color=MAGENTA, line_width=0.25
|
||||
):
|
||||
"""Render the text baseline."""
|
||||
if not self.render_options.render_baseline:
|
||||
return
|
||||
canvas.push()
|
||||
canvas.set_dashes()
|
||||
canvas.set_stroke_color(color)
|
||||
canvas.set_line_width(line_width)
|
||||
canvas.line(
|
||||
line_box.llx,
|
||||
baseline_lly,
|
||||
line_box.urx,
|
||||
baseline_lly,
|
||||
)
|
||||
canvas.pop()
|
||||
|
||||
__all__ = (
|
||||
'HocrTransform',
|
||||
'HocrTransformError',
|
||||
'DebugRenderOptions',
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description='Convert hocr file to PDF')
|
||||
@@ -458,12 +41,6 @@ if __name__ == "__main__":
|
||||
default=None,
|
||||
help='Path to the image to be placed above the text',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--interword-spaces',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='Add spaces between words',
|
||||
)
|
||||
parser.add_argument('hocrfile', help='Path to the hocr file to be parsed')
|
||||
parser.add_argument('outputfile', help='Path to the PDF file to be generated')
|
||||
args = parser.parse_args()
|
||||
@@ -473,5 +50,4 @@ if __name__ == "__main__":
|
||||
out_filename=args.outputfile,
|
||||
image_filename=args.image,
|
||||
show_bounding_boxes=args.boundingboxes,
|
||||
interword_spaces=args.interword_spaces,
|
||||
)
|
||||
|
||||
433
src/ocrmypdf/hocrtransform/_hocr.py
Normal file
433
src/ocrmypdf/hocrtransform/_hocr.py
Normal file
@@ -0,0 +1,433 @@
|
||||
# SPDX-FileCopyrightText: 2010 Jonathan Brinley
|
||||
# SPDX-FileCopyrightText: 2013-2014 Julien Pfefferkorn
|
||||
# SPDX-FileCopyrightText: 2023 James R. Barlow
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""hOCR transform implementation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
from dataclasses import dataclass
|
||||
from itertools import pairwise
|
||||
from math import atan, cos, pi
|
||||
from pathlib import Path
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from pikepdf import Matrix, Rectangle
|
||||
|
||||
from ocrmypdf.hocrtransform._canvas import PikepdfCanvas as Canvas
|
||||
from ocrmypdf.hocrtransform._canvas import PikepdfText
|
||||
from ocrmypdf.hocrtransform.color import (
|
||||
BLACK,
|
||||
BLUE,
|
||||
CYAN,
|
||||
DARKGREEN,
|
||||
GREEN,
|
||||
MAGENTA,
|
||||
RED,
|
||||
)
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
INCH = 72.0
|
||||
|
||||
Element = ElementTree.Element
|
||||
|
||||
|
||||
@dataclass
|
||||
class DebugRenderOptions:
|
||||
"""A class for managing rendering options."""
|
||||
|
||||
render_paragraph_bbox: bool
|
||||
render_baseline: bool
|
||||
render_triangle: bool
|
||||
render_line_bbox: bool
|
||||
render_word_bbox: bool
|
||||
render_space_bbox: bool
|
||||
|
||||
|
||||
class HocrTransformError(Exception):
|
||||
"""Error while applying hOCR transform."""
|
||||
|
||||
|
||||
class HocrTransform:
|
||||
"""A class for converting documents from the hOCR format.
|
||||
|
||||
For details of the hOCR format, see:
|
||||
http://kba.cloud/hocr-spec/.
|
||||
"""
|
||||
|
||||
box_pattern = re.compile(r'bbox (\d+) (\d+) (\d+) (\d+)')
|
||||
baseline_pattern = re.compile(
|
||||
r'''
|
||||
baseline \s+
|
||||
([\-\+]?\d*\.?\d*) \s+ # +/- decimal float
|
||||
([\-\+]?\d+) # +/- int''',
|
||||
re.VERBOSE,
|
||||
)
|
||||
|
||||
def __init__(self, *, hocr_filename: str | Path, dpi: float):
|
||||
"""Initialize the HocrTransform object."""
|
||||
self.dpi = dpi
|
||||
self.hocr = ElementTree.parse(os.fspath(hocr_filename))
|
||||
|
||||
# if the hOCR file has a namespace, ElementTree requires its use to
|
||||
# find elements
|
||||
matches = re.match(r'({.*})html', self.hocr.getroot().tag)
|
||||
self.xmlns = ''
|
||||
if matches:
|
||||
self.xmlns = matches.group(1)
|
||||
|
||||
self.width, self.height = None, None
|
||||
for div in self.hocr.findall(self._child_xpath('div', 'ocr_page')):
|
||||
coords = self.element_coordinates(div)
|
||||
self.width = (coords.urx - coords.llx) / (self.dpi / INCH)
|
||||
self.height = (coords.ury - coords.lly) / (self.dpi / INCH)
|
||||
# there shouldn't be more than one, and if there is, we don't want
|
||||
# it
|
||||
break
|
||||
if self.width is None or self.height is None:
|
||||
raise HocrTransformError("hocr file is missing page dimensions")
|
||||
self.render_options = DebugRenderOptions(
|
||||
render_baseline=False,
|
||||
render_triangle=False,
|
||||
render_line_bbox=False,
|
||||
render_word_bbox=True,
|
||||
render_paragraph_bbox=False,
|
||||
render_space_bbox=True,
|
||||
)
|
||||
|
||||
def _get_element_text(self, element: Element):
|
||||
"""Return the textual content of the element and its children."""
|
||||
text = ''
|
||||
if element.text is not None:
|
||||
text += element.text
|
||||
for child in element:
|
||||
text += self._get_element_text(child)
|
||||
if element.tail is not None:
|
||||
text += element.tail
|
||||
return text
|
||||
|
||||
@classmethod
|
||||
def element_coordinates(cls, element: Element) -> Rectangle | None:
|
||||
"""Get coordinates of the bounding box around an element."""
|
||||
if 'title' in element.attrib:
|
||||
matches = cls.box_pattern.search(element.attrib['title'])
|
||||
if matches:
|
||||
return Rectangle(
|
||||
float(matches.group(1)), # llx = left
|
||||
float(matches.group(2)), # lly = top
|
||||
float(matches.group(3)), # urx = right
|
||||
float(matches.group(4)), # ury = bottom
|
||||
)
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def baseline(cls, element: Element) -> tuple[float, float]:
|
||||
"""Get baseline's slope and intercept."""
|
||||
if 'title' in element.attrib:
|
||||
matches = cls.baseline_pattern.search(element.attrib['title'])
|
||||
if matches:
|
||||
return float(matches.group(1)), int(matches.group(2))
|
||||
return (0.0, 0.0)
|
||||
|
||||
def _child_xpath(self, html_tag: str, html_class: str | None = None) -> str:
|
||||
xpath = f".//{self.xmlns}{html_tag}"
|
||||
if html_class:
|
||||
xpath += f"[@class='{html_class}']"
|
||||
return xpath
|
||||
|
||||
@classmethod
|
||||
def normalize_text(cls, s: str) -> str:
|
||||
"""Normalize the given text using the NFKC normalization form."""
|
||||
return unicodedata.normalize("NFKC", s)
|
||||
|
||||
def to_pdf(
|
||||
self,
|
||||
*,
|
||||
out_filename: Path,
|
||||
image_filename: Path | None = None,
|
||||
fontname: str = "Helvetica",
|
||||
invisible_text: bool = False,
|
||||
) -> None:
|
||||
"""Creates a PDF file with an image superimposed on top of the text.
|
||||
|
||||
Text is positioned according to the bounding box of the lines in
|
||||
the hOCR file.
|
||||
The image need not be identical to the image used to create the hOCR
|
||||
file.
|
||||
It can have a lower resolution, different color mode, etc.
|
||||
|
||||
Arguments:
|
||||
out_filename: Path of PDF to write.
|
||||
image_filename: Image to use for this file. If omitted, the OCR text
|
||||
is shown.
|
||||
fontname: Name of font to use.
|
||||
invisible_text: If True, text is rendered invisible so that is
|
||||
selectable but never drawn. If False, text is visible and may
|
||||
be seen if the image is skipped or deleted in Acrobat.
|
||||
"""
|
||||
# create the PDF file
|
||||
# page size in points (1/72 in.)
|
||||
canvas = Canvas(
|
||||
out_filename,
|
||||
page_size=(self.width, self.height),
|
||||
)
|
||||
canvas.push()
|
||||
page_matrix = (
|
||||
Matrix()
|
||||
.translated(0, self.height)
|
||||
.scaled(1, -1)
|
||||
.scaled(INCH / self.dpi, INCH / self.dpi)
|
||||
)
|
||||
canvas.cm(page_matrix)
|
||||
log.debug(page_matrix)
|
||||
|
||||
self._debug_draw_paragraph_boxes(canvas)
|
||||
|
||||
found_lines = False
|
||||
for line in (
|
||||
element
|
||||
for element in self.hocr.iterfind(self._child_xpath('span'))
|
||||
if 'class' in element.attrib
|
||||
and element.attrib['class'] in {'ocr_header', 'ocr_line', 'ocr_textfloat'}
|
||||
):
|
||||
found_lines = True
|
||||
self._do_line(
|
||||
canvas,
|
||||
line,
|
||||
"ocrx_word",
|
||||
fontname,
|
||||
invisible_text,
|
||||
)
|
||||
|
||||
if not found_lines:
|
||||
# Tesseract did not report any lines (just words)
|
||||
root = self.hocr.find(self._child_xpath('div', 'ocr_page'))
|
||||
self._do_line(
|
||||
canvas,
|
||||
root,
|
||||
"ocrx_word",
|
||||
fontname,
|
||||
invisible_text,
|
||||
)
|
||||
canvas.pop()
|
||||
# put the image on the page, scaled to fill the page
|
||||
if image_filename is not None:
|
||||
canvas.draw_image(
|
||||
image_filename, 0, 0, width=self.width, height=self.height
|
||||
)
|
||||
|
||||
# finish up the page and save it
|
||||
canvas.save()
|
||||
|
||||
@classmethod
|
||||
def polyval(cls, poly, x): # pragma: no cover
|
||||
"""Calculate the value of a polynomial at a point."""
|
||||
return x * poly[0] + poly[1]
|
||||
|
||||
def _do_line(
|
||||
self,
|
||||
canvas: Canvas,
|
||||
line: Element | None,
|
||||
elemclass: str,
|
||||
fontname: str,
|
||||
invisible_text: bool,
|
||||
):
|
||||
"""Render the text for a given line.
|
||||
|
||||
The canvas's coordinate system must be configured so that hOCR pixel
|
||||
coordinates are mapped to PDF coordinates.
|
||||
"""
|
||||
if line is None:
|
||||
return
|
||||
line_box = self.element_coordinates(line)
|
||||
assert line_box.ury > line_box.lly # lly is top, ury is bottom
|
||||
|
||||
self._debug_draw_line_bbox(canvas, line_box)
|
||||
|
||||
# Baseline is a polynomial (usually straight line) that describes the
|
||||
# text baseline relative to the bottom left corner of the line bounding
|
||||
# box.
|
||||
bottom_left_corner = line_box.llx, line_box.ury
|
||||
slope, intercept = self.baseline(line)
|
||||
if abs(slope) < 0.005:
|
||||
slope = 0.0
|
||||
angle = atan(slope)
|
||||
|
||||
# Setup a new coordinate system on the line box's intercept and rotated by
|
||||
# its slope.
|
||||
canvas.push()
|
||||
line_matrix = (
|
||||
Matrix()
|
||||
.translated(*bottom_left_corner)
|
||||
.translated(0, intercept)
|
||||
.rotated(angle / pi * 180)
|
||||
)
|
||||
canvas.cm(line_matrix)
|
||||
log.debug(line_matrix)
|
||||
text = canvas.begin_text()
|
||||
|
||||
# Don't allow the font to break out of the bounding box. Division by
|
||||
# cos_a accounts for extra clearance between the glyph's vertical axis
|
||||
# on a sloped baseline and the edge of the bounding box.
|
||||
line_box_height = abs(line_box.height) / cos(angle)
|
||||
fontsize = line_box_height + intercept
|
||||
text.set_font(fontname, fontsize)
|
||||
if invisible_text or True:
|
||||
text.set_render_mode(3) # Invisible (indicates OCR text)
|
||||
|
||||
self._debug_draw_baseline(canvas, line_matrix.inverse().transform(line_box), 0)
|
||||
|
||||
canvas.set_fill_color(BLACK) # text in black
|
||||
elements = line.findall(self._child_xpath('span', elemclass))
|
||||
for elem, next_elem in pairwise(elements + [None]):
|
||||
self._do_line_word(
|
||||
canvas,
|
||||
fontname,
|
||||
line_matrix,
|
||||
text,
|
||||
fontsize,
|
||||
elem,
|
||||
next_elem,
|
||||
)
|
||||
canvas.draw_text(text)
|
||||
canvas.pop()
|
||||
|
||||
def _do_line_word(
|
||||
self,
|
||||
canvas: Canvas,
|
||||
fontname,
|
||||
line_matrix: Matrix,
|
||||
text: PikepdfText,
|
||||
fontsize: float,
|
||||
elem: Element,
|
||||
next_elem: Element | None,
|
||||
):
|
||||
"""Render the text for a single word."""
|
||||
elemtxt = self.normalize_text(self._get_element_text(elem).strip())
|
||||
if elemtxt == '':
|
||||
return
|
||||
|
||||
hocr_box = self.element_coordinates(elem)
|
||||
if hocr_box is None:
|
||||
return
|
||||
box = line_matrix.inverse().transform(hocr_box)
|
||||
font_width = canvas.string_width(elemtxt, fontname, fontsize)
|
||||
|
||||
# Debug sketches
|
||||
self._debug_draw_word_triangle(canvas, box)
|
||||
self._debug_draw_word_bbox(canvas, box)
|
||||
|
||||
# If this word is 0 units wide, our best bet seems to be to suppress this text
|
||||
if font_width > 0:
|
||||
text.set_text_transform(Matrix(1, 0, 0, 1, box.llx, 0))
|
||||
text.set_horiz_scale(100 * box.width / font_width)
|
||||
text.show(elemtxt)
|
||||
|
||||
# Get coordinates of the next word (if there is one)
|
||||
hocr_next_box = (
|
||||
self.element_coordinates(next_elem) if next_elem is not None else None
|
||||
)
|
||||
if hocr_next_box is not None:
|
||||
# Render a space this word and the next word. The explicit space helps
|
||||
# PDF viewers identify the word break, and horizontally scaling it to
|
||||
# occupy the space the between the words helps the PDF viewer
|
||||
# avoid combiningthewordstogether.
|
||||
next_box = line_matrix.inverse().transform(hocr_next_box)
|
||||
space_box = Rectangle(box.urx, box.lly, next_box.llx, next_box.ury)
|
||||
self._debug_draw_space_bbox(canvas, space_box)
|
||||
text.set_text_transform(Matrix(1, 0, 0, 1, space_box.llx, 0))
|
||||
space_width = canvas.string_width(' ', fontname, fontsize)
|
||||
space_box_width = space_box.urx - space_box.llx
|
||||
text.set_horiz_scale(100 * space_box_width / space_width)
|
||||
text.show(' ')
|
||||
|
||||
def _debug_draw_paragraph_boxes(self, canvas: Canvas, color=CYAN):
|
||||
"""Draw boxes around paragraphs in the document."""
|
||||
if not self.render_options.render_paragraph_bbox: # pragma: no cover
|
||||
return
|
||||
for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
|
||||
elemtxt = self._get_element_text(elem).rstrip()
|
||||
if len(elemtxt) == 0:
|
||||
continue
|
||||
ocr_par = self.element_coordinates(elem)
|
||||
# draw box around paragraph
|
||||
canvas.set_stroke_color(color)
|
||||
canvas.set_line_width(0.1) # no line for bounding box
|
||||
canvas.rect(ocr_par.llx, ocr_par.lly, ocr_par.width, ocr_par.height, fill=0)
|
||||
|
||||
def _debug_draw_line_bbox(self, canvas: Canvas, line_box, color=BLUE):
|
||||
"""Render the bounding box of a text line."""
|
||||
if not self.render_options.render_line_bbox: # pragma: no cover
|
||||
return
|
||||
canvas.push()
|
||||
canvas.set_stroke_color(color)
|
||||
canvas.set_line_width(0.15)
|
||||
canvas.rect(
|
||||
line_box.llx,
|
||||
line_box.lly,
|
||||
line_box.width,
|
||||
line_box.height,
|
||||
fill=0,
|
||||
)
|
||||
canvas.pop()
|
||||
|
||||
def _debug_draw_word_triangle(self, canvas: Canvas, box, color=RED, line_width=0.1):
|
||||
"""Render a triangle that conveys word height and drawing direction."""
|
||||
if not self.render_options.render_triangle: # pragma: no cover
|
||||
return
|
||||
canvas.push()
|
||||
canvas.set_stroke_color(color)
|
||||
canvas.set_line_width(line_width)
|
||||
# Draw a triangle that conveys word height and drawing direction
|
||||
canvas.line(box.llx, box.lly, box.urx, box.lly) # across bottom
|
||||
canvas.line(box.urx, box.lly, box.llx, box.ury) # diagonal
|
||||
canvas.line(box.llx, box.lly, box.llx, box.ury) # rise
|
||||
canvas.pop()
|
||||
|
||||
def _debug_draw_word_bbox(self, canvas: Canvas, box, color=GREEN, line_width=0.1):
|
||||
"""Render a box depicting the word."""
|
||||
if not self.render_options.render_word_bbox: # pragma: no cover
|
||||
return
|
||||
canvas.push()
|
||||
canvas.set_dashes()
|
||||
canvas.set_stroke_color(color)
|
||||
canvas.set_line_width(line_width)
|
||||
canvas.rect(box.llx, box.lly, box.width, box.height, fill=0)
|
||||
canvas.pop()
|
||||
|
||||
def _debug_draw_space_bbox(
|
||||
self, canvas: Canvas, box, color=DARKGREEN, line_width=0.1
|
||||
):
|
||||
"""Render a box depicting the space between two words."""
|
||||
if not self.render_options.render_space_bbox: # pragma: no cover
|
||||
return
|
||||
canvas.push()
|
||||
canvas.set_dashes()
|
||||
canvas.set_fill_color(color)
|
||||
canvas.set_line_width(line_width)
|
||||
canvas.rect(box.llx, box.lly, box.width, box.height, fill=1)
|
||||
canvas.pop()
|
||||
|
||||
def _debug_draw_baseline(
|
||||
self, canvas, line_box, baseline_lly, color=MAGENTA, line_width=0.25
|
||||
):
|
||||
"""Render the text baseline."""
|
||||
if not self.render_options.render_baseline:
|
||||
return
|
||||
canvas.push()
|
||||
canvas.set_dashes()
|
||||
canvas.set_stroke_color(color)
|
||||
canvas.set_line_width(line_width)
|
||||
canvas.line(
|
||||
line_box.llx,
|
||||
baseline_lly,
|
||||
line_box.urx,
|
||||
baseline_lly,
|
||||
)
|
||||
canvas.pop()
|
||||
Reference in New Issue
Block a user