mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-02-08 05:13:50 -05:00
Comprehensive test coverage for the new hocrtransform components:

- test_ocr_element.py: Tests for BoundingBox, Baseline, FontInfo, OcrElement dataclass methods (iter_by_class, find_by_class, get_text_recursive, words/lines/paragraphs properties)
- test_hocr_parser.py: Tests for parsing hOCR files including page/paragraph/line/word extraction, RTL text, rotated text, different line types (header, caption), font info, and edge cases
- test_pdf_renderer.py: Tests for PDF rendering including text extraction verification, page sizing, multi-line content, text direction, baseline handling, textangle rotation, word breaks, debug options, and image overlay

Also fixes the x_font regex pattern so that it does not capture trailing semicolons.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
245 lines
8.3 KiB
Python
245 lines
8.3 KiB
Python
# SPDX-FileCopyrightText: 2025 James R. Barlow
|
|
# SPDX-License-Identifier: MPL-2.0
|
|
|
|
"""Unit tests for OcrElement dataclass and related classes."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from ocrmypdf.hocrtransform import (
|
|
Baseline,
|
|
BoundingBox,
|
|
FontInfo,
|
|
OcrClass,
|
|
OcrElement,
|
|
)
|
|
|
|
|
|
class TestBoundingBox:
    """Checks on BoundingBox construction, dimensions and validation."""

    def test_basic_creation(self):
        """Each edge given to the constructor can be read back unchanged."""
        box = BoundingBox(left=10, top=20, right=100, bottom=50)
        assert box.left == 10
        assert box.top == 20
        assert box.right == 100
        assert box.bottom == 50

    def test_width_height(self):
        """A box spanning 10..110 by 20..70 reports width 100 and height 50."""
        box = BoundingBox(left=10, top=20, right=110, bottom=70)
        assert box.width == 100
        assert box.height == 50

    def test_zero_size_box(self):
        """A box whose opposite edges coincide is accepted with zero size."""
        point_box = BoundingBox(left=10, top=20, right=10, bottom=20)
        assert point_box.width == 0
        assert point_box.height == 0

    def test_invalid_left_right(self):
        """A right edge smaller than the left edge raises ValueError."""
        swapped_horizontal = {"left": 100, "top": 20, "right": 10, "bottom": 50}
        with pytest.raises(ValueError, match="right.*left"):
            BoundingBox(**swapped_horizontal)

    def test_invalid_top_bottom(self):
        """A bottom edge smaller than the top edge raises ValueError."""
        swapped_vertical = {"left": 10, "top": 50, "right": 100, "bottom": 20}
        with pytest.raises(ValueError, match="bottom.*top"):
            BoundingBox(**swapped_vertical)
|
|
|
|
|
|
class TestBaseline:
    """Checks on Baseline default and explicit field values."""

    def test_defaults(self):
        """A Baseline built without arguments has zero slope and intercept."""
        flat = Baseline()
        assert flat.slope == 0.0
        assert flat.intercept == 0.0

    def test_with_values(self):
        """Slope and intercept passed to the constructor are stored as given."""
        tilted = Baseline(slope=0.01, intercept=-5)
        assert tilted.slope == 0.01
        assert tilted.intercept == -5
|
|
|
|
|
|
class TestFontInfo:
    """Checks on FontInfo default and explicit field values."""

    def test_defaults(self):
        """Without arguments, name and size are None and both flags are False."""
        info = FontInfo()
        assert info.name is None
        assert info.size is None
        assert info.bold is False
        assert info.italic is False

    def test_with_values(self):
        """Supplied fields are stored; the omitted italic flag stays False."""
        info = FontInfo(name="Arial", size=12.0, bold=True)
        assert info.name == "Arial"
        assert info.size == 12.0
        assert info.bold is True
        assert info.italic is False
|
|
|
|
|
|
class TestOcrElement:
    """Checks on OcrElement fields, tree traversal helpers and properties."""

    def test_minimal_element(self):
        """An element built from class and text has no bbox and no children."""
        word = OcrElement(ocr_class=OcrClass.WORD, text="hello")
        assert word.ocr_class == "ocrx_word"
        assert word.text == "hello"
        assert word.bbox is None
        assert word.children == []

    def test_element_with_bbox(self):
        """A bbox passed to the constructor is exposed on the element."""
        box = BoundingBox(left=0, top=0, right=100, bottom=50)
        line = OcrElement(ocr_class=OcrClass.LINE, bbox=box)
        assert line.bbox == box
        assert line.bbox.width == 100

    def test_element_hierarchy(self):
        """Children nest page -> paragraph -> line -> words as constructed."""
        first_word = OcrElement(ocr_class=OcrClass.WORD, text="Hello")
        second_word = OcrElement(ocr_class=OcrClass.WORD, text="World")
        line = OcrElement(
            ocr_class=OcrClass.LINE, children=[first_word, second_word]
        )
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, children=[line])
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[paragraph])

        page_children = page.children
        assert len(page_children) == 1
        paragraph_children = page_children[0].children
        assert len(paragraph_children) == 1
        assert len(paragraph_children[0].children) == 2

    def test_iter_by_class_single(self):
        """iter_by_class finds the one word nested two levels below the page."""
        leaf = OcrElement(ocr_class=OcrClass.WORD, text="test")
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            children=[OcrElement(ocr_class=OcrClass.LINE, children=[leaf])],
        )

        found = page.iter_by_class(OcrClass.WORD)
        assert len(found) == 1
        assert found[0].text == "test"

    def test_iter_by_class_multiple(self):
        """iter_by_class returns all three words in their original order."""
        word_elems = [
            OcrElement(ocr_class=OcrClass.WORD, text=label)
            for label in ("one", "two", "three")
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=word_elems)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        found = page.iter_by_class(OcrClass.WORD)
        assert len(found) == 3
        assert [elem.text for elem in found] == ["one", "two", "three"]

    def test_iter_by_class_multiple_types(self):
        """Passing two classes matches both; the caption child is excluded."""
        kids = [
            OcrElement(ocr_class=kind)
            for kind in (OcrClass.LINE, OcrClass.HEADER, OcrClass.CAPTION)
        ]
        page = OcrElement(ocr_class=OcrClass.PAGE, children=kids)

        found = page.iter_by_class(OcrClass.LINE, OcrClass.HEADER)
        assert len(found) == 2

    def test_find_by_class(self):
        """find_by_class returns the nested word element when one exists."""
        target = OcrElement(ocr_class=OcrClass.WORD, text="found")
        holder = OcrElement(ocr_class=OcrClass.LINE, children=[target])
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[holder])

        match = page.find_by_class(OcrClass.WORD)
        assert match is not None
        assert match.text == "found"

    def test_find_by_class_not_found(self):
        """find_by_class returns None when the tree holds no word element."""
        empty_line = OcrElement(ocr_class=OcrClass.LINE)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[empty_line])

        match = page.find_by_class(OcrClass.WORD)
        assert match is None

    def test_get_text_recursive_leaf(self):
        """A childless word yields its own text."""
        leaf = OcrElement(ocr_class=OcrClass.WORD, text="hello")
        assert leaf.get_text_recursive() == "hello"

    def test_get_text_recursive_nested(self):
        """A line of two words yields their texts separated by one space."""
        parts = [
            OcrElement(ocr_class=OcrClass.WORD, text="Hello"),
            OcrElement(ocr_class=OcrClass.WORD, text="World"),
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=parts)

        assert line.get_text_recursive() == "Hello World"

    def test_words_property(self):
        """page.words exposes both nested words, first one first."""
        word_elems = [
            OcrElement(ocr_class=OcrClass.WORD, text=letter)
            for letter in ("a", "b")
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=word_elems)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        assert len(page.words) == 2
        assert page.words[0].text == "a"

    def test_lines_property(self):
        """page.lines counts a header element alongside a plain line."""
        plain_line = OcrElement(ocr_class=OcrClass.LINE)
        header_line = OcrElement(ocr_class=OcrClass.HEADER)  # Also a line type
        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH, children=[plain_line, header_line]
        )
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[paragraph])

        assert len(page.lines) == 2

    def test_paragraphs_property(self):
        """page.paragraphs reports both direct paragraph children."""
        first_par = OcrElement(ocr_class=OcrClass.PARAGRAPH)
        second_par = OcrElement(ocr_class=OcrClass.PARAGRAPH)
        page = OcrElement(
            ocr_class=OcrClass.PAGE, children=[first_par, second_par]
        )

        assert len(page.paragraphs) == 2

    def test_direction_ltr(self):
        """A left-to-right direction value is stored as given."""
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, direction="ltr")
        assert paragraph.direction == "ltr"

    def test_direction_rtl(self):
        """A right-to-left direction value is stored as given."""
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, direction="rtl")
        assert paragraph.direction == "rtl"

    def test_language(self):
        """The language value is stored as given."""
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, language="eng")
        assert paragraph.language == "eng"

    def test_baseline(self):
        """A Baseline attached to a line keeps its slope and intercept."""
        line = OcrElement(
            ocr_class=OcrClass.LINE,
            baseline=Baseline(slope=0.01, intercept=-3),
        )
        assert line.baseline.slope == 0.01
        assert line.baseline.intercept == -3

    def test_textangle(self):
        """The textangle value is stored as given."""
        line = OcrElement(ocr_class=OcrClass.LINE, textangle=5.0)
        assert line.textangle == 5.0

    def test_confidence(self):
        """The confidence value is stored as given."""
        word = OcrElement(ocr_class=OcrClass.WORD, confidence=0.95)
        assert word.confidence == 0.95

    def test_page_properties(self):
        """dpi, page_number and logical_page_number are stored as given."""
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            dpi=300.0,
            page_number=0,
            logical_page_number=1,
        )
        assert page.dpi == 300.0
        assert page.page_number == 0
        assert page.logical_page_number == 1
|
|
|
|
|
|
class TestOcrClass:
    """Checks on the string constants and line-type set exposed by OcrClass."""

    def test_class_values(self):
        """Each OcrClass constant equals its expected hOCR class string."""
        expectations = (
            (OcrClass.PAGE, "ocr_page"),
            (OcrClass.PARAGRAPH, "ocr_par"),
            (OcrClass.LINE, "ocr_line"),
            (OcrClass.WORD, "ocrx_word"),
            (OcrClass.HEADER, "ocr_header"),
            (OcrClass.CAPTION, "ocr_caption"),
        )
        for constant, literal in expectations:
            assert constant == literal

    def test_line_types_frozenset(self):
        """LINE_TYPES contains line, header and caption, but not word."""
        for line_like in (OcrClass.LINE, OcrClass.HEADER, OcrClass.CAPTION):
            assert line_like in OcrClass.LINE_TYPES
        assert OcrClass.WORD not in OcrClass.LINE_TYPES
|