mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-02-08 05:13:50 -05:00
Introduce --ocr-engine option to select between OCR engines:
- 'auto' (default): Uses Tesseract
- 'tesseract': Explicit Tesseract selection
- 'none': Skip OCR entirely (for PDF processing only)

Key changes:
- Extend OcrEngine ABC with generate_ocr() and supports_generate_ocr() for direct OcrElement tree output (bypasses hOCR)
- Add get_ocr_engine(options) hook parameter for engine selection
- Implement NullOcrEngine for --ocr-engine none
- Export OcrElement, OcrClass, BoundingBox from ocrmypdf package
- Add ocr_tree support to grafting pipeline

This prepares the foundation for pluggable OCR engines while maintaining full backward compatibility with existing Tesseract-based workflows.
170 lines
5.7 KiB
Python
170 lines
5.7 KiB
Python
# SPDX-FileCopyrightText: 2025 James R. Barlow
|
|
# SPDX-License-Identifier: MPL-2.0
|
|
|
|
"""Unit tests for NullOcrEngine (--ocr-engine none).
|
|
|
|
Tests verify that the Null OCR engine exists and functions correctly
|
|
for scenarios where users want PDF processing without OCR.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock
|
|
|
|
import pytest
|
|
|
|
|
|
class TestNullOcrEngineExists:
    """Smoke checks that the null OCR plugin can be imported."""

    def test_null_ocr_module_importable(self):
        """Importing ``null_ocr`` from the builtin plugins yields a module."""
        from ocrmypdf.builtin_plugins import null_ocr as plugin_module

        assert plugin_module is not None

    def test_null_ocr_engine_class_exists(self):
        """``NullOcrEngine`` can be imported from the ``null_ocr`` module."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as engine_cls

        assert engine_cls is not None
|
|
|
|
|
|
class TestNullOcrEngineInterface:
    """Exercise the OcrEngine-style methods exposed by NullOcrEngine."""

    def test_version_returns_none(self):
        """``version()`` is expected to equal the string ``"none"``."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        reported = Engine.version()
        assert reported == "none"

    def test_creator_tag(self):
        """``creator_tag()`` should mention "no ocr", "null" or "none"."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        options = MagicMock()
        # Compare case-insensitively; any one of the markers is sufficient.
        normalized = Engine.creator_tag(options).lower()
        assert any(marker in normalized for marker in ("no ocr", "null", "none"))

    def test_languages_returns_empty_set(self):
        """``languages()`` is expected to be an empty set."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        options = MagicMock()
        assert Engine.languages(options) == set()

    def test_supports_generate_ocr_returns_true(self):
        """``supports_generate_ocr()`` must be exactly ``True``."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        supported = Engine.supports_generate_ocr()
        assert supported is True

    def test_get_orientation_returns_zero(self):
        """``get_orientation()`` result should carry ``angle == 0``."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        image_path = Path("test.png")
        orientation = Engine.get_orientation(image_path, MagicMock())
        assert orientation.angle == 0

    def test_get_deskew_returns_zero(self):
        """``get_deskew()`` should equal ``0.0``."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        image_path = Path("test.png")
        skew = Engine.get_deskew(image_path, MagicMock())
        assert skew == 0.0
|
|
|
|
|
|
class TestNullOcrEngineGenerateOcr:
    """Check the value returned by ``NullOcrEngine.generate_ocr()``."""

    @pytest.fixture
    def sample_image(self, tmp_path):
        """Write a 100x100 white RGB PNG into ``tmp_path`` and return its path."""
        from PIL import Image

        destination = tmp_path / "test.png"
        blank = Image.new('RGB', (100, 100), color='white')
        blank.save(destination, dpi=(300, 300))
        return destination

    def test_generate_ocr_returns_tuple(self, sample_image):
        """The result should be a 2-tuple of ``(OcrElement, str)``."""
        from ocrmypdf import OcrElement
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        outcome = Engine.generate_ocr(sample_image, MagicMock(), 0)

        # Verify the container shape first, then the type of each member.
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2
        tree, text = outcome
        assert isinstance(tree, OcrElement)
        assert isinstance(text, str)

    def test_generate_ocr_returns_empty_text(self, sample_image):
        """The text half of the result should be the empty string."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        outcome = Engine.generate_ocr(sample_image, MagicMock(), 0)
        _, recognized = outcome

        assert recognized == ""

    def test_generate_ocr_returns_page_element(self, sample_image):
        """The returned element should have ``ocr_class`` of ``OcrClass.PAGE``."""
        from ocrmypdf import OcrClass
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        outcome = Engine.generate_ocr(sample_image, MagicMock(), 0)
        page, _ = outcome

        assert page.ocr_class == OcrClass.PAGE

    def test_generate_ocr_page_has_correct_dimensions(self, sample_image):
        """The element's bbox should extend to the image's 100x100 size."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        outcome = Engine.generate_ocr(sample_image, MagicMock(), 0)
        page, _ = outcome

        # The fixture image is 100 pixels wide and 100 pixels tall.
        box = page.bbox
        assert box.right == 100
        assert box.bottom == 100
|
|
|
|
|
|
class TestOcrEngineOption:
    """Test --ocr-engine CLI option."""

    @staticmethod
    def _find_ocr_engine_action(parser):
        """Return the parser action registered for ``--ocr-engine``.

        Fails the running test when no such action exists, so a missing
        option can never make a test pass vacuously.
        """
        # argparse has no public lookup by option string, so scan the
        # parser's registered actions directly.
        for action in parser._actions:
            if '--ocr-engine' in action.option_strings:
                return action
        pytest.fail("--ocr-engine option not found")

    def test_ocr_engine_option_accepted(self):
        """CLI should accept --ocr-engine option."""
        from ocrmypdf.cli import get_parser

        parser = get_parser()

        # Should not raise
        args = parser.parse_args(['--ocr-engine', 'none', 'in.pdf', 'out.pdf'])
        assert args.ocr_engine == 'none'

    def test_ocr_engine_choices_include_none(self):
        """--ocr-engine should include 'none' as a choice."""
        from ocrmypdf.cli import get_parser

        action = self._find_ocr_engine_action(get_parser())
        assert 'none' in action.choices

    def test_ocr_engine_choices_include_auto(self):
        """--ocr-engine should include 'auto' as a choice and as the default."""
        from ocrmypdf.cli import get_parser

        # The lookup helper fails the test if the option is missing; a bare
        # loop with ``break`` would pass silently in that case.
        action = self._find_ocr_engine_action(get_parser())
        assert 'auto' in action.choices
        assert action.default == 'auto'
|