Files
OCRmyPDF/tests/plugins/tesseract_debug_rotate.py
James R. Barlow bbd263ff48 Add tests for fpdf2 renderer and font infrastructure
- Add hOCR test fixtures for Latin, Arabic, CJK, Devanagari scripts
- Add tests for fpdf2 renderer, multi-font manager, system font provider
- Add multilingual rendering tests
- Update existing tests to use fpdf2 renderer
2026-01-06 13:46:11 -08:00

99 lines
3.0 KiB
Python

# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MIT
"""Tesseract no-op/fixed rotate plugin.
To quickly run tests where getting OCR output is not necessary and we want to test
the rotation pipeline.
In generate_hocr mode, create a .hocr file that specifies no text found.
In 'pdf' mode, write a blank PDF page whose size matches the input image.
In orientation check mode, report 0, 90, 180, 270... based on page number.
"""
from __future__ import annotations
import pikepdf
from PIL import Image
from ocrmypdf import OcrEngine, OrientationConfidence, hookimpl
from ocrmypdf.helpers import page_number
# Skeleton hOCR document used as the stub OCR result. It is a str.format
# template: {0} is substituted with the page width and {1} with the page
# height (generate_hocr passes the image size). The page holds one content
# area / paragraph / line / word, and the word's text is a single space.
HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name='ocr-system' content='tesseract 4.1.1' />
<meta name='ocr-capabilities'
content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "x.tif"; bbox 0 0 {0} {1}; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 0 1 {0} {1}">
<p class='ocr_par' dir='ltr' id='par_1' title="bbox 0 1 {0} {1}">
<span class='ocr_line' id='line_1' title="bbox 0 1 {0} {1}">
<span class='ocrx_word' id='word_1' title="bbox 0 1 {0} {1}"> </span>
</span>
</p>
</div>
</div>
</body>
</html>'''
class FixedRotateNoopOcrEngine(OcrEngine):
    """Stub OCR engine that recognizes nothing and reports fixed rotations.

    The hOCR output is a template holding a single blank word, the PDF output
    is a blank page, and the reported orientation depends only on the page
    number of the input file.
    """

    @staticmethod
    def version():
        """Return the version string reported by this stub engine."""
        return '4.1.1'

    @staticmethod
    def creator_tag(options):
        """Return the creator tag, labelled according to the PDF renderer.

        The label is ``-PDF`` when ``options.pdf_renderer`` equals
        ``'sandwich'`` and ``-hOCR`` for every other value.
        """
        tag = '-PDF' if options.pdf_renderer == 'sandwich' else '-hOCR'
        return f"NO-OP {tag} {FixedRotateNoopOcrEngine.version()}"

    def __str__(self):
        """Return the engine name followed by its version."""
        return f"NO-OP {FixedRotateNoopOcrEngine.version()}"

    @staticmethod
    def languages(options):
        """Return the set of supported languages; always ``{'eng'}``."""
        return {'eng'}

    @staticmethod
    def get_orientation(input_file, options):
        """Report a rotation angle derived from the page number alone.

        The angle advances by 90 degrees per page and wraps at 360. The
        confidence is always 99.9.
        """
        page = page_number(input_file)
        # NOTE(review): assumes page_number() is 1-based, so that the first
        # page reports 0 degrees -- confirm against ocrmypdf.helpers.
        angle = ((page - 1) * 90) % 360
        return OrientationConfidence(angle=angle, confidence=99.9)

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Write a blank-word hOCR file and an empty text file.

        The hOCR bounding boxes are filled in with the pixel size of the
        image at ``input_file``.
        """
        with (
            Image.open(input_file) as im,
            open(output_hocr, 'w', encoding='utf-8') as f,
        ):
            w, h = im.size
            f.write(HOCR_TEMPLATE.format(str(w), str(h)))
        # Opening for writing is enough to create/truncate the text file.
        with open(output_text, 'w', encoding='utf-8'):
            pass

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Write a blank one-page PDF sized like the image, plus empty text.

        The page size in points is the image size in pixels divided by its
        DPI and multiplied by 72.

        Raises:
            KeyError: if the image's ``info`` mapping has no ``'dpi'`` entry.
        """
        # Only the size and DPI are needed, so release the image right away.
        with Image.open(input_file) as im:
            width_px, height_px = im.size
            dpi = im.info['dpi']
        pagesize = width_px / dpi[0], height_px / dpi[1]
        ptsize = pagesize[0] * 72, pagesize[1] * 72
        # Hold the Pdf in a context manager so it is closed after saving.
        with pikepdf.new() as pdf:
            pdf.add_blank_page(page_size=ptsize)
            pdf.save(output_pdf, static_id=True)
        # Use open() as generate_hocr does, so output_text may be a str or
        # any path-like object rather than only a pathlib.Path.
        with open(output_text, 'w', encoding='utf-8'):
            pass
@hookimpl
def get_ocr_engine():
    """Plugin hook: supply a new fixed-rotation no-op OCR engine instance."""
    engine = FixedRotateNoopOcrEngine()
    return engine