mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-05 05:05:44 -04:00
Consider text when determining page raster DPI
Previously, if we found vectors of any sort on a page, we would bump the DPI up to 400, but we did nothing about pages with text. As a result, pages with a low image resolution and printable text would have the text downgraded to image resolution when --force-ocr was used. We don't try to determine whether the text is visible or invisible OCR text, since that is a slower test. --redo-ocr would improve such cases anyway.
This commit is contained in:
@@ -206,17 +206,21 @@ def validate_pdfinfo_options(context: PdfContext):
|
||||
context.plugin_manager.hook.validate(pdfinfo=pdfinfo, options=options)
|
||||
|
||||
|
||||
def _vector_page_dpi(pageinfo):
    """Return the minimum raster DPI implied by vector or text content.

    Yields VECTOR_PAGE_DPI when the page reports vector drawing or text,
    and 0.0 otherwise so that the value never wins a ``max()`` on its own.
    """
    if pageinfo.has_vector or pageinfo.has_text:
        return VECTOR_PAGE_DPI
    return 0.0
|
||||
|
||||
|
||||
def get_page_dpi(pageinfo, options):
    """Get the DPI when nonsquare DPI is tolerable.

    Each axis is resolved independently as the largest of:

    * the page's own DPI on that axis, or VECTOR_PAGE_DPI when that is falsy;
    * ``options.oversample``, ignored when falsy;
    * the floor returned by ``_vector_page_dpi(pageinfo)``, which is
      VECTOR_PAGE_DPI for pages with vector or text content and 0.0 otherwise.

    Returns:
        A Resolution whose x and y components are floats.
    """
    # These two candidates are the same for both axes, so compute them once.
    # _vector_page_dpi() already covers the has_vector case, so no separate
    # has_vector term is needed in the max() calls below.
    oversample = options.oversample or 0.0
    content_dpi = _vector_page_dpi(pageinfo)

    xres = max(pageinfo.dpi.x or VECTOR_PAGE_DPI, oversample, content_dpi)
    yres = max(pageinfo.dpi.y or VECTOR_PAGE_DPI, oversample, content_dpi)
    return Resolution(float(xres), float(yres))
|
||||
|
||||
@@ -230,7 +234,7 @@ def get_page_square_dpi(pageinfo, options) -> Resolution:
|
||||
max(
|
||||
(xres * userunit) or VECTOR_PAGE_DPI,
|
||||
(yres * userunit) or VECTOR_PAGE_DPI,
|
||||
VECTOR_PAGE_DPI if pageinfo.has_vector else 0.0,
|
||||
_vector_page_dpi(pageinfo),
|
||||
options.oversample or 0.0,
|
||||
)
|
||||
)
|
||||
@@ -243,7 +247,7 @@ def get_canvas_square_dpi(pageinfo, options) -> Resolution:
|
||||
max(
|
||||
(pageinfo.dpi.x) or VECTOR_PAGE_DPI,
|
||||
(pageinfo.dpi.y) or VECTOR_PAGE_DPI,
|
||||
VECTOR_PAGE_DPI if pageinfo.has_vector else 0.0,
|
||||
_vector_page_dpi(pageinfo),
|
||||
options.oversample or 0.0,
|
||||
)
|
||||
)
|
||||
|
||||
63
tests/test_pipeline.py
Normal file
63
tests/test_pipeline.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# © 2021 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from reportlab.lib.units import inch
|
||||
from reportlab.lib.utils import ImageReader
|
||||
from reportlab.pdfgen.canvas import Canvas
|
||||
|
||||
from ocrmypdf import _pipeline, pdfinfo
|
||||
from ocrmypdf.helpers import Resolution
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
def rgb_image():
    """Provide an 8x8 RGB image with three marker pixels as an ImageReader.

    One pure red, one pure green and one pure blue pixel are placed along
    the diagonal; every other pixel keeps the fill that ``Image.new`` gives.
    """
    picture = Image.new('RGB', (8, 8))
    marker_pixels = (
        ((4, 4), (255, 0, 0)),
        ((5, 5), (0, 255, 0)),
        ((6, 6), (0, 0, 255)),
    )
    for position, color in marker_pixels:
        picture.putpixel(position, color)
    return ImageReader(picture)
|
||||
|
||||
|
||||
# Resolution expected when only the oversample setting decides the DPI; its
# first component is also used as the mocked ``options.oversample`` value.
DUMMY_OVERSAMPLE_RESOLUTION = Resolution(42.0, 42.0)

# Square resolution expected whenever the pipeline falls back to its
# vector-page DPI.
_VECTOR_DPI = _pipeline.VECTOR_PAGE_DPI
VECTOR_RESOLUTION = Resolution(_VECTOR_DPI, _VECTOR_DPI)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'image, text, vector, result',
    [
        # image, text,  vector, expected square resolution
        (False, False, False, VECTOR_RESOLUTION),
        (False, True, False, VECTOR_RESOLUTION),
        (True, False, False, DUMMY_OVERSAMPLE_RESOLUTION),
        (True, True, False, VECTOR_RESOLUTION),
        (False, False, True, VECTOR_RESOLUTION),
        (False, True, True, VECTOR_RESOLUTION),
        (True, False, True, VECTOR_RESOLUTION),
        (True, True, True, VECTOR_RESOLUTION),
    ],
)
def test_dpi_needed(image, text, vector, result, rgb_image, outdir):
    """Check the square DPI chosen for each mix of image, text and vector art.

    A single-page 5x5 inch PDF is generated containing whichever of the
    three content kinds the parameters enable, then both square-DPI helpers
    are required to return the expected resolution for that page.
    """
    pdf_path = outdir / 'dpi.pdf'

    canvas = Canvas(str(pdf_path), pagesize=(5 * inch, 5 * inch))
    if image:
        # The 8x8 pixel fixture image is stretched over a 1x1 inch square.
        canvas.drawImage(
            rgb_image, 1 * inch, 1 * inch, width=1 * inch, height=1 * inch
        )
    if text:
        canvas.drawString(1 * inch, 4 * inch, "Actual text")
    if vector:
        canvas.ellipse(3 * inch, 3 * inch, 4 * inch, 4 * inch)
    canvas.showPage()
    canvas.save()

    # Stand-in for the options object; only ``oversample`` is set explicitly.
    options = Mock(oversample=DUMMY_OVERSAMPLE_RESOLUTION[0])

    first_page = pdfinfo.PdfInfo(pdf_path)[0]

    assert _pipeline.get_canvas_square_dpi(first_page, options) == result
    assert _pipeline.get_page_square_dpi(first_page, options) == result
|
||||
Reference in New Issue
Block a user