diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index 4b20f541..051a1981 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -228,28 +228,19 @@ def _vector_page_dpi(pageinfo: PageInfo) -> int: return VECTOR_PAGE_DPI if pageinfo.has_vector or pageinfo.has_text else 0 -def get_page_dpi(pageinfo: PageInfo, options) -> Resolution: - """Get the DPI when nonsquare DPI is tolerable.""" - xres = max( - pageinfo.dpi.x or VECTOR_PAGE_DPI, - options.oversample or 0.0, - _vector_page_dpi(pageinfo), - ) - yres = max( - pageinfo.dpi.y or VECTOR_PAGE_DPI, - options.oversample or 0, - _vector_page_dpi(pageinfo), - ) - return Resolution(float(xres), float(yres)) - - -def get_page_square_dpi(pageinfo: PageInfo, options) -> Resolution: +def get_page_square_dpi( + page_context: PageContext, image_dpi: Resolution | None = None +) -> Resolution: """Get the DPI when we require xres == yres, scaled to physical units. Page DPI includes UserUnit scaling. """ - xres = pageinfo.dpi.x or 0.0 - yres = pageinfo.dpi.y or 0.0 + pageinfo = page_context.pageinfo + options = page_context.options + if not image_dpi: + image_dpi = pageinfo.dpi + xres = image_dpi.x or 0.0 + yres = image_dpi.y or 0.0 userunit = float(pageinfo.userunit) or 1.0 units = float( max( @@ -262,17 +253,23 @@ def get_page_square_dpi(pageinfo: PageInfo, options) -> Resolution: return Resolution(units, units) -def get_canvas_square_dpi(pageinfo: PageInfo, options) -> Resolution: +def get_canvas_square_dpi( + page_context: PageContext, image_dpi: Resolution | None = None +) -> Resolution: """Get the DPI when we require xres == yres, in Postscript units. Canvas DPI is independent of PDF UserUnit scaling, which is used to describe situations where the PDF user space is not 1:1 with the physical units of the page. """ + pageinfo = page_context.pageinfo + options = page_context.options + if not image_dpi: + image_dpi = pageinfo.dpi units = float( max( - (pageinfo.dpi.x) or VECTOR_PAGE_DPI, - (pageinfo.dpi.y) or VECTOR_PAGE_DPI, + image_dpi.x or VECTOR_PAGE_DPI, + image_dpi.y or VECTOR_PAGE_DPI, _vector_page_dpi(pageinfo), options.oversample or 0.0, ) @@ -360,11 +357,9 @@ def rasterize_preview(input_file: Path, page_context: PageContext) -> Path: """Generate a lower quality preview image.""" output_file = page_context.get_path('rasterize_preview.jpg') canvas_dpi = Resolution(300.0, 300.0).take_min( - [get_canvas_square_dpi(page_context.pageinfo, page_context.options)] - ) - page_dpi = Resolution(300.0, 300.0).take_min( - [get_page_square_dpi(page_context.pageinfo, page_context.options)] + [get_canvas_square_dpi(page_context)] ) + page_dpi = Resolution(300.0, 300.0).take_min([get_page_square_dpi(page_context)]) page_context.plugin_manager.hook.rasterize_pdf_page( input_file=input_file, output_file=output_file, @@ -438,6 +433,37 @@ def get_orientation_correction(preview: Path, page_context: PageContext) -> int: return 0 +def calculate_image_dpi(page_context: PageContext) -> Resolution: + pageinfo = page_context.pageinfo + dpi_profile = pageinfo.page_dpi_profile() + if dpi_profile and dpi_profile.average_to_max_dpi_ratio < 0.8: + image_dpi = Resolution(dpi_profile.weighted_dpi, dpi_profile.weighted_dpi) + else: + image_dpi = pageinfo.dpi + return image_dpi + + +def calculate_raster_dpi(page_context: PageContext): + """Calculate the DPI for rasterization.""" + # Produce the page image with square resolution or else deskew and OCR + # will not work properly. + image_dpi = calculate_image_dpi(page_context) + dpi_profile = page_context.pageinfo.page_dpi_profile() + canvas_dpi = get_canvas_square_dpi(page_context, image_dpi) + page_dpi = get_page_square_dpi(page_context, image_dpi) + if dpi_profile and dpi_profile.average_to_max_dpi_ratio < 0.8: + log.warning( + "Weight average image DPI is %0.1f, max DPI is %0.1f. " + "The discrepancy may indicate a high detail region on this page, " + "but could also indicate a problem with the input PDF file. " + "Page image will be rendered at %0.1f DPI.", + dpi_profile.weighted_dpi, + dpi_profile.max_dpi, + canvas_dpi.to_scalar(), + ) + return canvas_dpi, page_dpi + + def rasterize( input_file: Path, page_context: PageContext, @@ -489,25 +515,7 @@ def rasterize( log.debug(f"Rasterize with {device}, rotation {correction}") - # Produce the page image with square resolution or else deskew and OCR - # will not work properly. - canvas_dpi = get_canvas_square_dpi(pageinfo, page_context.options) - page_dpi = get_page_square_dpi(pageinfo, page_context.options) - - dpi_profile = pageinfo.page_dpi_profile() - if dpi_profile and dpi_profile.average_to_max_dpi_ratio < 0.8: - log.warning( - "Weight average DPI is %0.1f, max DPI is %0.1f. " - "The discrepancy may indicate a high detail region on this page, " - "but could also indicate a problem with the input PDF file. " - "An image will be rendered at %0.1f DPI.", - dpi_profile.weighted_dpi, - dpi_profile.max_dpi, - dpi_profile.weighted_dpi, - ) - canvas_dpi = page_dpi = Resolution( - dpi_profile.weighted_dpi, dpi_profile.weighted_dpi - ) + canvas_dpi, page_dpi = calculate_raster_dpi(page_context) page_context.plugin_manager.hook.rasterize_pdf_page( input_file=input_file, @@ -544,7 +552,7 @@ def preprocess_deskew(input_file: Path, page_context: PageContext) -> Path: Path: The path to the deskewed image file. """ output_file = page_context.get_path('pp_deskew.png') - dpi = get_page_square_dpi(page_context.pageinfo, page_context.options) + dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context)) ocr_engine = page_context.plugin_manager.hook.get_ocr_engine() deskew_angle_degrees = ocr_engine.get_deskew(input_file, page_context.options) @@ -564,11 +572,11 @@ def preprocess_deskew(input_file: Path, page_context: PageContext) -> Path: def preprocess_clean(input_file: Path, page_context: PageContext) -> Path: output_file = page_context.get_path('pp_clean.png') - dpi = get_page_square_dpi(page_context.pageinfo, page_context.options) + dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context)) return unpaper.clean( input_file, output_file, - dpi=dpi.x, + dpi=dpi.to_scalar(), unpaper_args=page_context.options.unpaper_args, ) @@ -665,7 +673,7 @@ def create_visible_page_jpg(image: Path, page_context: PageContext) -> Path: dpi = Resolution(*im.info['dpi']) else: # Fallback to page-implied DPI - dpi = get_page_square_dpi(page_context.pageinfo, page_context.options) + dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context)) # Pillow requires integer DPI im.save(output_file, format='JPEG', dpi=dpi.to_int()) @@ -708,7 +716,7 @@ def create_pdf_page_from_image( def render_hocr_page(hocr: Path, page_context: PageContext) -> Path: options = page_context.options output_file = page_context.get_path('ocr_hocr.pdf') - dpi = get_page_square_dpi(page_context.pageinfo, options) + dpi = get_page_square_dpi(page_context, calculate_image_dpi(page_context)) debug_mode = options.pdf_renderer == 'hocrdebug' hocrtransform = HocrTransform(hocr_filename=hocr, dpi=dpi.x) # square diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py index 56030e60..a787f85c 100644 --- a/src/ocrmypdf/pdfinfo/info.py +++ b/src/ocrmypdf/pdfinfo/info.py @@ -9,6 +9,8 @@ from __future__ import annotations import atexit import logging import re +import statistics +import sys from collections import defaultdict from contextlib import ExitStack from decimal import Decimal @@ -994,12 +996,12 @@ class PageInfo: weights = [area / total_drawn_area for area in image_areas] # Calculate harmonic mean of DPIs weighted by area - # When the minimum version is Python 3.10, change this to - # statistics.harmonic_mean with the weights parameter - # rather than doing it manually. - weighted_dpi = sum(weights) / sum( - weight / dpi for weight, dpi in zip(weights, image_dpis) - ) + if sys.version_info >= (3, 10): + weighted_dpi = statistics.harmonic_mean(image_dpis, weights) + else: + weighted_dpi = sum(weights) / sum( + weight / dpi for weight, dpi in zip(weights, image_dpis) + ) max_dpi = max(image_dpis) dpi_average_max_ratio = weighted_dpi / max_dpi @@ -1007,7 +1009,10 @@ class PageInfo: max_area_ratio = image_areas[arg_max_dpi] / total_drawn_area return PageResolutionProfile( - weighted_dpi, max_dpi, dpi_average_max_ratio, max_area_ratio + weighted_dpi, + max_dpi, + dpi_average_max_ratio, + max_area_ratio, ) def __repr__(self): diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 518daf35..9eb6a820 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -52,13 +52,14 @@ def test_dpi_needed(image, text, vector, result, rgb_image, outdir): c.showPage() c.save() - mock = Mock() - mock.oversample = DUMMY_OVERSAMPLE_RESOLUTION[0] - pi = pdfinfo.PdfInfo(outdir / 'dpi.pdf') + pageinfo = pi[0] + ctx = Mock() + ctx.options.oversample = DUMMY_OVERSAMPLE_RESOLUTION[0] + ctx.pageinfo = pageinfo - assert _pipeline.get_canvas_square_dpi(pi[0], mock) == result - assert _pipeline.get_page_square_dpi(pi[0], mock) == result + assert _pipeline.get_canvas_square_dpi(ctx) == result + assert _pipeline.get_page_square_dpi(ctx) == result @pytest.mark.parametrize(