Merge branch 'feature/fix-raster-dpi-too-high' into v15

2026-05-06 21:56:21 -04:00 · 2023-09-21 00:05:56 -07:00
parent 0f0ca6f517 173428e81a
commit 3e1b3ec98d
17 changed files with 366 additions and 121 deletions
--- a/docs/advanced.rst
+++ b/docs/advanced.rst
@@ -114,6 +114,41 @@ exceed a certain number of megapixels with ``--skip-big``. (A 300 DPI,
    # Allow 300 seconds for OCR; skip any page larger than 50 megapixels
    ocrmypdf --tesseract-timeout 300 --skip-big 50 bigfile.pdf output.pdf

+OCR for huge images
+-------------------
+
+Separate from these settings, Tesseract has internal limits on the size
+of images it will process. If you issue
+``--tesseract-downsample-large-images``, OCRmyPDF will downsample images
+to fit Tesseract limits. (The limits are usually encountered only for scanned
+images of oversized media, such as large maps or blueprints exceeding
+110 cm or 43 inches in either dimension, and at high DPI.)
+
+``--tesseract-downsample-above`` adjusts the threshold at which images
+will be downsampled. By default, only images that exceed any of Tesseract's
+internal limits are downsampled.
+
+You will also need to set ``--tesseract-timeout`` high enough to allow
+for processing.
+
+Only the image sent for OCR is downsampled. The original image is
+preserved.
+
+.. code-block:: bash
+
+    # Allow 600 seconds for OCR on huge images
+    ocrmypdf --tesseract-timeout 600 \
+        --tesseract-downsample-large-images \
+        bigfile.pdf output.pdf
+
+    # Downsample images above 5000 pixels on the longest dimension to
+    # 5000 pixels
+    ocrmypdf --tesseract-timeout 120 \
+        --tesseract-downsample-large-images \
+        --tesseract-downsample-above 5000 \
+        bigfile.pdf output_downsampled_ocr.pdf
+
+
 Overriding default tesseract
 ----------------------------

--- a/misc/completion/ocrmypdf.bash
+++ b/misc/completion/ocrmypdf.bash
@@ -6,53 +6,55 @@ set -o errexit

 __ocrmypdf_arguments()
 {
-    local arguments="--help                   (show help message)
--language               (language(s) of the file to be OCRed)
--image-dpi              (assume this DPI if input image DPI is unknown)
--output-type            (select PDF output options)
--sidecar                (write OCR to text file)
--version                (print program version and exit)
--jobs                   (how many worker processes to use)
--quiet                  (suppress INFO messages)
--verbose                (set verbosity level)
--title                  (set metadata)
--author                 (set metadata)
--subject                (set metadata)
--keywords               (set metadata)
--rotate-pages           (rotate pages to correct orientation)
--remove-background      (attempt to remove background from pages)
--deskew                 (fix small horizontal alignment skew)
--clean                  (clean document images before OCR)
--clean-final            (clean document images and keep result)
--unpaper-args           (a quoted string of arguments to pass to unpaper)
--oversample             (oversample images to this DPI)
--remove-vectors         (don\'t send vector objects to OCR)
--threshold              (threshold images before OCR)
--force-ocr              (OCR documents that already have printable text)
--skip-text              (skip OCR on any pages that already contain text)
--redo-ocr               (redo OCR on any pages that seem to have OCR already)
+    local arguments="\
+--help                          (show help message)
+--language                      (language(s) of the file to be OCRed)
+--image-dpi                     (assume this DPI if input image DPI is unknown)
+--output-type                   (select PDF output options)
+--sidecar                       (write OCR to text file)
+--version                       (print program version and exit)
+--jobs                          (how many worker processes to use)
+--quiet                         (suppress INFO messages)
+--verbose                       (set verbosity level)
+--title                         (set metadata)
+--author                        (set metadata)
+--subject                       (set metadata)
+--keywords                      (set metadata)
+--rotate-pages                  (rotate pages to correct orientation)
+--remove-background             (attempt to remove background from pages)
+--deskew                        (fix small horizontal alignment skew)
+--clean                         (clean document images before OCR)
+--clean-final                   (clean document images and keep result)
+--unpaper-args                  (a quoted string of arguments to pass to unpaper)
+--oversample                    (oversample images to this DPI)
+--remove-vectors                (don\'t send vector objects to OCR)
+--threshold                     (threshold images before OCR)
+--force-ocr                     (OCR documents that already have printable text)
+--skip-text                     (skip OCR on any pages that already contain text)
+--redo-ocr                      (redo OCR on any pages that seem to have OCR already)
 --invalidate-digital-signatures (remove digital signatures from PDF)
--skip-big               (skip OCR on pages larger than this many MPixels)
--optimize               (select optimization level)
--jpeg-quality           (JPEG quality [0..100])
--png-quality            (PNG quality [0..100])
--jbig2-lossy            (enable lossy JBIG2 (see docs))
--pages                  (apply OCR to only the specified pages)
--max-image-mpixels      (image decompression bomb threshold)
--pdf-renderer           (select PDF renderer options)
--rotate-pages-threshold (page rotation confidence)
--pdfa-image-compression (set PDF/A image compression options)
--fast-web-view          (if file size if above this amount in MB linearize PDF)
--plugin                 (name of plugin to import)
--keep-temporary-files   (keep temporary files (debug)
--tesseract-config       (set custom tesseract config file)
--tesseract-pagesegmode  (set tesseract --psm)
--tesseract-oem          (set tesseract --oem)
--tesseract-thresholding (set tesseract image thresholding)
--tesseract-timeout      (maximum number of seconds to wait for OCR)
--user-words             (specify location of user words file)
--user-patterns          (specify location of user patterns file)
--no-progress-bar        (disable the progress bar)
+--skip-big                      (skip OCR on pages larger than this many MPixels)
+--optimize                      (select optimization level)
+--jpeg-quality                  (JPEG quality [0..100])
+--png-quality                   (PNG quality [0..100])
+--jbig2-lossy                   (enable lossy JBIG2 (see docs))
+--pages                         (apply OCR to only the specified pages)
+--max-image-mpixels             (image decompression bomb threshold)
+--pdf-renderer                  (select PDF renderer options)
+--rotate-pages-threshold        (page rotation confidence)
+--pdfa-image-compression        (set PDF/A image compression options)
+--fast-web-view                 (if file size if above this amount in MB linearize PDF)
+--plugin                        (name of plugin to import)
+--keep-temporary-files          (keep temporary files (debug)
+--tesseract-config              (set custom tesseract config file)
+--tesseract-pagesegmode         (set tesseract --psm)
+--tesseract-oem                 (set tesseract --oem)
+--tesseract-thresholding        (set tesseract image thresholding)
+--tesseract-timeout             (maximum number of seconds to wait for OCR)
+--user-words                    (specify location of user words file)
+--user-patterns                 (specify location of user patterns file)
+--no-progress-bar               (disable the progress bar)
+--color-conversion-strategy     (select color conversion strategy)
 "

    COMPREPLY=( $( compgen -W "$arguments" -- "$cur") )
@@ -192,6 +194,20 @@ sauvola       (use Sauvola thresholding)"
    fi
 }

+__ocrmypdf_color-conversion-strategy()
+{
+    local choices="LeaveColorUnchanged (default)
+CMYK (convert to CMYK)
+Gray (convert to grayscale)
+RGB (convert to RGB)
+UseDeviceIndependentColor (convert with device independent color)"
+
+    COMPREPLY=( $( compgen -W "$choices" -- "$cur") )
+    # Remove description if only one completion exists
+    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then
+        COMPREPLY=( ${COMPREPLY[0]%% *} )
+    fi
+}

 __ocrmypdf_check_previous()
 {
@@ -251,6 +267,10 @@ __ocrmypdf_check_previous()
            _filedir
            return 0
            ;;
+        --color-conversion-strategy)
+            __ocrmypdf_color-conversion-strategy
+            return 0
+            ;;
    esac

    return 1
--- a/misc/completion/ocrmypdf.fish
+++ b/misc/completion/ocrmypdf.fish
@@ -129,4 +129,27 @@ complete -c ocrmypdf -r -l user-words -d "specify location of user words file"
 complete -c ocrmypdf -r -l user-patterns -d "specify location of user patterns file"
 complete -c ocrmypdf -x -l fast-web-view -d "if file size if above this amount in MB, linearize PDF"

-complete -c ocrmypdf -x -a "(__fish_complete_suffix .pdf; __fish_complete_suffix .PDF; __fish_complete_suffix .jpg; __fish_complete_suffix .png)"
+function __fish_ocrmypdf_color_conversion_strategy
+    echo -e "LeaveColorUnchanged\t"(_ "do not convert color spaces (default)")
+    echo -e "CMYK\t"(_ "convert all color spaces to CMYK")
+    echo -e "Gray\t"(_ "convert all color spaces to grayscale")
+    echo -e "RGB\t"(_ "convert all color spaces to RGB")
+    echo -e "UseDeviceIndependentColor\t"(_ "convert all color spaces to ICC-based color spaces")
+end
+
+complete -c ocrmypdf -x -l color-conversion-strategy -a '(__fish_ocrmypdf_color_conversion_strategy)' -d "set color conversion strategy"
+
+function __fish_ocrmypdf_input_file_given
+    set -l tokens (commandline -opc)
+    for token in $tokens
+        if string match -q -r '^-' -- $token
+            continue
+        end
+        if test -f "$token"
+            return 0
+        end
+    end
+    return 1
+end
+
+complete -c ocrmypdf -x -n 'not __fish_ocrmypdf_input_file_given' -a "(__fish_complete_suffix .pdf)" -d "input file"
--- a/src/ocrmypdf/_exec/ghostscript.py
+++ b/src/ocrmypdf/_exec/ghostscript.py
@@ -28,6 +28,17 @@ except AttributeError:
    # Pillow 9 shim
    Transpose = Image  # type: ignore

+
+COLOR_CONVERSION_STRATEGIES = frozenset(
+    [
+        'CMYK',
+        'Gray',
+        'LeaveColorUnchanged',
+        'RGB',
+        'UseDeviceIndependentColor',
+    ]
+)
+
 log = logging.getLogger(__name__)


@@ -151,6 +162,7 @@ def generate_pdfa(
    output_file: os.PathLike,
    *,
    compression: str,
+    color_conversion_strategy: str,
    pdf_version: str = '1.5',
    pdfa_part: str = '2',
    progressbar_class=None,
@@ -200,16 +212,16 @@ def generate_pdfa(
            "-dBATCH",
            "-dNOPAUSE",
            "-dSAFER",
-            "-dCompatibilityLevel=" + str(pdf_version),
+            f"-dCompatibilityLevel={str(pdf_version)}",
            "-sDEVICE=pdfwrite",
            "-dAutoRotatePages=/None",
-            "-sColorConversionStrategy=" + strategy,
+            f"-sColorConversionStrategy={color_conversion_strategy}",
        ]
        + (['-dPDFSTOPONERROR'] if stop_on_error else [])
        + compression_args
        + [
            "-dJPEGQ=95",
-            "-dPDFA=" + pdfa_part,
+            f"-dPDFA={pdfa_part}",
            "-dPDFACompatibilityPolicy=1",
            "-o",
            "-",
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -359,8 +359,12 @@ def is_ocr_required(page_context: PageContext) -> bool:
 def rasterize_preview(input_file: Path, page_context: PageContext) -> Path:
    """Generate a lower quality preview image."""
    output_file = page_context.get_path('rasterize_preview.jpg')
-    canvas_dpi = get_canvas_square_dpi(page_context.pageinfo, page_context.options)
-    page_dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
+    canvas_dpi = Resolution(300.0, 300.0).take_min(
+        [get_canvas_square_dpi(page_context.pageinfo, page_context.options)]
+    )
+    page_dpi = Resolution(300.0, 300.0).take_min(
+        [get_page_square_dpi(page_context.pageinfo, page_context.options)]
+    )
    page_context.plugin_manager.hook.rasterize_pdf_page(
        input_file=input_file,
        output_file=output_file,
@@ -490,6 +494,21 @@ def rasterize(
    canvas_dpi = get_canvas_square_dpi(pageinfo, page_context.options)
    page_dpi = get_page_square_dpi(pageinfo, page_context.options)

+    dpi_profile = pageinfo.page_dpi_profile()
+    if dpi_profile and dpi_profile.average_to_max_dpi_ratio < 0.8:
+        log.warning(
+            "Weight average DPI is %0.1f, max DPI is %0.1f. "
+            "The discrepancy may indicate a high detail region on this page, "
+            "but could also indicate a problem with the input PDF file. "
+            "An image will be rendered at %0.1f DPI.",
+            dpi_profile.weighted_dpi,
+            dpi_profile.max_dpi,
+            dpi_profile.weighted_dpi,
+        )
+        canvas_dpi = page_dpi = Resolution(
+            dpi_profile.weighted_dpi, dpi_profile.weighted_dpi
+        )
+
    page_context.plugin_manager.hook.rasterize_pdf_page(
        input_file=input_file,
        output_file=output_file,
@@ -792,7 +811,7 @@ def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext) -
        pdf_pages=[fix_docinfo_file],
        pdfmark=input_ps_stub,
        output_file=output_file,
-        compression=options.pdfa_image_compression,
+        context=context,
        pdfa_part=options.output_type[-1],  # is pdfa-1, pdfa-2, or pdfa-3
        progressbar_class=(
            context.plugin_manager.hook.get_progressbar_class()
--- a/src/ocrmypdf/_validation.py
+++ b/src/ocrmypdf/_validation.py
@@ -205,16 +205,6 @@ def check_options_ocr_behavior(options: Namespace) -> None:
        options.pages = _pages_from_ranges(options.pages)


-def check_options_advanced(options: Namespace) -> None:
-    if options.pdfa_image_compression != 'auto' and not options.output_type.startswith(
-        'pdfa'
-    ):
-        log.warning(
-            "--pdfa-image-compression argument only applies when "
-            "--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
-        )
-
-
 def check_options_metadata(options: Namespace) -> None:
    docinfo = [options.title, options.author, options.keywords, options.subject]
    for s in (m for m in docinfo if m):
@@ -241,7 +231,6 @@ def _check_plugin_invariant_options(options: Namespace) -> None:
    check_options_sidecar(options)
    check_options_preprocessing(options)
    check_options_ocr_behavior(options)
-    check_options_advanced(options)
    check_options_pillow(options)


--- a/src/ocrmypdf/builtin_plugins/ghostscript.py
+++ b/src/ocrmypdf/builtin_plugins/ghostscript.py
@@ -18,6 +18,33 @@ log = logging.getLogger(__name__)
 BLACKLISTED_GS_VERSIONS = frozenset()


+@hookimpl
+def add_options(parser):
+    gs = parser.add_argument_group("Ghostscript", "Advanced control of Ghostscript")
+    gs.add_argument(
+        '--color-conversion-strategy',
+        action='store',
+        type=str,
+        metavar='STRATEGY',
+        choices=ghostscript.COLOR_CONVERSION_STRATEGIES,
+        default='LeaveColorUnchanged',
+        help="Set Ghostscript color conversion strategy",
+    )
+    gs.add_argument(
+        '--pdfa-image-compression',
+        choices=['auto', 'jpeg', 'lossless'],
+        default='auto',
+        help="Specify how to compress images in the output PDF/A. 'auto' lets "
+        "OCRmyPDF decide.  'jpeg' changes all grayscale and color images to "
+        "JPEG compression.  'lossless' uses PNG-style lossless compression "
+        "for all images.  Monochrome images are always compressed using a "
+        "lossless codec.  Compression settings "
+        "are applied to all pages, including those for which OCR was "
+        "skipped.  Not supported for --output-type=pdf ; that setting "
+        "preserves the original compression of all images.",
+    )
+
+
@hookimpl
 def check_options(options):
    """Check that the options are valid for this plugin."""
@@ -37,6 +64,17 @@ def check_options(options):

    if options.output_type == 'pdfa':
        options.output_type = 'pdfa-2'
+    if options.color_conversion_strategy not in ghostscript.COLOR_CONVERSION_STRATEGIES:
+        raise ValueError(
+            f"Invalid color conversion strategy: {options.color_conversion_strategy}"
+        )
+    if options.pdfa_image_compression != 'auto' and not options.output_type.startswith(
+        'pdfa'
+    ):
+        log.warning(
+            "--pdfa-image-compression argument only applies when "
+            "--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
+        )


@hookimpl
@@ -71,7 +109,7 @@ def generate_pdfa(
    pdf_pages,
    pdfmark,
    output_file,
-    compression,
+    context,
    pdf_version,
    pdfa_part,
    progressbar_class,
@@ -81,7 +119,8 @@ def generate_pdfa(
    ghostscript.generate_pdfa(
        pdf_pages=[*pdf_pages, pdfmark],
        output_file=output_file,
-        compression=compression,
+        compression=context.options.pdfa_image_compression,
+        color_conversion_strategy=context.options.color_conversion_strategy,
        pdf_version=pdf_version,
        pdfa_part=pdfa_part,
        progressbar_class=progressbar_class,
--- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
+++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
@@ -30,7 +30,7 @@ def add_options(parser):
        action='append',
        metavar='CFG',
        default=[],
-        help="Additional Tesseract configuration files -- see documentation",
+        help="Additional Tesseract configuration files -- see documentation.",
    )
    tess.add_argument(
        '--tesseract-pagesegmode',
@@ -38,7 +38,7 @@ def add_options(parser):
        type=int,
        metavar='PSM',
        choices=range(0, 14),
-        help="Set Tesseract page segmentation mode (see tesseract --help)",
+        help="Set Tesseract page segmentation mode (see tesseract --help).",
    )
    tess.add_argument(
        '--tesseract-oem',
@@ -75,7 +75,10 @@ def add_options(parser):
        metavar='SECONDS',
        help=(
            "Give up on OCR after the timeout, but copy the preprocessed page "
-            "into the final output."
+            "into the final output. This timeout is only used when using Tesseract "
+            "for OCR. When Tesseract is used for other operations such as "
+            "deskewing and orientation, the timeout is controlled by "
+            "--tesseract-non-ocr-timeout."
        ),
    )
    tess.add_argument(
@@ -175,6 +178,15 @@ def validate(pdfinfo, options):
        tess_threads = int(os.environ['OMP_THREAD_LIMIT'])
    log.debug("Using Tesseract OpenMP thread limit %d", tess_threads)

+    if (
+        options.tesseract_downsample_above != 32767
+        and not options.tesseract_downsample_large_images
+    ):
+        log.warning(
+            "The --tesseract-downsample-above argument will have no effect unless "
+            "--tesseract-downsample-large-images is also given."
+        )
+

@hookimpl
 def filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image:
--- a/src/ocrmypdf/cli.py
+++ b/src/ocrmypdf/cli.py
@@ -177,7 +177,9 @@ Online documentation is located at:
        '--image-dpi',
        metavar='DPI',
        type=int,
-        help="For input image instead of PDF, use this DPI instead of file's.",
+        help="When the input file is an image, not a PDF, use this DPI instead "
+        "of the DPI claimed by the input file. If the input does not claim a "
+        "sensible DPI, this option will be required.",
    )
    parser.add_argument(
        '--output-type',
@@ -402,19 +404,6 @@ Online documentation is located at:
        help="Only rotate pages when confidence is above this value (arbitrary "
        "units reported by tesseract)",
    )
-    advanced.add_argument(
-        '--pdfa-image-compression',
-        choices=['auto', 'jpeg', 'lossless'],
-        default='auto',
-        help="Specify how to compress images in the output PDF/A. 'auto' lets "
-        "OCRmyPDF decide.  'jpeg' changes all grayscale and color images to "
-        "JPEG compression.  'lossless' uses PNG-style lossless compression "
-        "for all images.  Monochrome images are always compressed using a "
-        "lossless codec.  Compression settings "
-        "are applied to all pages, including those for which OCR was "
-        "skipped.  Not supported for --output-type=pdf ; that setting "
-        "preserves the original compression of all images.",
-    )
    advanced.add_argument(
        '--fast-web-view',
        type=numeric(float, 0),
--- a/src/ocrmypdf/helpers.py
+++ b/src/ocrmypdf/helpers.py
@@ -15,7 +15,16 @@ from contextlib import suppress
 from io import StringIO
 from math import isclose, isfinite
 from pathlib import Path
-from typing import Any, Generic, Sequence, SupportsFloat, SupportsRound, TypeVar
+from statistics import harmonic_mean
+from typing import (
+    Any,
+    Callable,
+    Generic,
+    Sequence,
+    SupportsFloat,
+    SupportsRound,
+    TypeVar,
+)

 import img2pdf
 import pikepdf
@@ -73,17 +82,38 @@ class Resolution(Generic[T]):
            return isfinite(self.x) and isfinite(self.y)
        return True

+    def to_scalar(self) -> float:
+        """Return the harmonic mean of x and y as a 1D approximation.
+
+        In most cases, Resolution is 2D, but typically it is "square" (x == y) and
+        can be approximated as a single number. When not square, the harmonic mean
+        is used to approximate the 2D resolution as a single number.
+        """
+        return harmonic_mean([self.x, self.y])
+
+    def _take_minmax(
+        self, vals: Iterable[Any], yvals: Iterable[Any] | None, cmp: Callable
+    ) -> Resolution:
+        """Return a new Resolution applying cmp (min or max) to self and the inputs."""
+        if yvals is not None:
+            return Resolution(cmp(self.x, *vals), cmp(self.y, *yvals))
+        cmp_x, cmp_y = self.x, self.y
+        for x, y in vals:
+            cmp_x = cmp(x, cmp_x)
+            cmp_y = cmp(y, cmp_y)
+        return Resolution(cmp_x, cmp_y)
+
    def take_max(
        self, vals: Iterable[Any], yvals: Iterable[Any] | None = None
    ) -> Resolution:
        """Return a new Resolution object with the maximum resolution of inputs."""
-        if yvals is not None:
-            return Resolution(max(self.x, *vals), max(self.y, *yvals))
-        max_x, max_y = self.x, self.y
-        for x, y in vals:
-            max_x = max(x, max_x)
-            max_y = max(y, max_y)
-        return Resolution(max_x, max_y)
+        return self._take_minmax(vals, yvals, max)
+
+    def take_min(
+        self, vals: Iterable[Any], yvals: Iterable[Any] | None = None
+    ) -> Resolution:
+        """Return a new Resolution object with the minimum resolution of inputs."""
+        return self._take_minmax(vals, yvals, min)

    def flip_axis(self) -> Resolution[T]:
        """Return a new Resolution object with x and y swapped."""
@@ -95,11 +125,11 @@ class Resolution(Generic[T]):

    def __str__(self):
        """Return a string representation of the resolution."""
-        return f"{self.x:f}x{self.y:f}"
+        return f"{self.x:f}×{self.y:f}"

    def __repr__(self):  # pragma: no cover
        """Return a repr() of the resolution."""
-        return f"Resolution({self.x}x{self.y} dpi)"
+        return f"Resolution({self.x!r}, {self.y!r})"

    def __eq__(self, other):
        """Return True if the resolution is equal to another resolution."""
--- a/src/ocrmypdf/pdfinfo/info.py
+++ b/src/ocrmypdf/pdfinfo/info.py
@@ -420,12 +420,12 @@ class ImageInfo:
        return self._type

    @property
-    def width(self):
+    def width(self) -> int:
        """Width of the image in pixels."""
        return self._width

    @property
-    def height(self):
+    def height(self) -> int:
        """Height of the image in pixels."""
        return self._height

@@ -458,17 +458,24 @@ class ImageInfo:
        return self.dpi.is_finite and self.width >= 0 and self.height >= 0

    @property
-    def dpi(self):
+    def dpi(self) -> Resolution:
        """Dots per inch of the image.

        Calculated based on where and how the image is drawn in the PDF.
        """
        return _get_dpi(self._shorthand, (self._width, self._height))

+    @property
+    def printed_area(self) -> float:
+        """Physical area in square inches: pixel size divided by DPI, per axis."""
+        if not self.renderable:
+            return 0.0
+        return float((self.width / self.dpi.x) * (self.height / self.dpi.y))
+
    def __repr__(self):
        """Return a string representation of the image."""
        return (
-            f"<ImageInfo '{self.name}' {self.type_} {self.width}x{self.height} "
+            f"<ImageInfo '{self.name}' {self.type_} {self.width}×{self.height} "
            f"{self.color} {self.comp} {self.bpc} {self.enc} {self.dpi}>"
        )

@@ -747,12 +754,38 @@ def _pdf_pageinfo_concurrent(
    return pages


+class PageResolutionProfile(NamedTuple):
+    """Information about the resolutions of a page."""
+
+    weighted_dpi: float
+    """The weighted average DPI of the page, weighted by the area of each image."""
+
+    max_dpi: float
+    """The maximum DPI of an image on the page."""
+
+    average_to_max_dpi_ratio: float
+    """The average DPI of the page divided by the maximum DPI of the page.
+
+    This indicates the intensity of the resolution variation on the page.
+
+    If the ratio is 1.0 or close to 1.0, the page has all of its content at a
+    uniform resolution. If the ratio is much lower than 1.0, some content is at
+    a higher resolution than the rest of the page.
+    """
+
+    area_ratio: float
+    """The maximum-DPI area of the page divided by the total drawn area.
+
+    This indicates the prevalence of high-resolution content on the page.
+    """
+
+
 class PageInfo:
    """Information about type of contents on each page in a PDF."""

    _has_text: bool | None
    _has_vector: bool | None
-    _images: list[ImageInfo]
+    _images: list[ImageInfo] = []

    def __init__(
        self,
@@ -939,6 +972,44 @@ class PageInfo:
        else:
            return '1.5'

+    def page_dpi_profile(self) -> PageResolutionProfile | None:
+        """Return information about the DPIs of the page.
+
+        This is useful to detect pages with a small proportion of high-resolution
+        content that is forcing us to use a high DPI for the whole page. The ratio
+        is weighted by the area of each image. If images overlap, the overlapped
+        area counts.
+
+        Vector graphics and text are ignored.
+
+        Returns None if there is no meaningful DPI for the page.
+        """
+        image_dpis = [
+            image.dpi.to_scalar() for image in self._images if image.renderable
+        ]
+        image_areas = [image.printed_area for image in self._images if image.renderable]
+        total_drawn_area = sum(image_areas)
+        if total_drawn_area == 0:
+            return None
+
+        weights = [area / total_drawn_area for area in image_areas]
+        # Calculate harmonic mean of DPIs weighted by area
+        # When the minimum version is Python 3.10, change this to
+        # statistics.harmonic_mean with the weights parameter
+        # rather than doing it manually.
+        weighted_dpi = sum(weights) / sum(
+            weight / dpi for weight, dpi in zip(weights, image_dpis)
+        )
+        max_dpi = max(image_dpis)
+        dpi_average_max_ratio = weighted_dpi / max_dpi
+
+        arg_max_dpi = image_dpis.index(max_dpi)
+        max_area_ratio = image_areas[arg_max_dpi] / total_drawn_area
+
+        return PageResolutionProfile(
+            weighted_dpi, max_dpi, dpi_average_max_ratio, max_area_ratio
+        )
+
    def __repr__(self):
        """Return string representation."""
        return (
--- a/src/ocrmypdf/pluginspec.py
+++ b/src/ocrmypdf/pluginspec.py
@@ -351,7 +351,6 @@ def filter_pdf_page(page: PageContext, image_filename: Path, output_pdf: Path) -
        This hook will be called from child processes. Modifying global state
        will not affect the main process or other child processes.

-
    Note:
        This is a :ref:`firstresult hook<firstresult>`.
    """
@@ -466,7 +465,7 @@ def generate_pdfa(
    pdf_pages: list[Path],
    pdfmark: Path,
    output_file: Path,
-    compression: str,
+    context: PdfContext,
    pdf_version: str,
    pdfa_part: str,
    progressbar_class,
@@ -484,11 +483,7 @@ def generate_pdfa(
        pdfmark: A PostScript file intended for Ghostscript with details on
            how to perform the PDF/A conversion.
        output_file: The name of the desired output file.
-        compression: One of ``'jpeg'``, ``'lossless'``, ``''``. For ``'jpeg'``,
-            the PDF/A generator should convert all images to JPEG encoding where
-            possible. For lossless, all images should be converted to FlateEncode
-            (lossless PNG). If an empty string, the PDF generator should make its
-            own decisions about how to encode images.
+        context: The current context.
        pdf_version: The minimum PDF version that the output file should be.
            At its own discretion, the PDF/A generator may raise the version,
            but should not lower it.
@@ -514,6 +509,11 @@ def generate_pdfa(
    Note:
        This is a :ref:`firstresult hook<firstresult>`.

+    Note:
+        Before version 15.0.0, the ``context`` was not provided and ``compression``
+        was provided instead. Plugins should now read the context object to determine
+        if compression is requested.
+
    See Also:
        https://github.com/tqdm/tqdm
    """
--- a/tests/plugins/gs_feature_elision.py
+++ b/tests/plugins/gs_feature_elision.py
@@ -20,14 +20,14 @@ def run_append_stderr(*args, **kwargs):


@hookimpl
-def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part):
+def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
    with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:
        mock.side_effect = run_append_stderr
        ghostscript.generate_pdfa(
            pdf_pages=pdf_pages,
            pdfmark=pdfmark,
            output_file=output_file,
-            compression=compression,
+            context=context,
            pdf_version=pdf_version,
            pdfa_part=pdfa_part,
            progressbar_class=None,
--- a/tests/plugins/gs_pdfa_failure.py
+++ b/tests/plugins/gs_pdfa_failure.py
@@ -22,14 +22,14 @@ def run_rig_args(args, **kwargs):


@hookimpl
-def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part):
+def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
    with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:
        mock.side_effect = run_rig_args
        ghostscript.generate_pdfa(
            pdf_pages=pdf_pages,
            pdfmark=pdfmark,
            output_file=output_file,
-            compression=compression,
+            context=context,
            pdf_version=pdf_version,
            pdfa_part=pdfa_part,
            progressbar_class=None,
--- a/tests/plugins/gs_render_failure.py
+++ b/tests/plugins/gs_render_failure.py
@@ -17,14 +17,14 @@ def raise_gs_fail(*args, **kwargs):


@hookimpl
-def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part):
+def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
    with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:
        mock.side_effect = raise_gs_fail
        ghostscript.generate_pdfa(
            pdf_pages=pdf_pages,
            pdfmark=pdfmark,
            output_file=output_file,
-            compression=compression,
+            context=context,
            pdf_version=pdf_version,
            pdfa_part=pdfa_part,
            progressbar_class=None,
--- a/tests/plugins/gs_render_soft_error.py
+++ b/tests/plugins/gs_render_soft_error.py
@@ -23,7 +23,7 @@ def generate_pdfa(
    pdf_pages,
    pdfmark,
    output_file,
-    compression,
+    context,
    pdf_version,
    pdfa_part,
    stop_on_soft_error,
@@ -34,7 +34,7 @@ def generate_pdfa(
            pdf_pages=pdf_pages,
            pdfmark=pdfmark,
            output_file=output_file,
-            compression=compression,
+            context=context,
            pdf_version=pdf_version,
            pdfa_part=pdfa_part,
            progressbar_class=None,
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -86,7 +86,8 @@ def test_unset_metadata(output_type, field, resources, outpdf):
        'title': b'NFY5f7Ft2DWMkxLhXwxvFf7eWR2KeK3vEDcd',
        'author': b'yXaryipxyRk9dVjWjSSaVaNCKeLRgEVzPRMp',
        'subject': b't49vimctvnuH7ZeAjAkv52ACvWFjcnm5MPJr',
-        'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh'}
+        'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh',
+    }

    p = run_ocrmypdf(
        input_file,
@@ -352,17 +353,24 @@ XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'

 def test_prevent_gs_invalid_xml(resources, outdir):
    generate_pdfa_ps(outdir / 'pdfa.ps')
-    copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
-    with pikepdf.open(outdir / 'layers.rendered.pdf') as pike:
+    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.Root.DocumentInfo = pikepdf.Dictionary(
            Title=b'String with trailing nul\x00'
        )
+        pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)

-    options = get_parser().parse_args(
-        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
+    _, options, _ = get_parser_options_plugins(
+        args=[
+            '-j',
+            '1',
+            '--output-type',
+            'pdfa-2',
+            'a.pdf',
+            'b.pdf',
+        ]
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PdfContext(
@@ -387,17 +395,15 @@ def test_prevent_gs_invalid_xml(resources, outdir):

 def test_malformed_docinfo(caplog, resources, outdir):
    generate_pdfa_ps(outdir / 'pdfa.ps')
-    # copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')

    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)

-    options = get_parser().parse_args(
+    _, options, _ = get_parser_options_plugins(
        args=[
            '-j',
            '1',
-            '--continue-on-soft-render-error',
            '--output-type',
            'pdfa-2',
            'a.pdf',