diff --git a/docs/advanced.rst b/docs/advanced.rst index 963ff797..6e59b3fc 100644 --- a/docs/advanced.rst +++ b/docs/advanced.rst @@ -114,6 +114,41 @@ exceed a certain number of megapixels with ``--skip-big``. (A 300 DPI, # Allow 300 seconds for OCR; skip any page larger than 50 megapixels ocrmypdf --tesseract-timeout 300 --skip-big 50 bigfile.pdf output.pdf +OCR for huge images +------------------- + +Separate from these settings, Tesseract has internal limits on the size +of images it will process. If you issue +``--tesseract-downsample-large-images``, OCRmyPDF will downsample images +to fit Tesseract limits. (The limits are usually encountered only for scanned +images of oversized media, such as large maps or blueprints exceeding +110 cm or 43 inches in either dimension, and at high DPI.) + +``--tesseract-downsample-above`` adjusts the threshold at which images +will be downsampled. By default, only images that exceed any of Tesseract's +internal limits are downsampled. + +You will also need to set ``--tesseract-timeout`` high enough to allow +for processing. + +Only the image sent for OCR is downsampled. The original image is +preserved. + +.. 
code-block:: bash + + # Allow 600 seconds for OCR on huge images + ocrmypdf --tesseract-timeout 600 \ + --tesseract-downsample-large-images \ + bigfile.pdf output.pdf + + # Downsample images above 5000 pixels on the longest dimension to + # 5000 pixels + ocrmypdf --tesseract-timeout 120 \ + --tesseract-downsample-large-images \ + --tesseract-downsample-above 5000 \ + bigfile.pdf output_downsampled_ocr.pdf + + Overriding default tesseract ---------------------------- diff --git a/misc/completion/ocrmypdf.bash b/misc/completion/ocrmypdf.bash index bbd6a5a4..b21c4c1e 100644 --- a/misc/completion/ocrmypdf.bash +++ b/misc/completion/ocrmypdf.bash @@ -6,53 +6,55 @@ set -o errexit __ocrmypdf_arguments() { - local arguments="--help (show help message) ---language (language(s) of the file to be OCRed) ---image-dpi (assume this DPI if input image DPI is unknown) ---output-type (select PDF output options) ---sidecar (write OCR to text file) ---version (print program version and exit) ---jobs (how many worker processes to use) ---quiet (suppress INFO messages) ---verbose (set verbosity level) ---title (set metadata) ---author (set metadata) ---subject (set metadata) ---keywords (set metadata) ---rotate-pages (rotate pages to correct orientation) ---remove-background (attempt to remove background from pages) ---deskew (fix small horizontal alignment skew) ---clean (clean document images before OCR) ---clean-final (clean document images and keep result) ---unpaper-args (a quoted string of arguments to pass to unpaper) ---oversample (oversample images to this DPI) ---remove-vectors (don\'t send vector objects to OCR) ---threshold (threshold images before OCR) ---force-ocr (OCR documents that already have printable text) ---skip-text (skip OCR on any pages that already contain text) ---redo-ocr (redo OCR on any pages that seem to have OCR already) + local arguments="\ +--help (show help message) +--language (language(s) of the file to be OCRed) +--image-dpi (assume this DPI if 
input image DPI is unknown) +--output-type (select PDF output options) +--sidecar (write OCR to text file) +--version (print program version and exit) +--jobs (how many worker processes to use) +--quiet (suppress INFO messages) +--verbose (set verbosity level) +--title (set metadata) +--author (set metadata) +--subject (set metadata) +--keywords (set metadata) +--rotate-pages (rotate pages to correct orientation) +--remove-background (attempt to remove background from pages) +--deskew (fix small horizontal alignment skew) +--clean (clean document images before OCR) +--clean-final (clean document images and keep result) +--unpaper-args (a quoted string of arguments to pass to unpaper) +--oversample (oversample images to this DPI) +--remove-vectors (don\'t send vector objects to OCR) +--threshold (threshold images before OCR) +--force-ocr (OCR documents that already have printable text) +--skip-text (skip OCR on any pages that already contain text) +--redo-ocr (redo OCR on any pages that seem to have OCR already) --invalidate-digital-signatures (remove digital signatures from PDF) ---skip-big (skip OCR on pages larger than this many MPixels) ---optimize (select optimization level) ---jpeg-quality (JPEG quality [0..100]) ---png-quality (PNG quality [0..100]) ---jbig2-lossy (enable lossy JBIG2 (see docs)) ---pages (apply OCR to only the specified pages) ---max-image-mpixels (image decompression bomb threshold) ---pdf-renderer (select PDF renderer options) ---rotate-pages-threshold (page rotation confidence) ---pdfa-image-compression (set PDF/A image compression options) ---fast-web-view (if file size if above this amount in MB linearize PDF) ---plugin (name of plugin to import) ---keep-temporary-files (keep temporary files (debug) ---tesseract-config (set custom tesseract config file) ---tesseract-pagesegmode (set tesseract --psm) ---tesseract-oem (set tesseract --oem) ---tesseract-thresholding (set tesseract image thresholding) ---tesseract-timeout (maximum number of 
seconds to wait for OCR) ---user-words (specify location of user words file) ---user-patterns (specify location of user patterns file) ---no-progress-bar (disable the progress bar) +--skip-big (skip OCR on pages larger than this many MPixels) +--optimize (select optimization level) +--jpeg-quality (JPEG quality [0..100]) +--png-quality (PNG quality [0..100]) +--jbig2-lossy (enable lossy JBIG2 (see docs)) +--pages (apply OCR to only the specified pages) +--max-image-mpixels (image decompression bomb threshold) +--pdf-renderer (select PDF renderer options) +--rotate-pages-threshold (page rotation confidence) +--pdfa-image-compression (set PDF/A image compression options) +--fast-web-view (if file size if above this amount in MB linearize PDF) +--plugin (name of plugin to import) +--keep-temporary-files (keep temporary files (debug) +--tesseract-config (set custom tesseract config file) +--tesseract-pagesegmode (set tesseract --psm) +--tesseract-oem (set tesseract --oem) +--tesseract-thresholding (set tesseract image thresholding) +--tesseract-timeout (maximum number of seconds to wait for OCR) +--user-words (specify location of user words file) +--user-patterns (specify location of user patterns file) +--no-progress-bar (disable the progress bar) +--color-conversion-strategy (select color conversion strategy) " COMPREPLY=( $( compgen -W "$arguments" -- "$cur") ) @@ -192,6 +194,20 @@ sauvola (use Sauvola thresholding)" fi } +__ocrmypdf_color-conversion-strategy() +{ + local choices="LeaveColorUnchanged (default) +CMYK (convert to CMYK) +Gray (convert to grayscale) +RGB (convert to RGB) +UseDeviceIndependentColor (convert with device independent color)" + + COMPREPLY=( $( compgen -W "$choices" -- "$cur") ) + # Remove description if only one completion exists + if [[ ${#COMPREPLY[*]} -eq 1 ]]; then + COMPREPLY=( ${COMPREPLY[0]%% *} ) + fi +} __ocrmypdf_check_previous() { @@ -251,6 +267,10 @@ __ocrmypdf_check_previous() _filedir return 0 ;; + --color-conversion-strategy) 
+ __ocrmypdf_color-conversion-strategy + return 0 + ;; esac return 1 diff --git a/misc/completion/ocrmypdf.fish b/misc/completion/ocrmypdf.fish index 12b7ea9c..c279c6cb 100644 --- a/misc/completion/ocrmypdf.fish +++ b/misc/completion/ocrmypdf.fish @@ -129,4 +129,27 @@ complete -c ocrmypdf -r -l user-words -d "specify location of user words file" complete -c ocrmypdf -r -l user-patterns -d "specify location of user patterns file" complete -c ocrmypdf -x -l fast-web-view -d "if file size if above this amount in MB, linearize PDF" -complete -c ocrmypdf -x -a "(__fish_complete_suffix .pdf; __fish_complete_suffix .PDF; __fish_complete_suffix .jpg; __fish_complete_suffix .png)" +function __fish_ocrmypdf_color_conversion_strategy + echo -e "LeaveColorUnchanged\t"(_ "do not convert color spaces (default)") + echo -e "CMYK\t"(_ "convert all color spaces to CMYK") + echo -e "Gray\t"(_ "convert all color spaces to grayscale") + echo -e "RGB\t"(_ "convert all color spaces to RGB") + echo -e "UseDeviceIndependentColor\t"(_ "convert all color spaces to ICC-based color spaces") +end + +complete -c ocrmypdf -x -l color-conversion-strategy -a '(__fish_ocrmypdf_color_conversion_strategy)' -d "set color conversion strategy" + +function __fish_ocrmypdf_input_file_given + set -l tokens (commandline -opc) + for token in $tokens + if string match -q -r '^-' -- $token + continue + end + if test -f "$token" + return 0 + end + end + return 1 +end + +complete -c ocrmypdf -x -n 'not __fish_ocrmypdf_input_file_given' -a "(__fish_complete_suffix .pdf)" -d "input file" diff --git a/src/ocrmypdf/_exec/ghostscript.py b/src/ocrmypdf/_exec/ghostscript.py index 001cbfbf..e5210b33 100644 --- a/src/ocrmypdf/_exec/ghostscript.py +++ b/src/ocrmypdf/_exec/ghostscript.py @@ -28,6 +28,17 @@ except AttributeError: # Pillow 9 shim Transpose = Image # type: ignore + +COLOR_CONVERSION_STRATEGIES = frozenset( + [ + 'CMYK', + 'Gray', + 'LeaveColorUnchanged', + 'RGB', + 'UseDeviceIndependentColor', + ] +) + log = 
logging.getLogger(__name__) @@ -151,6 +162,7 @@ def generate_pdfa( output_file: os.PathLike, *, compression: str, + color_conversion_strategy: str, pdf_version: str = '1.5', pdfa_part: str = '2', progressbar_class=None, @@ -200,16 +212,16 @@ def generate_pdfa( "-dBATCH", "-dNOPAUSE", "-dSAFER", - "-dCompatibilityLevel=" + str(pdf_version), + f"-dCompatibilityLevel={str(pdf_version)}", "-sDEVICE=pdfwrite", "-dAutoRotatePages=/None", - "-sColorConversionStrategy=" + strategy, + f"-sColorConversionStrategy={color_conversion_strategy}", ] + (['-dPDFSTOPONERROR'] if stop_on_error else []) + compression_args + [ "-dJPEGQ=95", - "-dPDFA=" + pdfa_part, + f"-dPDFA={pdfa_part}", "-dPDFACompatibilityPolicy=1", "-o", "-", diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index 4d4eab66..01d23003 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -359,8 +359,12 @@ def is_ocr_required(page_context: PageContext) -> bool: def rasterize_preview(input_file: Path, page_context: PageContext) -> Path: """Generate a lower quality preview image.""" output_file = page_context.get_path('rasterize_preview.jpg') - canvas_dpi = get_canvas_square_dpi(page_context.pageinfo, page_context.options) - page_dpi = get_page_square_dpi(page_context.pageinfo, page_context.options) + canvas_dpi = Resolution(300.0, 300.0).take_min( + [get_canvas_square_dpi(page_context.pageinfo, page_context.options)] + ) + page_dpi = Resolution(300.0, 300.0).take_min( + [get_page_square_dpi(page_context.pageinfo, page_context.options)] + ) page_context.plugin_manager.hook.rasterize_pdf_page( input_file=input_file, output_file=output_file, @@ -490,6 +494,21 @@ def rasterize( canvas_dpi = get_canvas_square_dpi(pageinfo, page_context.options) page_dpi = get_page_square_dpi(pageinfo, page_context.options) + dpi_profile = pageinfo.page_dpi_profile() + if dpi_profile and dpi_profile.average_to_max_dpi_ratio < 0.8: + log.warning( + "Weight average DPI is %0.1f, max DPI is %0.1f. 
" + "The discrepancy may indicate a high detail region on this page, " + "but could also indicate a problem with the input PDF file. " + "An image will be rendered at %0.1f DPI.", + dpi_profile.weighted_dpi, + dpi_profile.max_dpi, + dpi_profile.weighted_dpi, + ) + canvas_dpi = page_dpi = Resolution( + dpi_profile.weighted_dpi, dpi_profile.weighted_dpi + ) + page_context.plugin_manager.hook.rasterize_pdf_page( input_file=input_file, output_file=output_file, @@ -792,7 +811,7 @@ def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext) - pdf_pages=[fix_docinfo_file], pdfmark=input_ps_stub, output_file=output_file, - compression=options.pdfa_image_compression, + context=context, pdfa_part=options.output_type[-1], # is pdfa-1, pdfa-2, or pdfa-3 progressbar_class=( context.plugin_manager.hook.get_progressbar_class() diff --git a/src/ocrmypdf/_validation.py b/src/ocrmypdf/_validation.py index 52006faf..ef378e1a 100644 --- a/src/ocrmypdf/_validation.py +++ b/src/ocrmypdf/_validation.py @@ -205,16 +205,6 @@ def check_options_ocr_behavior(options: Namespace) -> None: options.pages = _pages_from_ranges(options.pages) -def check_options_advanced(options: Namespace) -> None: - if options.pdfa_image_compression != 'auto' and not options.output_type.startswith( - 'pdfa' - ): - log.warning( - "--pdfa-image-compression argument only applies when " - "--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'" - ) - - def check_options_metadata(options: Namespace) -> None: docinfo = [options.title, options.author, options.keywords, options.subject] for s in (m for m in docinfo if m): @@ -241,7 +231,6 @@ def _check_plugin_invariant_options(options: Namespace) -> None: check_options_sidecar(options) check_options_preprocessing(options) check_options_ocr_behavior(options) - check_options_advanced(options) check_options_pillow(options) diff --git a/src/ocrmypdf/builtin_plugins/ghostscript.py b/src/ocrmypdf/builtin_plugins/ghostscript.py index 44570e2f..895236b1 100644 
--- a/src/ocrmypdf/builtin_plugins/ghostscript.py +++ b/src/ocrmypdf/builtin_plugins/ghostscript.py @@ -18,6 +18,33 @@ log = logging.getLogger(__name__) BLACKLISTED_GS_VERSIONS = frozenset() +@hookimpl +def add_options(parser): + gs = parser.add_argument_group("Ghostscript", "Advanced control of Ghostscript") + gs.add_argument( + '--color-conversion-strategy', + action='store', + type=str, + metavar='STRATEGY', + choices=ghostscript.COLOR_CONVERSION_STRATEGIES, + default='LeaveColorUnchanged', + help="Set Ghostscript color conversion strategy", + ) + gs.add_argument( + '--pdfa-image-compression', + choices=['auto', 'jpeg', 'lossless'], + default='auto', + help="Specify how to compress images in the output PDF/A. 'auto' lets " + "OCRmyPDF decide. 'jpeg' changes all grayscale and color images to " + "JPEG compression. 'lossless' uses PNG-style lossless compression " + "for all images. Monochrome images are always compressed using a " + "lossless codec. Compression settings " + "are applied to all pages, including those for which OCR was " + "skipped. 
Not supported for --output-type=pdf ; that setting " + "preserves the original compression of all images.", + ) + + @hookimpl def check_options(options): """Check that the options are valid for this plugin.""" @@ -37,6 +64,17 @@ def check_options(options): if options.output_type == 'pdfa': options.output_type = 'pdfa-2' + if options.color_conversion_strategy not in ghostscript.COLOR_CONVERSION_STRATEGIES: + raise ValueError( + f"Invalid color conversion strategy: {options.color_conversion_strategy}" + ) + if options.pdfa_image_compression != 'auto' and not options.output_type.startswith( + 'pdfa' + ): + log.warning( + "--pdfa-image-compression argument only applies when " + "--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'" + ) @hookimpl @@ -71,7 +109,7 @@ def generate_pdfa( pdf_pages, pdfmark, output_file, - compression, + context, pdf_version, pdfa_part, progressbar_class, @@ -81,7 +119,8 @@ def generate_pdfa( ghostscript.generate_pdfa( pdf_pages=[*pdf_pages, pdfmark], output_file=output_file, - compression=compression, + compression=context.options.pdfa_image_compression, + color_conversion_strategy=context.options.color_conversion_strategy, pdf_version=pdf_version, pdfa_part=pdfa_part, progressbar_class=progressbar_class, diff --git a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py index ad6b6d85..81692831 100644 --- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py +++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py @@ -30,7 +30,7 @@ def add_options(parser): action='append', metavar='CFG', default=[], - help="Additional Tesseract configuration files -- see documentation", + help="Additional Tesseract configuration files -- see documentation.", ) tess.add_argument( '--tesseract-pagesegmode', @@ -38,7 +38,7 @@ def add_options(parser): type=int, metavar='PSM', choices=range(0, 14), - help="Set Tesseract page segmentation mode (see tesseract --help)", + help="Set Tesseract page segmentation mode (see tesseract 
--help).", ) tess.add_argument( '--tesseract-oem', @@ -75,7 +75,10 @@ def add_options(parser): metavar='SECONDS', help=( "Give up on OCR after the timeout, but copy the preprocessed page " - "into the final output." + "into the final output. This timeout is only used when using Tesseract " + "for OCR. When Tesseract is used for other operations such as " + "deskewing and orientation, the timeout is controlled by " + "--tesseract-non-ocr-timeout." ), ) tess.add_argument( @@ -175,6 +178,15 @@ def validate(pdfinfo, options): tess_threads = int(os.environ['OMP_THREAD_LIMIT']) log.debug("Using Tesseract OpenMP thread limit %d", tess_threads) + if ( + options.tesseract_downsample_above != 32767 + and not options.tesseract_downsample_large_images + ): + log.warning( + "The --tesseract-downsample-above argument will have no effect unless " + "--tesseract-downsample-large-images is also given." + ) + @hookimpl def filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image: diff --git a/src/ocrmypdf/cli.py b/src/ocrmypdf/cli.py index 34c8973e..3d658d0e 100644 --- a/src/ocrmypdf/cli.py +++ b/src/ocrmypdf/cli.py @@ -177,7 +177,9 @@ Online documentation is located at: '--image-dpi', metavar='DPI', type=int, - help="For input image instead of PDF, use this DPI instead of file's.", + help="When the input file is an image, not a PDF, use this DPI instead " + "of the DPI claimed by the input file. If the input does not claim a " + "sensible DPI, this option will be required.", ) parser.add_argument( '--output-type', @@ -402,19 +404,6 @@ Online documentation is located at: help="Only rotate pages when confidence is above this value (arbitrary " "units reported by tesseract)", ) - advanced.add_argument( - '--pdfa-image-compression', - choices=['auto', 'jpeg', 'lossless'], - default='auto', - help="Specify how to compress images in the output PDF/A. 'auto' lets " - "OCRmyPDF decide. 'jpeg' changes all grayscale and color images to " - "JPEG compression. 
'lossless' uses PNG-style lossless compression " - "for all images. Monochrome images are always compressed using a " - "lossless codec. Compression settings " - "are applied to all pages, including those for which OCR was " - "skipped. Not supported for --output-type=pdf ; that setting " - "preserves the original compression of all images.", - ) advanced.add_argument( '--fast-web-view', type=numeric(float, 0), diff --git a/src/ocrmypdf/helpers.py b/src/ocrmypdf/helpers.py index 2ab3392c..29532235 100644 --- a/src/ocrmypdf/helpers.py +++ b/src/ocrmypdf/helpers.py @@ -15,7 +15,16 @@ from contextlib import suppress from io import StringIO from math import isclose, isfinite from pathlib import Path -from typing import Any, Generic, Sequence, SupportsFloat, SupportsRound, TypeVar +from statistics import harmonic_mean +from typing import ( + Any, + Callable, + Generic, + Sequence, + SupportsFloat, + SupportsRound, + TypeVar, +) import img2pdf import pikepdf @@ -73,17 +82,38 @@ class Resolution(Generic[T]): return isfinite(self.x) and isfinite(self.y) return True + def to_scalar(self) -> float: + """Return the harmonic mean of x and y as a 1D approximation. + + In most cases, Resolution is 2D, but typically it is "square" (x == y) and + can be approximated as a single number. When not square, the harmonic mean + is used to approximate the 2D resolution as a single number. 
+ """ + return harmonic_mean([self.x, self.y]) + + def _take_minmax( + self, vals: Iterable[Any], yvals: Iterable[Any], cmp: Callable + ) -> Resolution: + """Return a new Resolution object with the maximum resolution of inputs.""" + if yvals is not None: + return Resolution(cmp(self.x, *vals), cmp(self.y, *yvals)) + cmp_x, cmp_y = self.x, self.y + for x, y in vals: + cmp_x = cmp(x, cmp_x) + cmp_y = cmp(y, cmp_y) + return Resolution(cmp_x, cmp_y) + def take_max( self, vals: Iterable[Any], yvals: Iterable[Any] | None = None ) -> Resolution: """Return a new Resolution object with the maximum resolution of inputs.""" - if yvals is not None: - return Resolution(max(self.x, *vals), max(self.y, *yvals)) - max_x, max_y = self.x, self.y - for x, y in vals: - max_x = max(x, max_x) - max_y = max(y, max_y) - return Resolution(max_x, max_y) + return self._take_minmax(vals, yvals, max) + + def take_min( + self, vals: Iterable[Any], yvals: Iterable[Any] | None = None + ) -> Resolution: + """Return a new Resolution object with the minimum resolution of inputs.""" + return self._take_minmax(vals, yvals, min) def flip_axis(self) -> Resolution[T]: """Return a new Resolution object with x and y swapped.""" @@ -95,11 +125,11 @@ class Resolution(Generic[T]): def __str__(self): """Return a string representation of the resolution.""" - return f"{self.x:f}x{self.y:f}" + return f"{self.x:f}×{self.y:f}" def __repr__(self): # pragma: no cover """Return a repr() of the resolution.""" - return f"Resolution({self.x}x{self.y} dpi)" + return f"Resolution({self.x!r}, {self.y!r})" def __eq__(self, other): """Return True if the resolution is equal to another resolution.""" diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py index 4122317b..56030e60 100644 --- a/src/ocrmypdf/pdfinfo/info.py +++ b/src/ocrmypdf/pdfinfo/info.py @@ -420,12 +420,12 @@ class ImageInfo: return self._type @property - def width(self): + def width(self) -> int: """Width of the image in pixels.""" return 
self._width @property - def height(self): + def height(self) -> int: """Height of the image in pixels.""" return self._height @@ -458,17 +458,24 @@ class ImageInfo: return self.dpi.is_finite and self.width >= 0 and self.height >= 0 @property - def dpi(self): + def dpi(self) -> Resolution: """Dots per inch of the image. Calculated based on where and how the image is drawn in the PDF. """ return _get_dpi(self._shorthand, (self._width, self._height)) + @property + def printed_area(self) -> float: + """Physical area of the image in square inches.""" + if not self.renderable: + return 0.0 + return float(self.width * self.dpi.x * self.height * self.dpi.y) + def __repr__(self): """Return a string representation of the image.""" return ( - f"" ) @@ -747,12 +754,38 @@ def _pdf_pageinfo_concurrent( return pages +class PageResolutionProfile(NamedTuple): + """Information about the resolutions of a page.""" + + weighted_dpi: float + """The weighted average DPI of the page, weighted by the area of each image.""" + + max_dpi: float + """The maximum DPI of an image on the page.""" + + average_to_max_dpi_ratio: float + """The average DPI of the page divided by the maximum DPI of the page. + + This indicates the intensity of the resolution variation on the page. + + If the average is 1.0 or close to 1.0, has all of its content at a uniform + resolution. If the average is much lower than 1.0, some content is at a + higher resolution than the rest of the page. + """ + + area_ratio: float + """The maximum-DPI area of the page divided by the total drawn area. + + This indicates the prevalence of high-resolution content on the page. 
+ """ + + class PageInfo: """Information about type of contents on each page in a PDF.""" _has_text: bool | None _has_vector: bool | None - _images: list[ImageInfo] + _images: list[ImageInfo] = [] def __init__( self, @@ -939,6 +972,44 @@ class PageInfo: else: return '1.5' + def page_dpi_profile(self) -> PageResolutionProfile | None: + """Return information about the DPIs of the page. + + This is useful to detect pages with a small proportion of high-resolution + content that is forcing us to use a high DPI for the whole page. The ratio + is weighted by the area of each image. If images overlap, the overlapped + area counts. + + Vector graphics and text are ignored. + + Returns None if there is no meaningful DPI for the page. + """ + image_dpis = [ + image.dpi.to_scalar() for image in self._images if image.renderable + ] + image_areas = [image.printed_area for image in self._images if image.renderable] + total_drawn_area = sum(image_areas) + if total_drawn_area == 0: + return None + + weights = [area / total_drawn_area for area in image_areas] + # Calculate harmonic mean of DPIs weighted by area + # When the minimum version is Python 3.10, change this to + # statistics.harmonic_mean with the weights parameter + # rather than doing it manually. 
+ weighted_dpi = sum(weights) / sum( + weight / dpi for weight, dpi in zip(weights, image_dpis) + ) + max_dpi = max(image_dpis) + dpi_average_max_ratio = weighted_dpi / max_dpi + + arg_max_dpi = image_dpis.index(max_dpi) + max_area_ratio = image_areas[arg_max_dpi] / total_drawn_area + + return PageResolutionProfile( + weighted_dpi, max_dpi, dpi_average_max_ratio, max_area_ratio + ) + def __repr__(self): """Return string representation.""" return ( diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py index 46f2944e..00c8a4df 100644 --- a/src/ocrmypdf/pluginspec.py +++ b/src/ocrmypdf/pluginspec.py @@ -351,7 +351,6 @@ def filter_pdf_page(page: PageContext, image_filename: Path, output_pdf: Path) - This hook will be called from child processes. Modifying global state will not affect the main process or other child processes. - Note: This is a :ref:`firstresult hook`. """ @@ -466,7 +465,7 @@ def generate_pdfa( pdf_pages: list[Path], pdfmark: Path, output_file: Path, - compression: str, + context: PdfContext, pdf_version: str, pdfa_part: str, progressbar_class, @@ -484,11 +483,7 @@ def generate_pdfa( pdfmark: A PostScript file intended for Ghostscript with details on how to perform the PDF/A conversion. output_file: The name of the desired output file. - compression: One of ``'jpeg'``, ``'lossless'``, ``''``. For ``'jpeg'``, - the PDF/A generator should convert all images to JPEG encoding where - possible. For lossless, all images should be converted to FlateEncode - (lossless PNG). If an empty string, the PDF generator should make its - own decisions about how to encode images. + context: The current context. pdf_version: The minimum PDF version that the output file should be. At its own discretion, the PDF/A generator may raise the version, but should not lower it. @@ -514,6 +509,11 @@ def generate_pdfa( Note: This is a :ref:`firstresult hook`. + Note: + Before version 15.0.0, the ``context`` was not provided and ``compression`` + was provided instead. 
Plugins should now read the context object to determine + if compression is requested. + See Also: https://github.com/tqdm/tqdm """ diff --git a/tests/plugins/gs_feature_elision.py b/tests/plugins/gs_feature_elision.py index dcd4c849..5f914dde 100644 --- a/tests/plugins/gs_feature_elision.py +++ b/tests/plugins/gs_feature_elision.py @@ -20,14 +20,14 @@ def run_append_stderr(*args, **kwargs): @hookimpl -def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part): +def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part): with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock: mock.side_effect = run_append_stderr ghostscript.generate_pdfa( pdf_pages=pdf_pages, pdfmark=pdfmark, output_file=output_file, - compression=compression, + context=context, pdf_version=pdf_version, pdfa_part=pdfa_part, progressbar_class=None, diff --git a/tests/plugins/gs_pdfa_failure.py b/tests/plugins/gs_pdfa_failure.py index e9ec9697..9dc742e5 100644 --- a/tests/plugins/gs_pdfa_failure.py +++ b/tests/plugins/gs_pdfa_failure.py @@ -22,14 +22,14 @@ def run_rig_args(args, **kwargs): @hookimpl -def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part): +def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part): with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock: mock.side_effect = run_rig_args ghostscript.generate_pdfa( pdf_pages=pdf_pages, pdfmark=pdfmark, output_file=output_file, - compression=compression, + context=context, pdf_version=pdf_version, pdfa_part=pdfa_part, progressbar_class=None, diff --git a/tests/plugins/gs_render_failure.py b/tests/plugins/gs_render_failure.py index 56ec3905..4d955354 100644 --- a/tests/plugins/gs_render_failure.py +++ b/tests/plugins/gs_render_failure.py @@ -17,14 +17,14 @@ def raise_gs_fail(*args, **kwargs): @hookimpl -def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part): +def 
generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part): with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock: mock.side_effect = raise_gs_fail ghostscript.generate_pdfa( pdf_pages=pdf_pages, pdfmark=pdfmark, output_file=output_file, - compression=compression, + context=context, pdf_version=pdf_version, pdfa_part=pdfa_part, progressbar_class=None, diff --git a/tests/plugins/gs_render_soft_error.py b/tests/plugins/gs_render_soft_error.py index f80e0401..e276194a 100644 --- a/tests/plugins/gs_render_soft_error.py +++ b/tests/plugins/gs_render_soft_error.py @@ -23,7 +23,7 @@ def generate_pdfa( pdf_pages, pdfmark, output_file, - compression, + context, pdf_version, pdfa_part, stop_on_soft_error, @@ -34,7 +34,7 @@ def generate_pdfa( pdf_pages=pdf_pages, pdfmark=pdfmark, output_file=output_file, - compression=compression, + context=context, pdf_version=pdf_version, pdfa_part=pdfa_part, progressbar_class=None, diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 3ef661bf..48a24854 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -86,7 +86,8 @@ def test_unset_metadata(output_type, field, resources, outpdf): 'title': b'NFY5f7Ft2DWMkxLhXwxvFf7eWR2KeK3vEDcd', 'author': b'yXaryipxyRk9dVjWjSSaVaNCKeLRgEVzPRMp', 'subject': b't49vimctvnuH7ZeAjAkv52ACvWFjcnm5MPJr', - 'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh'} + 'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh', + } p = run_ocrmypdf( input_file, @@ -352,17 +353,24 @@ XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d' def test_prevent_gs_invalid_xml(resources, outdir): generate_pdfa_ps(outdir / 'pdfa.ps') - copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf') # Inject a string with a trailing nul character into the DocumentInfo # dictionary of this PDF, as often occurs in practice. 
- with pikepdf.open(outdir / 'layers.rendered.pdf') as pike: + with pikepdf.open(resources / 'trivial.pdf') as pike: pike.Root.DocumentInfo = pikepdf.Dictionary( Title=b'String with trailing nul\x00' ) + pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False) - options = get_parser().parse_args( - args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf'] + _, options, _ = get_parser_options_plugins( + args=[ + '-j', + '1', + '--output-type', + 'pdfa-2', + 'a.pdf', + 'b.pdf', + ] ) pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf') context = PdfContext( @@ -387,17 +395,15 @@ def test_prevent_gs_invalid_xml(resources, outdir): def test_malformed_docinfo(caplog, resources, outdir): generate_pdfa_ps(outdir / 'pdfa.ps') - # copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf') with pikepdf.open(resources / 'trivial.pdf') as pike: pike.trailer.Info = pikepdf.Stream(pike, b"") pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False) - options = get_parser().parse_args( + _, options, _ = get_parser_options_plugins( args=[ '-j', '1', - '--continue-on-soft-render-error', '--output-type', 'pdfa-2', 'a.pdf',