mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-06 21:56:21 -04:00
Merge branch 'feature/fix-raster-dpi-too-high' into v15
This commit is contained in:
@@ -114,6 +114,41 @@ exceed a certain number of megapixels with ``--skip-big``. (A 300 DPI,
|
||||
# Allow 300 seconds for OCR; skip any page larger than 50 megapixels
|
||||
ocrmypdf --tesseract-timeout 300 --skip-big 50 bigfile.pdf output.pdf
|
||||
|
||||
OCR for huge images
|
||||
-------------------
|
||||
|
||||
Separate from these settings, Tesseract has internal limits on the size
|
||||
of images it will process. If you issue
|
||||
``--tesseract-downsample-large-images``, OCRmyPDF will downsample images
|
||||
to fit Tesseract limits. (The limits are usually encountered only for scanned
|
||||
images of oversized media, such as large maps or blueprints exceeding
|
||||
110 cm or 43 inches in either dimension, and at high DPI.)
|
||||
|
||||
``--tesseract-downsample-above`` adjusts the threshold at which images
|
||||
will be downsampled. By default, only images that exceed any of Tesseract's
|
||||
internal limits are downsampled.
|
||||
|
||||
You will also need to set ``--tesseract-timeout`` high enough to allow
|
||||
for processing.
|
||||
|
||||
Only the image sent for OCR is downsampled. The original image is
|
||||
preserved.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Allow 600 seconds for OCR on huge images
|
||||
ocrmypdf --tesseract-timeout 600 \
|
||||
--tesseract-downsample-large-images \
|
||||
bigfile.pdf output.pdf
|
||||
|
||||
# Downsample images above 5000 pixels on the longest dimension to
|
||||
# 5000 pixels
|
||||
ocrmypdf --tesseract-timeout 120 \
|
||||
--tesseract-downsample-large-images \
|
||||
--tesseract-downsample-above 5000 \
|
||||
bigfile.pdf output_downsampled_ocr.pdf
|
||||
|
||||
|
||||
Overriding default tesseract
|
||||
----------------------------
|
||||
|
||||
|
||||
@@ -6,53 +6,55 @@ set -o errexit
|
||||
|
||||
__ocrmypdf_arguments()
|
||||
{
|
||||
local arguments="--help (show help message)
|
||||
--language (language(s) of the file to be OCRed)
|
||||
--image-dpi (assume this DPI if input image DPI is unknown)
|
||||
--output-type (select PDF output options)
|
||||
--sidecar (write OCR to text file)
|
||||
--version (print program version and exit)
|
||||
--jobs (how many worker processes to use)
|
||||
--quiet (suppress INFO messages)
|
||||
--verbose (set verbosity level)
|
||||
--title (set metadata)
|
||||
--author (set metadata)
|
||||
--subject (set metadata)
|
||||
--keywords (set metadata)
|
||||
--rotate-pages (rotate pages to correct orientation)
|
||||
--remove-background (attempt to remove background from pages)
|
||||
--deskew (fix small horizontal alignment skew)
|
||||
--clean (clean document images before OCR)
|
||||
--clean-final (clean document images and keep result)
|
||||
--unpaper-args (a quoted string of arguments to pass to unpaper)
|
||||
--oversample (oversample images to this DPI)
|
||||
--remove-vectors (don\'t send vector objects to OCR)
|
||||
--threshold (threshold images before OCR)
|
||||
--force-ocr (OCR documents that already have printable text)
|
||||
--skip-text (skip OCR on any pages that already contain text)
|
||||
--redo-ocr (redo OCR on any pages that seem to have OCR already)
|
||||
local arguments="\
|
||||
--help (show help message)
|
||||
--language (language(s) of the file to be OCRed)
|
||||
--image-dpi (assume this DPI if input image DPI is unknown)
|
||||
--output-type (select PDF output options)
|
||||
--sidecar (write OCR to text file)
|
||||
--version (print program version and exit)
|
||||
--jobs (how many worker processes to use)
|
||||
--quiet (suppress INFO messages)
|
||||
--verbose (set verbosity level)
|
||||
--title (set metadata)
|
||||
--author (set metadata)
|
||||
--subject (set metadata)
|
||||
--keywords (set metadata)
|
||||
--rotate-pages (rotate pages to correct orientation)
|
||||
--remove-background (attempt to remove background from pages)
|
||||
--deskew (fix small horizontal alignment skew)
|
||||
--clean (clean document images before OCR)
|
||||
--clean-final (clean document images and keep result)
|
||||
--unpaper-args (a quoted string of arguments to pass to unpaper)
|
||||
--oversample (oversample images to this DPI)
|
||||
--remove-vectors (don\'t send vector objects to OCR)
|
||||
--threshold (threshold images before OCR)
|
||||
--force-ocr (OCR documents that already have printable text)
|
||||
--skip-text (skip OCR on any pages that already contain text)
|
||||
--redo-ocr (redo OCR on any pages that seem to have OCR already)
|
||||
--invalidate-digital-signatures (remove digital signatures from PDF)
|
||||
--skip-big (skip OCR on pages larger than this many MPixels)
|
||||
--optimize (select optimization level)
|
||||
--jpeg-quality (JPEG quality [0..100])
|
||||
--png-quality (PNG quality [0..100])
|
||||
--jbig2-lossy (enable lossy JBIG2 (see docs))
|
||||
--pages (apply OCR to only the specified pages)
|
||||
--max-image-mpixels (image decompression bomb threshold)
|
||||
--pdf-renderer (select PDF renderer options)
|
||||
--rotate-pages-threshold (page rotation confidence)
|
||||
--pdfa-image-compression (set PDF/A image compression options)
|
||||
--fast-web-view (if file size if above this amount in MB linearize PDF)
|
||||
--plugin (name of plugin to import)
|
||||
--keep-temporary-files (keep temporary files (debug)
|
||||
--tesseract-config (set custom tesseract config file)
|
||||
--tesseract-pagesegmode (set tesseract --psm)
|
||||
--tesseract-oem (set tesseract --oem)
|
||||
--tesseract-thresholding (set tesseract image thresholding)
|
||||
--tesseract-timeout (maximum number of seconds to wait for OCR)
|
||||
--user-words (specify location of user words file)
|
||||
--user-patterns (specify location of user patterns file)
|
||||
--no-progress-bar (disable the progress bar)
|
||||
--skip-big (skip OCR on pages larger than this many MPixels)
|
||||
--optimize (select optimization level)
|
||||
--jpeg-quality (JPEG quality [0..100])
|
||||
--png-quality (PNG quality [0..100])
|
||||
--jbig2-lossy (enable lossy JBIG2 (see docs))
|
||||
--pages (apply OCR to only the specified pages)
|
||||
--max-image-mpixels (image decompression bomb threshold)
|
||||
--pdf-renderer (select PDF renderer options)
|
||||
--rotate-pages-threshold (page rotation confidence)
|
||||
--pdfa-image-compression (set PDF/A image compression options)
|
||||
--fast-web-view (if file size if above this amount in MB linearize PDF)
|
||||
--plugin (name of plugin to import)
|
||||
--keep-temporary-files (keep temporary files (debug)
|
||||
--tesseract-config (set custom tesseract config file)
|
||||
--tesseract-pagesegmode (set tesseract --psm)
|
||||
--tesseract-oem (set tesseract --oem)
|
||||
--tesseract-thresholding (set tesseract image thresholding)
|
||||
--tesseract-timeout (maximum number of seconds to wait for OCR)
|
||||
--user-words (specify location of user words file)
|
||||
--user-patterns (specify location of user patterns file)
|
||||
--no-progress-bar (disable the progress bar)
|
||||
--color-conversion-strategy (select color conversion strategy)
|
||||
"
|
||||
|
||||
COMPREPLY=( $( compgen -W "$arguments" -- "$cur") )
|
||||
@@ -192,6 +194,20 @@ sauvola (use Sauvola thresholding)"
|
||||
fi
|
||||
}
|
||||
|
||||
# Complete the value of --color-conversion-strategy.
#
# Candidates are written one per line as "NAME (description)". Reads $cur
# from the calling scope and stores the matches in COMPREPLY.
__ocrmypdf_color-conversion-strategy()
{
    local choices="LeaveColorUnchanged (default)
CMYK (convert to CMYK)
Gray (convert to grayscale)
RGB (convert to RGB)
UseDeviceIndependentColor (convert with device independent color)"

    # NOTE(review): presumably the caller sets IFS to a newline so that each
    # "NAME (description)" line stays a single candidate -- confirm.
    COMPREPLY=( $( compgen -W "$choices" -- "$cur") )
    # Remove description if only one completion exists
    if [[ ${#COMPREPLY[*]} -eq 1 ]]; then
        # Keep only the text before the first space, i.e. the bare NAME.
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}
|
||||
|
||||
__ocrmypdf_check_previous()
|
||||
{
|
||||
@@ -251,6 +267,10 @@ __ocrmypdf_check_previous()
|
||||
_filedir
|
||||
return 0
|
||||
;;
|
||||
--color-conversion-strategy)
|
||||
__ocrmypdf_color-conversion-strategy
|
||||
return 0
|
||||
;;
|
||||
esac
|
||||
|
||||
return 1
|
||||
|
||||
@@ -129,4 +129,27 @@ complete -c ocrmypdf -r -l user-words -d "specify location of user words file"
|
||||
complete -c ocrmypdf -r -l user-patterns -d "specify location of user patterns file"
|
||||
complete -c ocrmypdf -x -l fast-web-view -d "if file size if above this amount in MB, linearize PDF"
|
||||
|
||||
complete -c ocrmypdf -x -a "(__fish_complete_suffix .pdf; __fish_complete_suffix .PDF; __fish_complete_suffix .jpg; __fish_complete_suffix .png)"
|
||||
# Print the candidate values for --color-conversion-strategy, one per line,
# each as "VALUE<TAB>description".
function __fish_ocrmypdf_color_conversion_strategy
    # NOTE(review): `_` is presumably fish's translation helper applied to the
    # description text -- confirm.
    echo -e "LeaveColorUnchanged\t"(_ "do not convert color spaces (default)")
    echo -e "CMYK\t"(_ "convert all color spaces to CMYK")
    echo -e "Gray\t"(_ "convert all color spaces to grayscale")
    echo -e "RGB\t"(_ "convert all color spaces to RGB")
    echo -e "UseDeviceIndependentColor\t"(_ "convert all color spaces to ICC-based color spaces")
end
|
||||
|
||||
complete -c ocrmypdf -x -l color-conversion-strategy -a '(__fish_ocrmypdf_color_conversion_strategy)' -d "set color conversion strategy"
|
||||
|
||||
# Return 0 if some non-option token already on the command line names an
# existing regular file, and 1 otherwise.
function __fish_ocrmypdf_input_file_given
    # Tokens of the current process, cut off at the cursor.
    set -l tokens (commandline -opc)
    for token in $tokens
        # Anything starting with "-" is treated as an option and skipped.
        if string match -q -r '^-' -- $token
            continue
        end
        # NOTE(review): a file passed as the value of an option (for example
        # --user-words words.txt) also satisfies this test -- confirm that
        # this is acceptable.
        if test -f "$token"
            return 0
        end
    end
    return 1
end
|
||||
|
||||
complete -c ocrmypdf -x -n 'not __fish_ocrmypdf_input_file_given' -a "(__fish_complete_suffix .pdf)" -d "input file"
|
||||
|
||||
@@ -28,6 +28,17 @@ except AttributeError:
|
||||
# Pillow 9 shim
|
||||
Transpose = Image # type: ignore
|
||||
|
||||
|
||||
COLOR_CONVERSION_STRATEGIES = frozenset(
|
||||
[
|
||||
'CMYK',
|
||||
'Gray',
|
||||
'LeaveColorUnchanged',
|
||||
'RGB',
|
||||
'UseDeviceIndependentColor',
|
||||
]
|
||||
)
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -151,6 +162,7 @@ def generate_pdfa(
|
||||
output_file: os.PathLike,
|
||||
*,
|
||||
compression: str,
|
||||
color_conversion_strategy: str,
|
||||
pdf_version: str = '1.5',
|
||||
pdfa_part: str = '2',
|
||||
progressbar_class=None,
|
||||
@@ -200,16 +212,16 @@ def generate_pdfa(
|
||||
"-dBATCH",
|
||||
"-dNOPAUSE",
|
||||
"-dSAFER",
|
||||
"-dCompatibilityLevel=" + str(pdf_version),
|
||||
f"-dCompatibilityLevel={str(pdf_version)}",
|
||||
"-sDEVICE=pdfwrite",
|
||||
"-dAutoRotatePages=/None",
|
||||
"-sColorConversionStrategy=" + strategy,
|
||||
f"-sColorConversionStrategy={color_conversion_strategy}",
|
||||
]
|
||||
+ (['-dPDFSTOPONERROR'] if stop_on_error else [])
|
||||
+ compression_args
|
||||
+ [
|
||||
"-dJPEGQ=95",
|
||||
"-dPDFA=" + pdfa_part,
|
||||
f"-dPDFA={pdfa_part}",
|
||||
"-dPDFACompatibilityPolicy=1",
|
||||
"-o",
|
||||
"-",
|
||||
|
||||
@@ -359,8 +359,12 @@ def is_ocr_required(page_context: PageContext) -> bool:
|
||||
def rasterize_preview(input_file: Path, page_context: PageContext) -> Path:
|
||||
"""Generate a lower quality preview image."""
|
||||
output_file = page_context.get_path('rasterize_preview.jpg')
|
||||
canvas_dpi = get_canvas_square_dpi(page_context.pageinfo, page_context.options)
|
||||
page_dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
|
||||
canvas_dpi = Resolution(300.0, 300.0).take_min(
|
||||
[get_canvas_square_dpi(page_context.pageinfo, page_context.options)]
|
||||
)
|
||||
page_dpi = Resolution(300.0, 300.0).take_min(
|
||||
[get_page_square_dpi(page_context.pageinfo, page_context.options)]
|
||||
)
|
||||
page_context.plugin_manager.hook.rasterize_pdf_page(
|
||||
input_file=input_file,
|
||||
output_file=output_file,
|
||||
@@ -490,6 +494,21 @@ def rasterize(
|
||||
canvas_dpi = get_canvas_square_dpi(pageinfo, page_context.options)
|
||||
page_dpi = get_page_square_dpi(pageinfo, page_context.options)
|
||||
|
||||
dpi_profile = pageinfo.page_dpi_profile()
|
||||
if dpi_profile and dpi_profile.average_to_max_dpi_ratio < 0.8:
|
||||
log.warning(
|
||||
"Weight average DPI is %0.1f, max DPI is %0.1f. "
|
||||
"The discrepancy may indicate a high detail region on this page, "
|
||||
"but could also indicate a problem with the input PDF file. "
|
||||
"An image will be rendered at %0.1f DPI.",
|
||||
dpi_profile.weighted_dpi,
|
||||
dpi_profile.max_dpi,
|
||||
dpi_profile.weighted_dpi,
|
||||
)
|
||||
canvas_dpi = page_dpi = Resolution(
|
||||
dpi_profile.weighted_dpi, dpi_profile.weighted_dpi
|
||||
)
|
||||
|
||||
page_context.plugin_manager.hook.rasterize_pdf_page(
|
||||
input_file=input_file,
|
||||
output_file=output_file,
|
||||
@@ -792,7 +811,7 @@ def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext) -
|
||||
pdf_pages=[fix_docinfo_file],
|
||||
pdfmark=input_ps_stub,
|
||||
output_file=output_file,
|
||||
compression=options.pdfa_image_compression,
|
||||
context=context,
|
||||
pdfa_part=options.output_type[-1], # is pdfa-1, pdfa-2, or pdfa-3
|
||||
progressbar_class=(
|
||||
context.plugin_manager.hook.get_progressbar_class()
|
||||
|
||||
@@ -205,16 +205,6 @@ def check_options_ocr_behavior(options: Namespace) -> None:
|
||||
options.pages = _pages_from_ranges(options.pages)
|
||||
|
||||
|
||||
def check_options_advanced(options: Namespace) -> None:
|
||||
if options.pdfa_image_compression != 'auto' and not options.output_type.startswith(
|
||||
'pdfa'
|
||||
):
|
||||
log.warning(
|
||||
"--pdfa-image-compression argument only applies when "
|
||||
"--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
|
||||
)
|
||||
|
||||
|
||||
def check_options_metadata(options: Namespace) -> None:
|
||||
docinfo = [options.title, options.author, options.keywords, options.subject]
|
||||
for s in (m for m in docinfo if m):
|
||||
@@ -241,7 +231,6 @@ def _check_plugin_invariant_options(options: Namespace) -> None:
|
||||
check_options_sidecar(options)
|
||||
check_options_preprocessing(options)
|
||||
check_options_ocr_behavior(options)
|
||||
check_options_advanced(options)
|
||||
check_options_pillow(options)
|
||||
|
||||
|
||||
|
||||
@@ -18,6 +18,33 @@ log = logging.getLogger(__name__)
|
||||
BLACKLISTED_GS_VERSIONS = frozenset()
|
||||
|
||||
|
||||
@hookimpl
def add_options(parser):
    """Register the Ghostscript-specific command line options.

    Adds a "Ghostscript" argument group to ``parser`` containing
    ``--color-conversion-strategy`` and ``--pdfa-image-compression``.

    Args:
        parser: The argument parser to which the option group is added.
    """
    gs = parser.add_argument_group("Ghostscript", "Advanced control of Ghostscript")
    gs.add_argument(
        '--color-conversion-strategy',
        action='store',
        type=str,
        metavar='STRATEGY',
        # Sorted so that argparse's "choose from ..." error lists the values
        # in a stable order; iterating the frozenset directly yields an order
        # that can change from run to run.
        choices=sorted(ghostscript.COLOR_CONVERSION_STRATEGIES),
        default='LeaveColorUnchanged',
        help="Set Ghostscript color conversion strategy",
    )
    gs.add_argument(
        '--pdfa-image-compression',
        choices=['auto', 'jpeg', 'lossless'],
        default='auto',
        help="Specify how to compress images in the output PDF/A. 'auto' lets "
        "OCRmyPDF decide.  'jpeg' changes all grayscale and color images to "
        "JPEG compression.  'lossless' uses PNG-style lossless compression "
        "for all images.  Monochrome images are always compressed using a "
        "lossless codec.  Compression settings "
        "are applied to all pages, including those for which OCR was "
        "skipped.  Not supported for --output-type=pdf ; that setting "
        "preserves the original compression of all images.",
    )
|
||||
|
||||
|
||||
@hookimpl
|
||||
def check_options(options):
|
||||
"""Check that the options are valid for this plugin."""
|
||||
@@ -37,6 +64,17 @@ def check_options(options):
|
||||
|
||||
if options.output_type == 'pdfa':
|
||||
options.output_type = 'pdfa-2'
|
||||
if options.color_conversion_strategy not in ghostscript.COLOR_CONVERSION_STRATEGIES:
|
||||
raise ValueError(
|
||||
f"Invalid color conversion strategy: {options.color_conversion_strategy}"
|
||||
)
|
||||
if options.pdfa_image_compression != 'auto' and not options.output_type.startswith(
|
||||
'pdfa'
|
||||
):
|
||||
log.warning(
|
||||
"--pdfa-image-compression argument only applies when "
|
||||
"--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
|
||||
)
|
||||
|
||||
|
||||
@hookimpl
|
||||
@@ -71,7 +109,7 @@ def generate_pdfa(
|
||||
pdf_pages,
|
||||
pdfmark,
|
||||
output_file,
|
||||
compression,
|
||||
context,
|
||||
pdf_version,
|
||||
pdfa_part,
|
||||
progressbar_class,
|
||||
@@ -81,7 +119,8 @@ def generate_pdfa(
|
||||
ghostscript.generate_pdfa(
|
||||
pdf_pages=[*pdf_pages, pdfmark],
|
||||
output_file=output_file,
|
||||
compression=compression,
|
||||
compression=context.options.pdfa_image_compression,
|
||||
color_conversion_strategy=context.options.color_conversion_strategy,
|
||||
pdf_version=pdf_version,
|
||||
pdfa_part=pdfa_part,
|
||||
progressbar_class=progressbar_class,
|
||||
|
||||
@@ -30,7 +30,7 @@ def add_options(parser):
|
||||
action='append',
|
||||
metavar='CFG',
|
||||
default=[],
|
||||
help="Additional Tesseract configuration files -- see documentation",
|
||||
help="Additional Tesseract configuration files -- see documentation.",
|
||||
)
|
||||
tess.add_argument(
|
||||
'--tesseract-pagesegmode',
|
||||
@@ -38,7 +38,7 @@ def add_options(parser):
|
||||
type=int,
|
||||
metavar='PSM',
|
||||
choices=range(0, 14),
|
||||
help="Set Tesseract page segmentation mode (see tesseract --help)",
|
||||
help="Set Tesseract page segmentation mode (see tesseract --help).",
|
||||
)
|
||||
tess.add_argument(
|
||||
'--tesseract-oem',
|
||||
@@ -75,7 +75,10 @@ def add_options(parser):
|
||||
metavar='SECONDS',
|
||||
help=(
|
||||
"Give up on OCR after the timeout, but copy the preprocessed page "
|
||||
"into the final output."
|
||||
"into the final output. This timeout is only used when using Tesseract "
|
||||
"for OCR. When Tesseract is used for other operations such as "
|
||||
"deskewing and orientation, the timeout is controlled by "
|
||||
"--tesseract-non-ocr-timeout."
|
||||
),
|
||||
)
|
||||
tess.add_argument(
|
||||
@@ -175,6 +178,15 @@ def validate(pdfinfo, options):
|
||||
tess_threads = int(os.environ['OMP_THREAD_LIMIT'])
|
||||
log.debug("Using Tesseract OpenMP thread limit %d", tess_threads)
|
||||
|
||||
if (
|
||||
options.tesseract_downsample_above != 32767
|
||||
and not options.tesseract_downsample_large_images
|
||||
):
|
||||
log.warning(
|
||||
"The --tesseract-downsample-above argument will have no effect unless "
|
||||
"--tesseract-downsample-large-images is also given."
|
||||
)
|
||||
|
||||
|
||||
@hookimpl
|
||||
def filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image:
|
||||
|
||||
@@ -177,7 +177,9 @@ Online documentation is located at:
|
||||
'--image-dpi',
|
||||
metavar='DPI',
|
||||
type=int,
|
||||
help="For input image instead of PDF, use this DPI instead of file's.",
|
||||
help="When the input file is an image, not a PDF, use this DPI instead "
|
||||
"of the DPI claimed by the input file. If the input does not claim a "
|
||||
"sensible DPI, this option will be required.",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--output-type',
|
||||
@@ -402,19 +404,6 @@ Online documentation is located at:
|
||||
help="Only rotate pages when confidence is above this value (arbitrary "
|
||||
"units reported by tesseract)",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--pdfa-image-compression',
|
||||
choices=['auto', 'jpeg', 'lossless'],
|
||||
default='auto',
|
||||
help="Specify how to compress images in the output PDF/A. 'auto' lets "
|
||||
"OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
|
||||
"JPEG compression. 'lossless' uses PNG-style lossless compression "
|
||||
"for all images. Monochrome images are always compressed using a "
|
||||
"lossless codec. Compression settings "
|
||||
"are applied to all pages, including those for which OCR was "
|
||||
"skipped. Not supported for --output-type=pdf ; that setting "
|
||||
"preserves the original compression of all images.",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--fast-web-view',
|
||||
type=numeric(float, 0),
|
||||
|
||||
@@ -15,7 +15,16 @@ from contextlib import suppress
|
||||
from io import StringIO
|
||||
from math import isclose, isfinite
|
||||
from pathlib import Path
|
||||
from typing import Any, Generic, Sequence, SupportsFloat, SupportsRound, TypeVar
|
||||
from statistics import harmonic_mean
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Generic,
|
||||
Sequence,
|
||||
SupportsFloat,
|
||||
SupportsRound,
|
||||
TypeVar,
|
||||
)
|
||||
|
||||
import img2pdf
|
||||
import pikepdf
|
||||
@@ -73,17 +82,38 @@ class Resolution(Generic[T]):
|
||||
return isfinite(self.x) and isfinite(self.y)
|
||||
return True
|
||||
|
||||
def to_scalar(self) -> float:
    """Collapse this resolution to a single representative number.

    A Resolution has an x and a y component. Usually the two are equal
    ("square"), in which case that common value is the result. When they
    differ, the harmonic mean of the two is used as the 1D approximation.
    """
    axes = (self.x, self.y)
    return harmonic_mean(axes)
|
||||
|
||||
def _take_minmax(
    self, vals: Iterable[Any], yvals: Iterable[Any] | None, cmp: Callable
) -> Resolution:
    """Combine this resolution with others, axis by axis, using ``cmp``.

    Shared implementation behind ``take_max`` and ``take_min``. This
    object's own x and y always take part in the comparison.

    Args:
        vals: If ``yvals`` is given, an iterable of x values. Otherwise an
            iterable of ``(x, y)`` pairs.
        yvals: An iterable of y values, or None when ``vals`` holds pairs.
        cmp: The selection function, ``max`` or ``min``. It must accept both
            ``cmp(a, b)`` and ``cmp(iterable)``, as the builtins do.

    Returns:
        A new Resolution holding the selected x and the selected y.
    """
    if yvals is not None:
        # Pass a single sequence rather than unpacked arguments: with an
        # empty iterable, cmp(self.x) would raise TypeError, whereas
        # cmp((self.x,)) simply yields this object's own value.
        return Resolution(cmp((self.x, *vals)), cmp((self.y, *yvals)))
    cmp_x, cmp_y = self.x, self.y
    for x, y in vals:
        cmp_x = cmp(x, cmp_x)
        cmp_y = cmp(y, cmp_y)
    return Resolution(cmp_x, cmp_y)
|
||||
|
||||
def take_max(
    self, vals: Iterable[Any], yvals: Iterable[Any] | None = None
) -> Resolution:
    """Return a new Resolution object with the maximum resolution of inputs.

    Args:
        vals: If ``yvals`` is given, an iterable of x values. Otherwise an
            iterable of ``(x, y)`` pairs.
        yvals: An optional iterable of y values.

    Returns:
        A new Resolution whose x and y are the largest seen on each axis,
        this object's own values included.
    """
    # Delegate to the helper shared with take_min instead of carrying a
    # second copy of the same loop.
    return self._take_minmax(vals, yvals, max)
|
||||
|
||||
def take_min(
    self, vals: Iterable[Any], yvals: Iterable[Any] | None = None
) -> Resolution:
    """Return a new Resolution holding the smallest x and y among the inputs.

    ``vals`` and ``yvals`` are handed unchanged to ``_take_minmax``, which
    does the per-axis selection with ``min``.
    """
    return self._take_minmax(vals, yvals, cmp=min)
|
||||
|
||||
def flip_axis(self) -> Resolution[T]:
|
||||
"""Return a new Resolution object with x and y swapped."""
|
||||
@@ -95,11 +125,11 @@ class Resolution(Generic[T]):
|
||||
|
||||
def __str__(self):
|
||||
"""Return a string representation of the resolution."""
|
||||
return f"{self.x:f}x{self.y:f}"
|
||||
return f"{self.x:f}×{self.y:f}"
|
||||
|
||||
def __repr__(self): # pragma: no cover
|
||||
"""Return a repr() of the resolution."""
|
||||
return f"Resolution({self.x}x{self.y} dpi)"
|
||||
return f"Resolution({self.x!r}, {self.y!r})"
|
||||
|
||||
def __eq__(self, other):
|
||||
"""Return True if the resolution is equal to another resolution."""
|
||||
|
||||
@@ -420,12 +420,12 @@ class ImageInfo:
|
||||
return self._type
|
||||
|
||||
@property
|
||||
def width(self):
|
||||
def width(self) -> int:
|
||||
"""Width of the image in pixels."""
|
||||
return self._width
|
||||
|
||||
@property
|
||||
def height(self):
|
||||
def height(self) -> int:
|
||||
"""Height of the image in pixels."""
|
||||
return self._height
|
||||
|
||||
@@ -458,17 +458,24 @@ class ImageInfo:
|
||||
return self.dpi.is_finite and self.width >= 0 and self.height >= 0
|
||||
|
||||
@property
|
||||
def dpi(self):
|
||||
def dpi(self) -> Resolution:
|
||||
"""Dots per inch of the image.
|
||||
|
||||
Calculated based on where and how the image is drawn in the PDF.
|
||||
"""
|
||||
return _get_dpi(self._shorthand, (self._width, self._height))
|
||||
|
||||
@property
def printed_area(self) -> float:
    """Physical area of the image in square inches.

    Returns 0.0 when the image is not renderable, or when either DPI
    component is not positive, since no physical size can be derived then.
    """
    if not self.renderable:
        return 0.0
    dpi = self.dpi
    if dpi.x <= 0 or dpi.y <= 0:
        return 0.0
    # inches = pixels / (pixels per inch) on each axis. Multiplying pixels
    # by DPI would not give an area in square inches.
    return float((self.width / dpi.x) * (self.height / dpi.y))
|
||||
|
||||
def __repr__(self):
|
||||
"""Return a string representation of the image."""
|
||||
return (
|
||||
f"<ImageInfo '{self.name}' {self.type_} {self.width}x{self.height} "
|
||||
f"<ImageInfo '{self.name}' {self.type_} {self.width}×{self.height} "
|
||||
f"{self.color} {self.comp} {self.bpc} {self.enc} {self.dpi}>"
|
||||
)
|
||||
|
||||
@@ -747,12 +754,38 @@ def _pdf_pageinfo_concurrent(
|
||||
return pages
|
||||
|
||||
|
||||
class PageResolutionProfile(NamedTuple):
    """Information about the resolutions of a page."""

    weighted_dpi: float
    """The weighted average DPI of the page, weighted by the area of each image."""

    max_dpi: float
    """The maximum DPI of an image on the page."""

    average_to_max_dpi_ratio: float
    """The average DPI of the page divided by the maximum DPI of the page.

    This indicates the intensity of the resolution variation on the page.

    If the ratio is 1.0 or close to 1.0, the page has all of its content at a
    uniform resolution. If the ratio is much lower than 1.0, some content is
    at a higher resolution than the rest of the page.
    """

    area_ratio: float
    """The area of the maximum-DPI image divided by the total drawn area.

    This indicates the prevalence of high-resolution content on the page.
    """
|
||||
|
||||
|
||||
class PageInfo:
|
||||
"""Information about type of contents on each page in a PDF."""
|
||||
|
||||
_has_text: bool | None
|
||||
_has_vector: bool | None
|
||||
_images: list[ImageInfo]
|
||||
_images: list[ImageInfo] = []
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@@ -939,6 +972,44 @@ class PageInfo:
|
||||
else:
|
||||
return '1.5'
|
||||
|
||||
def page_dpi_profile(self) -> PageResolutionProfile | None:
    """Return information about the DPIs of the page.

    This is useful to detect pages with a small proportion of high-resolution
    content that is forcing us to use a high DPI for the whole page. The ratio
    is weighted by the area of each image. If images overlap, the overlapped
    area counts.

    Vector graphics and text are ignored. Images that are not renderable, or
    whose DPI is not positive on both axes, are ignored as well.

    Returns None if there is no meaningful DPI for the page.
    """
    # Collect (scalar DPI, printed area) in a single pass so that the two
    # series are guaranteed to stay index-aligned.
    measured = []
    for image in self._images:
        if not image.renderable:
            continue
        dpi = image.dpi
        # A zero DPI would divide by zero in the harmonic mean below.
        # NOTE(review): presumably a zero-sized image yields a zero DPI --
        # confirm.
        if dpi.x <= 0 or dpi.y <= 0:
            continue
        measured.append((dpi.to_scalar(), image.printed_area))

    image_dpis = [dpi for dpi, _ in measured]
    image_areas = [area for _, area in measured]
    total_drawn_area = sum(image_areas)
    if total_drawn_area == 0:
        return None

    weights = [area / total_drawn_area for area in image_areas]
    # Calculate harmonic mean of DPIs weighted by area
    # When the minimum version is Python 3.10, change this to
    # statistics.harmonic_mean with the weights parameter
    # rather than doing it manually.
    weighted_dpi = sum(weights) / sum(
        weight / dpi for weight, dpi in zip(weights, image_dpis)
    )
    max_dpi = max(image_dpis)
    dpi_average_max_ratio = weighted_dpi / max_dpi

    # Area share of the first image that has the maximum DPI.
    arg_max_dpi = image_dpis.index(max_dpi)
    max_area_ratio = image_areas[arg_max_dpi] / total_drawn_area

    return PageResolutionProfile(
        weighted_dpi, max_dpi, dpi_average_max_ratio, max_area_ratio
    )
|
||||
|
||||
def __repr__(self):
|
||||
"""Return string representation."""
|
||||
return (
|
||||
|
||||
@@ -351,7 +351,6 @@ def filter_pdf_page(page: PageContext, image_filename: Path, output_pdf: Path) -
|
||||
This hook will be called from child processes. Modifying global state
|
||||
will not affect the main process or other child processes.
|
||||
|
||||
|
||||
Note:
|
||||
This is a :ref:`firstresult hook<firstresult>`.
|
||||
"""
|
||||
@@ -466,7 +465,7 @@ def generate_pdfa(
|
||||
pdf_pages: list[Path],
|
||||
pdfmark: Path,
|
||||
output_file: Path,
|
||||
compression: str,
|
||||
context: PdfContext,
|
||||
pdf_version: str,
|
||||
pdfa_part: str,
|
||||
progressbar_class,
|
||||
@@ -484,11 +483,7 @@ def generate_pdfa(
|
||||
pdfmark: A PostScript file intended for Ghostscript with details on
|
||||
how to perform the PDF/A conversion.
|
||||
output_file: The name of the desired output file.
|
||||
compression: One of ``'jpeg'``, ``'lossless'``, ``''``. For ``'jpeg'``,
|
||||
the PDF/A generator should convert all images to JPEG encoding where
|
||||
possible. For lossless, all images should be converted to FlateEncode
|
||||
(lossless PNG). If an empty string, the PDF generator should make its
|
||||
own decisions about how to encode images.
|
||||
context: The current context.
|
||||
pdf_version: The minimum PDF version that the output file should be.
|
||||
At its own discretion, the PDF/A generator may raise the version,
|
||||
but should not lower it.
|
||||
@@ -514,6 +509,11 @@ def generate_pdfa(
|
||||
Note:
|
||||
This is a :ref:`firstresult hook<firstresult>`.
|
||||
|
||||
Note:
|
||||
Before version 15.0.0, the ``context`` was not provided and ``compression``
|
||||
was provided instead. Plugins should now read the context object to determine
|
||||
if compression is requested.
|
||||
|
||||
See Also:
|
||||
https://github.com/tqdm/tqdm
|
||||
"""
|
||||
|
||||
@@ -20,14 +20,14 @@ def run_append_stderr(*args, **kwargs):
|
||||
|
||||
|
||||
@hookimpl
|
||||
def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part):
|
||||
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
|
||||
with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:
|
||||
mock.side_effect = run_append_stderr
|
||||
ghostscript.generate_pdfa(
|
||||
pdf_pages=pdf_pages,
|
||||
pdfmark=pdfmark,
|
||||
output_file=output_file,
|
||||
compression=compression,
|
||||
context=context,
|
||||
pdf_version=pdf_version,
|
||||
pdfa_part=pdfa_part,
|
||||
progressbar_class=None,
|
||||
|
||||
@@ -22,14 +22,14 @@ def run_rig_args(args, **kwargs):
|
||||
|
||||
|
||||
@hookimpl
|
||||
def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part):
|
||||
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
|
||||
with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:
|
||||
mock.side_effect = run_rig_args
|
||||
ghostscript.generate_pdfa(
|
||||
pdf_pages=pdf_pages,
|
||||
pdfmark=pdfmark,
|
||||
output_file=output_file,
|
||||
compression=compression,
|
||||
context=context,
|
||||
pdf_version=pdf_version,
|
||||
pdfa_part=pdfa_part,
|
||||
progressbar_class=None,
|
||||
|
||||
@@ -17,14 +17,14 @@ def raise_gs_fail(*args, **kwargs):
|
||||
|
||||
|
||||
@hookimpl
|
||||
def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part):
|
||||
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
|
||||
with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:
|
||||
mock.side_effect = raise_gs_fail
|
||||
ghostscript.generate_pdfa(
|
||||
pdf_pages=pdf_pages,
|
||||
pdfmark=pdfmark,
|
||||
output_file=output_file,
|
||||
compression=compression,
|
||||
context=context,
|
||||
pdf_version=pdf_version,
|
||||
pdfa_part=pdfa_part,
|
||||
progressbar_class=None,
|
||||
|
||||
@@ -23,7 +23,7 @@ def generate_pdfa(
|
||||
pdf_pages,
|
||||
pdfmark,
|
||||
output_file,
|
||||
compression,
|
||||
context,
|
||||
pdf_version,
|
||||
pdfa_part,
|
||||
stop_on_soft_error,
|
||||
@@ -34,7 +34,7 @@ def generate_pdfa(
|
||||
pdf_pages=pdf_pages,
|
||||
pdfmark=pdfmark,
|
||||
output_file=output_file,
|
||||
compression=compression,
|
||||
context=context,
|
||||
pdf_version=pdf_version,
|
||||
pdfa_part=pdfa_part,
|
||||
progressbar_class=None,
|
||||
|
||||
@@ -86,7 +86,8 @@ def test_unset_metadata(output_type, field, resources, outpdf):
|
||||
'title': b'NFY5f7Ft2DWMkxLhXwxvFf7eWR2KeK3vEDcd',
|
||||
'author': b'yXaryipxyRk9dVjWjSSaVaNCKeLRgEVzPRMp',
|
||||
'subject': b't49vimctvnuH7ZeAjAkv52ACvWFjcnm5MPJr',
|
||||
'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh'}
|
||||
'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh',
|
||||
}
|
||||
|
||||
p = run_ocrmypdf(
|
||||
input_file,
|
||||
@@ -352,17 +353,24 @@ XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
|
||||
|
||||
def test_prevent_gs_invalid_xml(resources, outdir):
|
||||
generate_pdfa_ps(outdir / 'pdfa.ps')
|
||||
copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')
|
||||
|
||||
# Inject a string with a trailing nul character into the DocumentInfo
|
||||
# dictionary of this PDF, as often occurs in practice.
|
||||
with pikepdf.open(outdir / 'layers.rendered.pdf') as pike:
|
||||
with pikepdf.open(resources / 'trivial.pdf') as pike:
|
||||
pike.Root.DocumentInfo = pikepdf.Dictionary(
|
||||
Title=b'String with trailing nul\x00'
|
||||
)
|
||||
pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)
|
||||
|
||||
options = get_parser().parse_args(
|
||||
args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
|
||||
_, options, _ = get_parser_options_plugins(
|
||||
args=[
|
||||
'-j',
|
||||
'1',
|
||||
'--output-type',
|
||||
'pdfa-2',
|
||||
'a.pdf',
|
||||
'b.pdf',
|
||||
]
|
||||
)
|
||||
pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
|
||||
context = PdfContext(
|
||||
@@ -387,17 +395,15 @@ def test_prevent_gs_invalid_xml(resources, outdir):
|
||||
|
||||
def test_malformed_docinfo(caplog, resources, outdir):
|
||||
generate_pdfa_ps(outdir / 'pdfa.ps')
|
||||
# copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')
|
||||
|
||||
with pikepdf.open(resources / 'trivial.pdf') as pike:
|
||||
pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
|
||||
pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)
|
||||
|
||||
options = get_parser().parse_args(
|
||||
_, options, _ = get_parser_options_plugins(
|
||||
args=[
|
||||
'-j',
|
||||
'1',
|
||||
'--continue-on-soft-render-error',
|
||||
'--output-type',
|
||||
'pdfa-2',
|
||||
'a.pdf',
|
||||
|
||||
Reference in New Issue
Block a user