Merge branch 'feature/fix-raster-dpi-too-high' into v15

This commit is contained in:
James R. Barlow
2023-09-21 00:05:56 -07:00
17 changed files with 366 additions and 121 deletions

View File

@@ -114,6 +114,41 @@ exceed a certain number of megapixels with ``--skip-big``. (A 300 DPI,
# Allow 300 seconds for OCR; skip any page larger than 50 megapixels
ocrmypdf --tesseract-timeout 300 --skip-big 50 bigfile.pdf output.pdf
OCR for huge images
-------------------
Separate from these settings, Tesseract has internal limits on the size
of images it will process. If you issue
``--tesseract-downsample-large-images``, OCRmyPDF will downsample images
to fit Tesseract limits. (The limits are usually encountered only for scanned
images of oversized media, such as large maps or blueprints exceeding
110 cm or 43 inches in either dimension, and at high DPI.)
``--tesseract-downsample-above`` adjusts the threshold at which images
will be downsampled. By default, only images that exceed any of Tesseract's
internal limits are downsampled.
You will also need to set ``--tesseract-timeout`` high enough to allow
for processing.
Only the image sent for OCR is downsampled. The original image is
preserved.
.. code-block:: bash
# Allow 600 seconds for OCR on huge images
ocrmypdf --tesseract-timeout 600 \
--tesseract-downsample-large-images \
bigfile.pdf output.pdf
# Downsample images above 5000 pixels on the longest dimension to
# 5000 pixels
ocrmypdf --tesseract-timeout 120 \
--tesseract-downsample-large-images \
--tesseract-downsample-above 5000 \
bigfile.pdf output_downsampled_ocr.pdf
Overriding default tesseract
----------------------------

View File

@@ -6,53 +6,55 @@ set -o errexit
__ocrmypdf_arguments()
{
local arguments="--help (show help message)
--language (language(s) of the file to be OCRed)
--image-dpi (assume this DPI if input image DPI is unknown)
--output-type (select PDF output options)
--sidecar (write OCR to text file)
--version (print program version and exit)
--jobs (how many worker processes to use)
--quiet (suppress INFO messages)
--verbose (set verbosity level)
--title (set metadata)
--author (set metadata)
--subject (set metadata)
--keywords (set metadata)
--rotate-pages (rotate pages to correct orientation)
--remove-background (attempt to remove background from pages)
--deskew (fix small horizontal alignment skew)
--clean (clean document images before OCR)
--clean-final (clean document images and keep result)
--unpaper-args (a quoted string of arguments to pass to unpaper)
--oversample (oversample images to this DPI)
--remove-vectors (don\'t send vector objects to OCR)
--threshold (threshold images before OCR)
--force-ocr (OCR documents that already have printable text)
--skip-text (skip OCR on any pages that already contain text)
--redo-ocr (redo OCR on any pages that seem to have OCR already)
local arguments="\
--help (show help message)
--language (language(s) of the file to be OCRed)
--image-dpi (assume this DPI if input image DPI is unknown)
--output-type (select PDF output options)
--sidecar (write OCR to text file)
--version (print program version and exit)
--jobs (how many worker processes to use)
--quiet (suppress INFO messages)
--verbose (set verbosity level)
--title (set metadata)
--author (set metadata)
--subject (set metadata)
--keywords (set metadata)
--rotate-pages (rotate pages to correct orientation)
--remove-background (attempt to remove background from pages)
--deskew (fix small horizontal alignment skew)
--clean (clean document images before OCR)
--clean-final (clean document images and keep result)
--unpaper-args (a quoted string of arguments to pass to unpaper)
--oversample (oversample images to this DPI)
--remove-vectors (don\'t send vector objects to OCR)
--threshold (threshold images before OCR)
--force-ocr (OCR documents that already have printable text)
--skip-text (skip OCR on any pages that already contain text)
--redo-ocr (redo OCR on any pages that seem to have OCR already)
--invalidate-digital-signatures (remove digital signatures from PDF)
--skip-big (skip OCR on pages larger than this many MPixels)
--optimize (select optimization level)
--jpeg-quality (JPEG quality [0..100])
--png-quality (PNG quality [0..100])
--jbig2-lossy (enable lossy JBIG2 (see docs))
--pages (apply OCR to only the specified pages)
--max-image-mpixels (image decompression bomb threshold)
--pdf-renderer (select PDF renderer options)
--rotate-pages-threshold (page rotation confidence)
--pdfa-image-compression (set PDF/A image compression options)
--fast-web-view (if file size if above this amount in MB linearize PDF)
--plugin (name of plugin to import)
--keep-temporary-files (keep temporary files (debug)
--tesseract-config (set custom tesseract config file)
--tesseract-pagesegmode (set tesseract --psm)
--tesseract-oem (set tesseract --oem)
--tesseract-thresholding (set tesseract image thresholding)
--tesseract-timeout (maximum number of seconds to wait for OCR)
--user-words (specify location of user words file)
--user-patterns (specify location of user patterns file)
--no-progress-bar (disable the progress bar)
--skip-big (skip OCR on pages larger than this many MPixels)
--optimize (select optimization level)
--jpeg-quality (JPEG quality [0..100])
--png-quality (PNG quality [0..100])
--jbig2-lossy (enable lossy JBIG2 (see docs))
--pages (apply OCR to only the specified pages)
--max-image-mpixels (image decompression bomb threshold)
--pdf-renderer (select PDF renderer options)
--rotate-pages-threshold (page rotation confidence)
--pdfa-image-compression (set PDF/A image compression options)
--fast-web-view (if file size if above this amount in MB linearize PDF)
--plugin (name of plugin to import)
--keep-temporary-files (keep temporary files (debug)
--tesseract-config (set custom tesseract config file)
--tesseract-pagesegmode (set tesseract --psm)
--tesseract-oem (set tesseract --oem)
--tesseract-thresholding (set tesseract image thresholding)
--tesseract-timeout (maximum number of seconds to wait for OCR)
--user-words (specify location of user words file)
--user-patterns (specify location of user patterns file)
--no-progress-bar (disable the progress bar)
--color-conversion-strategy (select color conversion strategy)
"
COMPREPLY=( $( compgen -W "$arguments" -- "$cur") )
@@ -192,6 +194,20 @@ sauvola (use Sauvola thresholding)"
fi
}
# Complete the value of --color-conversion-strategy.
# Matches the partial word in $cur (not set here; supplied by the caller)
# against the known strategies and stores the candidates in COMPREPLY.
__ocrmypdf_color-conversion-strategy()
{
    local choices
    choices="LeaveColorUnchanged (default)
CMYK (convert to CMYK)
Gray (convert to grayscale)
RGB (convert to RGB)
UseDeviceIndependentColor (convert with device independent color)"

    COMPREPLY=( $(compgen -W "$choices" -- "$cur") )

    # With exactly one candidate left, keep only the text before the first
    # space so the description is dropped from the completion.
    if (( ${#COMPREPLY[@]} == 1 )); then
        COMPREPLY=( ${COMPREPLY[0]%% *} )
    fi
}
__ocrmypdf_check_previous()
{
@@ -251,6 +267,10 @@ __ocrmypdf_check_previous()
_filedir
return 0
;;
--color-conversion-strategy)
__ocrmypdf_color-conversion-strategy
return 0
;;
esac
return 1

View File

@@ -129,4 +129,27 @@ complete -c ocrmypdf -r -l user-words -d "specify location of user words file"
complete -c ocrmypdf -r -l user-patterns -d "specify location of user patterns file"
complete -c ocrmypdf -x -l fast-web-view -d "if file size if above this amount in MB, linearize PDF"
complete -c ocrmypdf -x -a "(__fish_complete_suffix .pdf; __fish_complete_suffix .PDF; __fish_complete_suffix .jpg; __fish_complete_suffix .png)"
# Print the values offered for --color-conversion-strategy, one per line.
# Each line is "<value><TAB><description>"; the description is passed
# through `_` before printing.
function __fish_ocrmypdf_color_conversion_strategy
    printf '%s\t%s\n' LeaveColorUnchanged (_ "do not convert color spaces (default)")
    printf '%s\t%s\n' CMYK (_ "convert all color spaces to CMYK")
    printf '%s\t%s\n' Gray (_ "convert all color spaces to grayscale")
    printf '%s\t%s\n' RGB (_ "convert all color spaces to RGB")
    printf '%s\t%s\n' UseDeviceIndependentColor (_ "convert all color spaces to ICC-based color spaces")
end
complete -c ocrmypdf -x -l color-conversion-strategy -a '(__fish_ocrmypdf_color_conversion_strategy)' -d "set color conversion strategy"
# Return 0 when one of the tokens reported by `commandline -opc` is a
# non-option word naming an existing regular file; return 1 otherwise.
function __fish_ocrmypdf_input_file_given
    for word in (commandline -opc)
        # Words starting with a dash are options, never the input file.
        string match -q -r '^-' -- $word; and continue
        test -f "$word"; and return 0
    end
    return 1
end
complete -c ocrmypdf -x -n 'not __fish_ocrmypdf_input_file_given' -a "(__fish_complete_suffix .pdf)" -d "input file"

View File

@@ -28,6 +28,17 @@ except AttributeError:
# Pillow 9 shim
Transpose = Image # type: ignore
# Strategy names accepted for the --color-conversion-strategy option and
# passed to Ghostscript as -sColorConversionStrategy=<name>.
COLOR_CONVERSION_STRATEGIES = frozenset(
    {'CMYK', 'Gray', 'LeaveColorUnchanged', 'RGB', 'UseDeviceIndependentColor'}
)
log = logging.getLogger(__name__)
@@ -151,6 +162,7 @@ def generate_pdfa(
output_file: os.PathLike,
*,
compression: str,
color_conversion_strategy: str,
pdf_version: str = '1.5',
pdfa_part: str = '2',
progressbar_class=None,
@@ -200,16 +212,16 @@ def generate_pdfa(
"-dBATCH",
"-dNOPAUSE",
"-dSAFER",
"-dCompatibilityLevel=" + str(pdf_version),
f"-dCompatibilityLevel={str(pdf_version)}",
"-sDEVICE=pdfwrite",
"-dAutoRotatePages=/None",
"-sColorConversionStrategy=" + strategy,
f"-sColorConversionStrategy={color_conversion_strategy}",
]
+ (['-dPDFSTOPONERROR'] if stop_on_error else [])
+ compression_args
+ [
"-dJPEGQ=95",
"-dPDFA=" + pdfa_part,
f"-dPDFA={pdfa_part}",
"-dPDFACompatibilityPolicy=1",
"-o",
"-",

View File

@@ -359,8 +359,12 @@ def is_ocr_required(page_context: PageContext) -> bool:
def rasterize_preview(input_file: Path, page_context: PageContext) -> Path:
"""Generate a lower quality preview image."""
output_file = page_context.get_path('rasterize_preview.jpg')
canvas_dpi = get_canvas_square_dpi(page_context.pageinfo, page_context.options)
page_dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
canvas_dpi = Resolution(300.0, 300.0).take_min(
[get_canvas_square_dpi(page_context.pageinfo, page_context.options)]
)
page_dpi = Resolution(300.0, 300.0).take_min(
[get_page_square_dpi(page_context.pageinfo, page_context.options)]
)
page_context.plugin_manager.hook.rasterize_pdf_page(
input_file=input_file,
output_file=output_file,
@@ -490,6 +494,21 @@ def rasterize(
canvas_dpi = get_canvas_square_dpi(pageinfo, page_context.options)
page_dpi = get_page_square_dpi(pageinfo, page_context.options)
dpi_profile = pageinfo.page_dpi_profile()
if dpi_profile and dpi_profile.average_to_max_dpi_ratio < 0.8:
log.warning(
"Weight average DPI is %0.1f, max DPI is %0.1f. "
"The discrepancy may indicate a high detail region on this page, "
"but could also indicate a problem with the input PDF file. "
"An image will be rendered at %0.1f DPI.",
dpi_profile.weighted_dpi,
dpi_profile.max_dpi,
dpi_profile.weighted_dpi,
)
canvas_dpi = page_dpi = Resolution(
dpi_profile.weighted_dpi, dpi_profile.weighted_dpi
)
page_context.plugin_manager.hook.rasterize_pdf_page(
input_file=input_file,
output_file=output_file,
@@ -792,7 +811,7 @@ def convert_to_pdfa(input_pdf: Path, input_ps_stub: Path, context: PdfContext) -
pdf_pages=[fix_docinfo_file],
pdfmark=input_ps_stub,
output_file=output_file,
compression=options.pdfa_image_compression,
context=context,
pdfa_part=options.output_type[-1], # is pdfa-1, pdfa-2, or pdfa-3
progressbar_class=(
context.plugin_manager.hook.get_progressbar_class()

View File

@@ -205,16 +205,6 @@ def check_options_ocr_behavior(options: Namespace) -> None:
options.pages = _pages_from_ranges(options.pages)
def check_options_advanced(options: Namespace) -> None:
if options.pdfa_image_compression != 'auto' and not options.output_type.startswith(
'pdfa'
):
log.warning(
"--pdfa-image-compression argument only applies when "
"--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
)
def check_options_metadata(options: Namespace) -> None:
docinfo = [options.title, options.author, options.keywords, options.subject]
for s in (m for m in docinfo if m):
@@ -241,7 +231,6 @@ def _check_plugin_invariant_options(options: Namespace) -> None:
check_options_sidecar(options)
check_options_preprocessing(options)
check_options_ocr_behavior(options)
check_options_advanced(options)
check_options_pillow(options)

View File

@@ -18,6 +18,33 @@ log = logging.getLogger(__name__)
BLACKLISTED_GS_VERSIONS = frozenset()
@hookimpl
def add_options(parser):
    """Register the Ghostscript-related command line options.

    Adds a "Ghostscript" argument group to ``parser`` containing
    ``--color-conversion-strategy`` and ``--pdfa-image-compression``.
    """
    group = parser.add_argument_group(
        "Ghostscript", "Advanced control of Ghostscript"
    )

    # Only strategy names known to the ghostscript module are accepted.
    group.add_argument(
        '--color-conversion-strategy',
        action='store',
        type=str,
        metavar='STRATEGY',
        choices=ghostscript.COLOR_CONVERSION_STRATEGIES,
        default='LeaveColorUnchanged',
        help="Set Ghostscript color conversion strategy",
    )

    compression_help = (
        "Specify how to compress images in the output PDF/A. 'auto' lets "
        "OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
        "JPEG compression. 'lossless' uses PNG-style lossless compression "
        "for all images. Monochrome images are always compressed using a "
        "lossless codec. Compression settings "
        "are applied to all pages, including those for which OCR was "
        "skipped. Not supported for --output-type=pdf ; that setting "
        "preserves the original compression of all images."
    )
    group.add_argument(
        '--pdfa-image-compression',
        choices=['auto', 'jpeg', 'lossless'],
        default='auto',
        help=compression_help,
    )
@hookimpl
def check_options(options):
"""Check that the options are valid for this plugin."""
@@ -37,6 +64,17 @@ def check_options(options):
if options.output_type == 'pdfa':
options.output_type = 'pdfa-2'
if options.color_conversion_strategy not in ghostscript.COLOR_CONVERSION_STRATEGIES:
raise ValueError(
f"Invalid color conversion strategy: {options.color_conversion_strategy}"
)
if options.pdfa_image_compression != 'auto' and not options.output_type.startswith(
'pdfa'
):
log.warning(
"--pdfa-image-compression argument only applies when "
"--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
)
@hookimpl
@@ -71,7 +109,7 @@ def generate_pdfa(
pdf_pages,
pdfmark,
output_file,
compression,
context,
pdf_version,
pdfa_part,
progressbar_class,
@@ -81,7 +119,8 @@ def generate_pdfa(
ghostscript.generate_pdfa(
pdf_pages=[*pdf_pages, pdfmark],
output_file=output_file,
compression=compression,
compression=context.options.pdfa_image_compression,
color_conversion_strategy=context.options.color_conversion_strategy,
pdf_version=pdf_version,
pdfa_part=pdfa_part,
progressbar_class=progressbar_class,

View File

@@ -30,7 +30,7 @@ def add_options(parser):
action='append',
metavar='CFG',
default=[],
help="Additional Tesseract configuration files -- see documentation",
help="Additional Tesseract configuration files -- see documentation.",
)
tess.add_argument(
'--tesseract-pagesegmode',
@@ -38,7 +38,7 @@ def add_options(parser):
type=int,
metavar='PSM',
choices=range(0, 14),
help="Set Tesseract page segmentation mode (see tesseract --help)",
help="Set Tesseract page segmentation mode (see tesseract --help).",
)
tess.add_argument(
'--tesseract-oem',
@@ -75,7 +75,10 @@ def add_options(parser):
metavar='SECONDS',
help=(
"Give up on OCR after the timeout, but copy the preprocessed page "
"into the final output."
"into the final output. This timeout is only used when using Tesseract "
"for OCR. When Tesseract is used for other operations such as "
"deskewing and orientation, the timeout is controlled by "
"--tesseract-non-ocr-timeout."
),
)
tess.add_argument(
@@ -175,6 +178,15 @@ def validate(pdfinfo, options):
tess_threads = int(os.environ['OMP_THREAD_LIMIT'])
log.debug("Using Tesseract OpenMP thread limit %d", tess_threads)
if (
options.tesseract_downsample_above != 32767
and not options.tesseract_downsample_large_images
):
log.warning(
"The --tesseract-downsample-above argument will have no effect unless "
"--tesseract-downsample-large-images is also given."
)
@hookimpl
def filter_ocr_image(page: PageContext, image: Image.Image) -> Image.Image:

View File

@@ -177,7 +177,9 @@ Online documentation is located at:
'--image-dpi',
metavar='DPI',
type=int,
help="For input image instead of PDF, use this DPI instead of file's.",
help="When the input file is an image, not a PDF, use this DPI instead "
"of the DPI claimed by the input file. If the input does not claim a "
"sensible DPI, this option will be required.",
)
parser.add_argument(
'--output-type',
@@ -402,19 +404,6 @@ Online documentation is located at:
help="Only rotate pages when confidence is above this value (arbitrary "
"units reported by tesseract)",
)
advanced.add_argument(
'--pdfa-image-compression',
choices=['auto', 'jpeg', 'lossless'],
default='auto',
help="Specify how to compress images in the output PDF/A. 'auto' lets "
"OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
"JPEG compression. 'lossless' uses PNG-style lossless compression "
"for all images. Monochrome images are always compressed using a "
"lossless codec. Compression settings "
"are applied to all pages, including those for which OCR was "
"skipped. Not supported for --output-type=pdf ; that setting "
"preserves the original compression of all images.",
)
advanced.add_argument(
'--fast-web-view',
type=numeric(float, 0),

View File

@@ -15,7 +15,16 @@ from contextlib import suppress
from io import StringIO
from math import isclose, isfinite
from pathlib import Path
from typing import Any, Generic, Sequence, SupportsFloat, SupportsRound, TypeVar
from statistics import harmonic_mean
from typing import (
Any,
Callable,
Generic,
Sequence,
SupportsFloat,
SupportsRound,
TypeVar,
)
import img2pdf
import pikepdf
@@ -73,17 +82,38 @@ class Resolution(Generic[T]):
return isfinite(self.x) and isfinite(self.y)
return True
def to_scalar(self) -> float:
    """Collapse the resolution to a single number.

    A Resolution is two-dimensional, but it is typically "square" (x == y)
    and can then be described by one value. When x and y differ, the
    harmonic mean of the two is used as the one-dimensional approximation.

    Returns:
        The harmonic mean of ``x`` and ``y``.
    """
    axes = [self.x, self.y]
    return harmonic_mean(axes)
def _take_minmax(
    self, vals: Iterable[Any], yvals: Iterable[Any] | None, cmp: Callable
) -> Resolution:
    """Combine this resolution with other values, axis by axis, using ``cmp``.

    Shared implementation behind ``take_max`` and ``take_min``; whether the
    result is a maximum or a minimum depends entirely on ``cmp``.

    Args:
        vals: When ``yvals`` is None, an iterable of ``(x, y)`` pairs.
            Otherwise, an iterable of x values only.
        yvals: Optional iterable of y values. When given, x and y are
            reduced independently from ``vals`` and ``yvals``.
        cmp: Reducer applied per axis, e.g. the builtin ``max`` or ``min``.
            It is called both as ``cmp(a, b)`` and as ``cmp(a, *others)``.

    Returns:
        A new Resolution holding the reduced x and y values.
    """
    if yvals is not None:
        # Separate x and y sequences: reduce each axis in a single call.
        return Resolution(cmp(self.x, *vals), cmp(self.y, *yvals))
    # Paired form: fold each (x, y) item into the running result.
    cmp_x, cmp_y = self.x, self.y
    for x, y in vals:
        cmp_x = cmp(x, cmp_x)
        cmp_y = cmp(y, cmp_y)
    return Resolution(cmp_x, cmp_y)
def take_max(
self, vals: Iterable[Any], yvals: Iterable[Any] | None = None
) -> Resolution:
"""Return a new Resolution object with the maximum resolution of inputs."""
if yvals is not None:
return Resolution(max(self.x, *vals), max(self.y, *yvals))
max_x, max_y = self.x, self.y
for x, y in vals:
max_x = max(x, max_x)
max_y = max(y, max_y)
return Resolution(max_x, max_y)
return self._take_minmax(vals, yvals, max)
def take_min(
    self, vals: Iterable[Any], yvals: Iterable[Any] | None = None
) -> Resolution:
    """Return a new Resolution object with the minimum resolution of inputs.

    Delegates to ``_take_minmax`` with the builtin ``min`` as the reducer.

    Args:
        vals: Values to compare against this resolution. When ``yvals`` is
            None these are ``(x, y)`` pairs; otherwise x values only.
        yvals: Optional y values to compare against ``self.y``.
    """
    return self._take_minmax(vals, yvals, min)
def flip_axis(self) -> Resolution[T]:
"""Return a new Resolution object with x and y swapped."""
@@ -95,11 +125,11 @@ class Resolution(Generic[T]):
def __str__(self):
"""Return a string representation of the resolution."""
return f"{self.x:f}x{self.y:f}"
return f"{self.x:f}×{self.y:f}"
def __repr__(self): # pragma: no cover
"""Return a repr() of the resolution."""
return f"Resolution({self.x}x{self.y} dpi)"
return f"Resolution({self.x!r}, {self.y!r})"
def __eq__(self, other):
"""Return True if the resolution is equal to another resolution."""

View File

@@ -420,12 +420,12 @@ class ImageInfo:
return self._type
@property
def width(self):
def width(self) -> int:
"""Width of the image in pixels."""
return self._width
@property
def height(self):
def height(self) -> int:
"""Height of the image in pixels."""
return self._height
@@ -458,17 +458,24 @@ class ImageInfo:
return self.dpi.is_finite and self.width >= 0 and self.height >= 0
@property
def dpi(self):
def dpi(self) -> Resolution:
"""Dots per inch of the image.
Calculated based on where and how the image is drawn in the PDF.
"""
return _get_dpi(self._shorthand, (self._width, self._height))
@property
def printed_area(self) -> float:
    """Physical area of the image in square inches.

    The width and height in pixels are each divided by the DPI along the
    same axis to obtain a length in inches; the area is their product.

    Returns:
        The area in square inches, or 0.0 when the image is not renderable
        or its DPI is zero along either axis.
    """
    if not self.renderable:
        return 0.0
    dpi = self.dpi
    if dpi.x == 0 or dpi.y == 0:
        # No physical size can be derived from a zero DPI; report no area
        # rather than dividing by zero.
        return 0.0
    # pixels / (pixels per inch) = inches, so divide by DPI on each axis.
    return float((self.width / dpi.x) * (self.height / dpi.y))
def __repr__(self):
"""Return a string representation of the image."""
return (
f"<ImageInfo '{self.name}' {self.type_} {self.width}x{self.height} "
f"<ImageInfo '{self.name}' {self.type_} {self.width}×{self.height} "
f"{self.color} {self.comp} {self.bpc} {self.enc} {self.dpi}>"
)
@@ -747,12 +754,38 @@ def _pdf_pageinfo_concurrent(
return pages
class PageResolutionProfile(NamedTuple):
    """Summary of how image resolution varies across a single page."""

    #: The weighted average DPI of the page, weighted by the area of each
    #: image.
    weighted_dpi: float

    #: The maximum DPI of an image on the page.
    max_dpi: float

    #: The average DPI of the page divided by the maximum DPI of the page.
    #:
    #: This indicates the intensity of the resolution variation on the page.
    #: If the ratio is 1.0 or close to 1.0, the page has all of its content
    #: at a uniform resolution. If the ratio is much lower than 1.0, some
    #: content is at a higher resolution than the rest of the page.
    average_to_max_dpi_ratio: float

    #: The maximum-DPI area of the page divided by the total drawn area.
    #:
    #: This indicates the prevalence of high-resolution content on the page.
    area_ratio: float
class PageInfo:
"""Information about type of contents on each page in a PDF."""
_has_text: bool | None
_has_vector: bool | None
_images: list[ImageInfo]
_images: list[ImageInfo] = []
def __init__(
self,
@@ -939,6 +972,44 @@ class PageInfo:
else:
return '1.5'
def page_dpi_profile(self) -> PageResolutionProfile | None:
    """Summarize how image resolution is distributed over the page.

    This helps detect pages where a small proportion of high-resolution
    content would force a high DPI for the whole page. Each image
    contributes in proportion to its printed area; where images overlap,
    the overlapped area is counted for each of them.

    Only renderable images are considered. Vector graphics and text are
    ignored.

    Returns:
        A PageResolutionProfile, or None if there is no meaningful DPI for
        the page (the total drawn image area is zero).
    """
    drawable = [image for image in self._images if image.renderable]
    dpis = [image.dpi.to_scalar() for image in drawable]
    areas = [image.printed_area for image in drawable]

    total_area = sum(areas)
    if total_area == 0:
        return None

    shares = [area / total_area for area in areas]

    # Harmonic mean of the DPIs, weighted by each image's share of the area.
    # When the minimum version is Python 3.10, change this to
    # statistics.harmonic_mean with the weights parameter
    # rather than doing it manually.
    weighted_dpi = sum(shares) / sum(
        share / dpi for share, dpi in zip(shares, dpis)
    )

    max_dpi = max(dpis)
    # Area of the first image that has the maximum DPI.
    max_dpi_area = areas[dpis.index(max_dpi)]

    return PageResolutionProfile(
        weighted_dpi=weighted_dpi,
        max_dpi=max_dpi,
        average_to_max_dpi_ratio=weighted_dpi / max_dpi,
        area_ratio=max_dpi_area / total_area,
    )
def __repr__(self):
"""Return string representation."""
return (

View File

@@ -351,7 +351,6 @@ def filter_pdf_page(page: PageContext, image_filename: Path, output_pdf: Path) -
This hook will be called from child processes. Modifying global state
will not affect the main process or other child processes.
Note:
This is a :ref:`firstresult hook<firstresult>`.
"""
@@ -466,7 +465,7 @@ def generate_pdfa(
pdf_pages: list[Path],
pdfmark: Path,
output_file: Path,
compression: str,
context: PdfContext,
pdf_version: str,
pdfa_part: str,
progressbar_class,
@@ -484,11 +483,7 @@ def generate_pdfa(
pdfmark: A PostScript file intended for Ghostscript with details on
how to perform the PDF/A conversion.
output_file: The name of the desired output file.
compression: One of ``'jpeg'``, ``'lossless'``, ``''``. For ``'jpeg'``,
the PDF/A generator should convert all images to JPEG encoding where
possible. For lossless, all images should be converted to FlateEncode
(lossless PNG). If an empty string, the PDF generator should make its
own decisions about how to encode images.
context: The current context.
pdf_version: The minimum PDF version that the output file should be.
At its own discretion, the PDF/A generator may raise the version,
but should not lower it.
@@ -514,6 +509,11 @@ def generate_pdfa(
Note:
This is a :ref:`firstresult hook<firstresult>`.
Note:
Before version 15.0.0, the ``context`` was not provided and ``compression``
was provided instead. Plugins should now read the context object to determine
if compression is requested.
See Also:
https://github.com/tqdm/tqdm
"""

View File

@@ -20,14 +20,14 @@ def run_append_stderr(*args, **kwargs):
@hookimpl
def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part):
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:
mock.side_effect = run_append_stderr
ghostscript.generate_pdfa(
pdf_pages=pdf_pages,
pdfmark=pdfmark,
output_file=output_file,
compression=compression,
context=context,
pdf_version=pdf_version,
pdfa_part=pdfa_part,
progressbar_class=None,

View File

@@ -22,14 +22,14 @@ def run_rig_args(args, **kwargs):
@hookimpl
def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part):
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:
mock.side_effect = run_rig_args
ghostscript.generate_pdfa(
pdf_pages=pdf_pages,
pdfmark=pdfmark,
output_file=output_file,
compression=compression,
context=context,
pdf_version=pdf_version,
pdfa_part=pdfa_part,
progressbar_class=None,

View File

@@ -17,14 +17,14 @@ def raise_gs_fail(*args, **kwargs):
@hookimpl
def generate_pdfa(pdf_pages, pdfmark, output_file, compression, pdf_version, pdfa_part):
def generate_pdfa(pdf_pages, pdfmark, output_file, context, pdf_version, pdfa_part):
with patch('ocrmypdf._exec.ghostscript.run_polling_stderr') as mock:
mock.side_effect = raise_gs_fail
ghostscript.generate_pdfa(
pdf_pages=pdf_pages,
pdfmark=pdfmark,
output_file=output_file,
compression=compression,
context=context,
pdf_version=pdf_version,
pdfa_part=pdfa_part,
progressbar_class=None,

View File

@@ -23,7 +23,7 @@ def generate_pdfa(
pdf_pages,
pdfmark,
output_file,
compression,
context,
pdf_version,
pdfa_part,
stop_on_soft_error,
@@ -34,7 +34,7 @@ def generate_pdfa(
pdf_pages=pdf_pages,
pdfmark=pdfmark,
output_file=output_file,
compression=compression,
context=context,
pdf_version=pdf_version,
pdfa_part=pdfa_part,
progressbar_class=None,

View File

@@ -86,7 +86,8 @@ def test_unset_metadata(output_type, field, resources, outpdf):
'title': b'NFY5f7Ft2DWMkxLhXwxvFf7eWR2KeK3vEDcd',
'author': b'yXaryipxyRk9dVjWjSSaVaNCKeLRgEVzPRMp',
'subject': b't49vimctvnuH7ZeAjAkv52ACvWFjcnm5MPJr',
'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh'}
'keywords': b's9EeALwUg7urA7fnnhm5EtUyC54sW2WPUzqh',
}
p = run_ocrmypdf(
input_file,
@@ -352,17 +353,24 @@ XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
def test_prevent_gs_invalid_xml(resources, outdir):
generate_pdfa_ps(outdir / 'pdfa.ps')
copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')
# Inject a string with a trailing nul character into the DocumentInfo
# dictionary of this PDF, as often occurs in practice.
with pikepdf.open(outdir / 'layers.rendered.pdf') as pike:
with pikepdf.open(resources / 'trivial.pdf') as pike:
pike.Root.DocumentInfo = pikepdf.Dictionary(
Title=b'String with trailing nul\x00'
)
pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)
options = get_parser().parse_args(
args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
_, options, _ = get_parser_options_plugins(
args=[
'-j',
'1',
'--output-type',
'pdfa-2',
'a.pdf',
'b.pdf',
]
)
pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
context = PdfContext(
@@ -387,17 +395,15 @@ def test_prevent_gs_invalid_xml(resources, outdir):
def test_malformed_docinfo(caplog, resources, outdir):
generate_pdfa_ps(outdir / 'pdfa.ps')
# copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')
with pikepdf.open(resources / 'trivial.pdf') as pike:
pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)
options = get_parser().parse_args(
_, options, _ = get_parser_options_plugins(
args=[
'-j',
'1',
'--continue-on-soft-render-error',
'--output-type',
'pdfa-2',
'a.pdf',