Files
OCRmyPDF/ocrmypdf/exec/ghostscript.py

169 lines
5.7 KiB
Python

#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83
from tempfile import NamedTemporaryFile
from subprocess import run, PIPE, STDOUT, CalledProcessError
from shutil import copy
from functools import lru_cache
import re
import sys
from . import get_program
from ..exceptions import SubprocessOutputError
from PIL import Image
from ..helpers import fspath
@lru_cache(maxsize=1)
def version():
    """
    Return the version string reported by the Ghostscript executable.

    Runs ``gs --version`` and returns its combined stdout/stderr output
    with surrounding whitespace stripped. The function is wrapped in
    ``lru_cache`` so a successful result is reused on later calls.

    :return: Ghostscript version string
    :raises MissingDependencyError: if the Ghostscript executable cannot be
        launched or exits with a nonzero status
    """
    args_gs = [
        get_program('gs'),
        '--version'
    ]
    try:
        # run(..., check=True) is used because check_output is not among
        # the names this module imports from subprocess.
        completed = run(
            args_gs, close_fds=True, universal_newlines=True,
            stdout=PIPE, stderr=STDOUT, check=True)
    except (CalledProcessError, FileNotFoundError) as e:
        # A missing executable surfaces as FileNotFoundError, not
        # CalledProcessError, so both must be handled here.
        # NOTE(review): MissingDependencyError is assumed to live in
        # ..exceptions next to SubprocessOutputError - confirm. It is
        # imported locally because the module-level import omits it.
        from ..exceptions import MissingDependencyError

        print("Could not find Ghostscript executable on system PATH.",
              file=sys.stderr)
        raise MissingDependencyError from e
    return completed.stdout.strip()
def _gs_error_reported(stream):
return re.search(r'error', stream, flags=re.IGNORECASE)
def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
                  pageno=1, page_dpi=None):
    """
    Render a single page of a PDF to an image file using Ghostscript.

    Ghostscript is invoked with (xres, yres) rounded to integers. If the
    rendered image's pixel size differs from the size implied by the
    unrounded resolution, or if page_dpi differs from (xres, yres), the
    image is resized and saved with page_dpi as its DPI; otherwise the
    Ghostscript output is copied to output_file unchanged.

    :param input_file: pathlike, the PDF to rasterize
    :param output_file: pathlike, where the resulting image is written
    :param xres: horizontal resolution at which to rasterize the page
    :param yres: vertical resolution at which to rasterize the page
    :param raster_device: value passed to Ghostscript's -sDEVICE option
    :param log: logger that receives Ghostscript's output
    :param pageno: page number to rasterize
    :param page_dpi: resolution tuple (x, y) overriding output image DPI;
        (xres, yres) is used when this is not given
    :return: None
    :raises SubprocessOutputError: if Ghostscript exits with nonzero status
    """
    requested_res = xres, yres
    gs_res = round(xres), round(yres)
    page_dpi = page_dpi if page_dpi else requested_res

    with NamedTemporaryFile(delete=True) as raster_tmp:
        cmd = [get_program('gs'), '-dQUIET', '-dSAFER', '-dBATCH', '-dNOPAUSE']
        cmd += [
            '-sDEVICE=%s' % raster_device,
            '-dFirstPage=%i' % pageno,
            '-dLastPage=%i' % pageno,
        ]
        cmd += ['-o', raster_tmp.name]
        cmd.append('-r{0}x{1}'.format(*map(str, gs_res)))
        cmd.append(fspath(input_file))

        proc = run(cmd, stdout=PIPE, stderr=STDOUT, universal_newlines=True)
        gs_output = proc.stdout

        # Output mentioning an error is escalated; everything else is debug
        emit = log.error if _gs_error_reported(gs_output) else log.debug
        emit(gs_output)

        if proc.returncode != 0:
            log.error('Ghostscript rasterizing failed')
            raise SubprocessOutputError()

        # Ghostscript only takes integer resolutions. When the requested
        # resolution is fractional the rendered page comes out at a slightly
        # different pixel size (most noticeable at low DPI), so the image
        # is scaled back to the size the true resolution implies.
        raster_tmp.seek(0)
        with Image.open(raster_tmp) as im:
            actual_size = im.size
            expected_size = tuple(
                round(pixels / used * wanted)
                for pixels, used, wanted
                in zip(actual_size, gs_res, requested_res))

            size_matches = expected_size == actual_size
            dpi_matches = page_dpi == requested_res
            if size_matches and dpi_matches:
                # Nothing to correct: hand over Ghostscript's file as is
                copy(raster_tmp.name, fspath(output_file))
                return

            log.debug(
                "Ghostscript: resize output image {} -> {}".format(
                    actual_size, expected_size))
            resized = im.resize(expected_size)
            resized.save(fspath(output_file), dpi=page_dpi)
def generate_pdfa(pdf_pages, output_file, compression, log,
                  threads=1, pdf_version='1.5'):
    """
    Combine input files into one PDF using Ghostscript's pdfwrite device
    with PDF/A options enabled.

    Ghostscript writes to a temporary file, which is copied to output_file
    only if Ghostscript exits with status 0. A zero exit status does not
    guarantee the result is valid PDF/A; that must be verified separately.

    :param pdf_pages: iterable of pathlike input files, passed to
        Ghostscript as positional arguments in the given order
    :param output_file: pathlike, destination of the generated PDF
    :param compression: 'jpeg' forces DCTEncode for color and gray images,
        'lossless' forces FlateEncode; any other value lets Ghostscript
        pick the image filter automatically
    :param log: logger that receives Ghostscript's output
    :param threads: value for -dNumRenderingThreads
    :param pdf_version: value for -dCompatibilityLevel
    :return: None
    :raises SubprocessOutputError: if Ghostscript exits with nonzero status
    """
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    with NamedTemporaryFile(delete=True) as gs_pdf:
        args_gs = [
            get_program("gs"),
            "-dQUIET",
            "-dBATCH",
            "-dNOPAUSE",
            "-dCompatibilityLevel=" + str(pdf_version),
            "-dNumRenderingThreads=" + str(threads),
            "-sDEVICE=pdfwrite",
            "-dAutoRotatePages=/None",
            "-sColorConversionStrategy=/RGB",
            "-sProcessColorModel=DeviceRGB"
        ] + compression_args + [
            "-dJPEGQ=95",
            "-dPDFA=2",
            "-dPDFACompatibilityPolicy=1",
            "-sOutputFile=" + gs_pdf.name,
        ]
        # Convert pathlike inputs with fspath, as rasterize_pdf does for
        # its own path arguments, so both entry points accept the same
        # kinds of paths.
        args_gs.extend(fspath(page) for page in pdf_pages)

        p = run(args_gs, stdout=PIPE, stderr=STDOUT,
                universal_newlines=True)

        if _gs_error_reported(p.stdout):
            log.error(p.stdout)
        elif 'overprint mode not set' in p.stdout:
            # Unless someone is going to print PDF/A documents on a
            # magical sRGB printer I can't see the removal of overprinting
            # being a problem....
            log.debug(
                "Ghostscript had to remove PDF 'overprinting' from the "
                "input file to complete PDF/A conversion. "
            )
        else:
            log.debug(p.stdout)

        if p.returncode == 0:
            # Ghostscript does not change return code when it fails to
            # create PDF/A - check PDF/A status elsewhere.
            # The copy must happen while the temporary file still exists,
            # i.e. inside this with block.
            copy(gs_pdf.name, fspath(output_file))
        else:
            log.error('Ghostscript PDF/A rendering failed')
            raise SubprocessOutputError()