OCRmyPDF/src/ocrmypdf/exec/ghostscript.py

# © 2017 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

from tempfile import NamedTemporaryFile
from subprocess import run, PIPE, STDOUT, CalledProcessError
from shutil import copy
from functools import lru_cache
import re
import sys
from PIL import Image
from . import get_version
from ..exceptions import SubprocessOutputError, MissingDependencyError
from ..helpers import fspath


@lru_cache(maxsize=1)
def version():
    return get_version('gs')


def jpeg_passthrough_available():
    """
    Ghostscript 9.23 introduced JPEG passthrough but it seems to corrupt the
    last two bytes of certain images, for now we disable it for 9.23 and
    do not mention it for < 9.23.

    https://bugs.ghostscript.com/show_bug.cgi?id=699216

    """
    return False


def _gs_error_reported(stream):
    return re.search(r'error', stream, flags=re.IGNORECASE)


def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
                  pageno=1, page_dpi=None):
    """
    Rasterize one page of a PDF at resolution (xres, yres) in canvas units.

    The image is sized to match the integer pixels dimensions implied by
    (xres, yres) even if those numbers are noninteger. The image's DPI will
     be overridden with the values in page_dpi.

    :param input_file: pathlike
    :param output_file: pathlike
    :param xres: resolution at which to rasterize page
    :param yres:
    :param raster_device:
    :param log:
    :param pageno: page number to rasterize (beginning at page 1)
    :param page_dpi: resolution tuple (x, y) overriding output image DPI
    :return:
    """
    res = xres, yres
    int_res = round(xres), round(yres)
    if not page_dpi:
        page_dpi = res
    with NamedTemporaryFile(delete=True) as tmp:
        args_gs = [
            'gs',
            '-dQUIET',
            '-dSAFER',
            '-dBATCH',
            '-dNOPAUSE',
            '-sDEVICE=%s' % raster_device,
            '-dFirstPage=%i' % pageno,
            '-dLastPage=%i' % pageno,
            '-r{0}x{1}'.format(str(int_res[0]), str(int_res[1])),
            '-o', tmp.name,
            fspath(input_file)
        ]

        p = run(args_gs, stdout=PIPE, stderr=STDOUT,
                universal_newlines=True)
        if _gs_error_reported(p.stdout):
            log.error(p.stdout)
        else:
            log.debug(p.stdout)

        if p.returncode != 0:
            log.error('Ghostscript rasterizing failed')
            raise SubprocessOutputError()

        # Ghostscript only accepts integers for output resolution
        # if the resolution happens to be fractional, then the discrepancy
        # would change the size of the output page, especially if the DPI
        # is quite low. Resize the image to the expected size

        tmp.seek(0)
        with Image.open(tmp) as im:
            expected_size = round(im.size[0] / int_res[0] * res[0]), \
                            round(im.size[1] / int_res[1] * res[1])
            if expected_size != im.size or page_dpi != (xres, yres):
                log.debug(
                    "Ghostscript: resize output image {} -> {}".format(
                        im.size, expected_size))
                im.resize(expected_size).save(
                    fspath(output_file), dpi=page_dpi)
            else:
                copy(tmp.name, fspath(output_file))


def generate_pdfa(pdf_pages, output_file, compression, log,
                  threads=1, pdf_version='1.5', pdfa_part='2'):
    compression_args = []
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    # Older versions of Ghostscript expect a leading slash in
    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
    # git commit fe1c025d.
    strategy = 'RGB' if version() >= '9.19' else '/RGB'

    if version() == '9.23':
        # 9.23: new feature JPEG passthrough is broken in some cases, best to
        # disable it always
        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
        compression_args.append('-dPassThroughJPEGImages=false')

    with NamedTemporaryFile(delete=True) as gs_pdf:
        args_gs = [
            "gs",
            "-dQUIET",
            "-dBATCH",
            "-dNOPAUSE",
            "-dCompatibilityLevel=" + str(pdf_version),
            "-dNumRenderingThreads=" + str(threads),
            "-sDEVICE=pdfwrite",
            "-dAutoRotatePages=/None",
            "-sColorConversionStrategy=" + strategy,
            "-sProcessColorModel=DeviceRGB"
        ] + compression_args + [
            "-dJPEGQ=95",
            "-dPDFA=" + pdfa_part,
            "-dPDFACompatibilityPolicy=1",
            "-sOutputFile=" + gs_pdf.name,
        ]
        args_gs.extend(pdf_pages)
        p = run(args_gs, stdout=PIPE, stderr=STDOUT,
                universal_newlines=True)

        if _gs_error_reported(p.stdout):
            log.error(p.stdout)
        elif 'overprint mode not set' in p.stdout:
            # Unless someone is going to print PDF/A documents on a
            # magical sRGB printer I can't see the removal of overprinting
            # being a problem....
            log.debug(
                "Ghostscript had to remove PDF 'overprinting' from the "
                "input file to complete PDF/A conversion. "
                )
        else:
            log.debug(p.stdout)

        if p.returncode == 0:
            # Ghostscript does not change return code when it fails to create
            # PDF/A - check PDF/A status elsewhere
            copy(gs_pdf.name, output_file)
        else:
            log.error('Ghostscript PDF/A rendering failed')
            raise SubprocessOutputError()