Files
OCRmyPDF/ocrmypdf/exec/qpdf.py
2018-03-09 11:10:45 -08:00

204 lines
6.4 KiB
Python

# © 2017 James R. Barlow: github.com/jbarlow83
from subprocess import CalledProcessError, STDOUT, PIPE, run
from functools import lru_cache
import sys
import os
import re
import resource
from ..exceptions import InputFileError, SubprocessOutputError, \
MissingDependencyError, EncryptedPdfError
from . import get_program, get_version
from ..helpers import re_symlink
@lru_cache(maxsize=1)
def version():
    """Return the qpdf version as reported by ``get_version``.

    The result is cached by ``lru_cache`` after the first successful call,
    so repeated calls do not repeat the lookup.
    """
    version_pattern = r'qpdf version (.+)'
    return get_version('qpdf', regex=version_pattern)
def check(input_file, log=None):
    """Run ``qpdf --check`` on *input_file* and report whether it passed.

    Returns True when qpdf exits with status 0.  On any non-zero exit status
    the captured output is logged and False is returned: status 2 is logged
    at error level, status 3 at info level, anything else at warning level.

    When *log* is None the standard ``logging`` module is used as the logger.
    """
    command = [get_program('qpdf'), '--check', input_file]

    if log is None:
        import logging
        log = logging

    try:
        run(command, stdout=PIPE, stderr=STDOUT, universal_newlines=True,
            check=True)
    except CalledProcessError as err:
        status = err.returncode
        if status == 3:
            log.info("qpdf --check returned warnings:")
            log.info(err.output)
        elif status == 2:
            log.error("{0}: not a valid PDF, and could not repair it.".format(
                input_file))
            log.error("Details:")
            log.error(err.output)
        else:
            log.warning(err.output)
        return False
    else:
        return True
def _probably_encrypted(e):
    """Decide whether a failed qpdf run indicates an encrypted PDF.

    qpdf can emit a false-positive "invalid password" message for damaged
    files, so the failure only counts as encryption when the exit status
    is 2, the output mentions an invalid password, and the output does not
    also say the file is damaged.
    """
    if e.returncode != 2:
        return False
    output = e.output
    if 'invalid password' not in output:
        return False
    return 'file is damaged' not in output
def repair(input_file, output_file, log):
    """Rewrite *input_file* to *output_file* with qpdf, repairing it if needed.

    Returns None on success, including the case where qpdf exits with
    status 3 but reports "operation succeeded" (logged at debug level).

    Raises:
        EncryptedPdfError: the failure looks like an encrypted file (see
            ``_probably_encrypted``).
        InputFileError: qpdf exited with status 2 for any other reason.
        SubprocessOutputError: any other non-zero exit status, including
            status 3 without an "operation succeeded" message.
    """
    args_qpdf = [
        get_program('qpdf'), input_file, output_file
    ]
    try:
        run(args_qpdf, stderr=STDOUT, stdout=PIPE, universal_newlines=True,
            check=True)
    except CalledProcessError as e:
        # Use a membership test: str.find() returns -1 (truthy) on a miss and
        # 0 (falsy) on a match at index 0, so it cannot serve as a boolean.
        if e.returncode == 3 and "operation succeeded" in e.output:
            log.debug('qpdf found and fixed errors: ' + e.output)
            return

        if _probably_encrypted(e):
            raise EncryptedPdfError() from e
        elif e.returncode == 2:
            log.error("{0}: not a valid PDF, and could not repair it.".format(
                input_file))
            log.error("Details: " + e.output)
            raise InputFileError() from e
        else:
            log.error("{0}: unknown error".format(
                input_file))
            log.error(e.output)
            raise SubprocessOutputError() from e
def get_npages(input_file, log):
    """Return the number of pages in *input_file* as reported by qpdf.

    Runs ``qpdf --show-npages`` and parses the captured output as an integer.

    Raises:
        InputFileError: qpdf exited with status 2 and its output contains
            "No such file"; the output is logged at error level first.
        CalledProcessError: qpdf failed for any other reason.
        ValueError: qpdf succeeded but its output is not an integer.
    """
    try:
        pages = run(
            [get_program('qpdf'), '--show-npages', input_file],
            universal_newlines=True, check=True, stdout=PIPE, stderr=STDOUT)
    except CalledProcessError as e:
        # Membership test rather than str.find(), whose -1 "not found"
        # result is truthy.
        if e.returncode == 2 and 'No such file' in e.output:
            log.error(e.output)
            raise InputFileError() from e
        # Propagate other failures instead of falling through to an
        # unbound ``pages``.
        raise
    # run() returns a CompletedProcess; the page count is in its stdout.
    return int(pages.stdout)
def split_pages(input_file, work_folder, npages):
    """Write each page of *input_file* to its own PDF inside *work_folder*.

    Pages are numbered from 1 and saved as ``NNNNNN.page.pdf`` (page number
    zero-padded to six digits).  One qpdf process is launched per page; the
    original author notes this is still about 70 times faster than using
    Ghostscript.
    """
    last_page = int(npages)
    for page_number in range(1, last_page + 1):
        qpdf = get_program('qpdf')
        page_path = os.path.join(
            work_folder, '{0:06d}.page.pdf'.format(page_number))
        command = [
            qpdf, input_file,
            '--pages', input_file, str(page_number), '--',
            page_path,
        ]
        run(command, check=True)
def _merge_inner(input_files, output_file, min_version=None, log=None):
    """Merge the list of input files (all filenames) into the output file.

    The input files may contain one or more pages.  If *min_version* is
    truthy it is passed to qpdf as ``--min-version``.  When *log* is None
    the standard ``logging`` module is used.

    Raises:
        CalledProcessError: qpdf exited with a non-zero status, except for
            the single whitelisted case handled below.
    """
    # Single page 'merges' should still be attempted so that the same error
    # checking is applied to the single page case
    version_arg = ['--min-version={}'.format(min_version)] \
        if min_version else []

    if log is None:
        import logging as log

    args_qpdf = [
        get_program('qpdf')
    ] + version_arg + [
        input_files[0], '--pages'
    ] + list(input_files) + ['--', output_file]

    try:
        run(args_qpdf, check=True, stderr=PIPE, universal_newlines=True)
    except CalledProcessError as e:
        # Membership tests, not str.find(): find() returns -1 (truthy) when
        # the text is absent, which made this whitelist match any status 3.
        if e.returncode == 3 and \
                "unknown token while reading object" in e.stderr and \
                "operation succeeded" in e.stderr:
            # Only whitelist the 'unknown token' problem (decimal/string issue)
            # qpdf issue #165
            log.warning('qpdf found and fixed errors: ' + e.stderr)
            return
        # Re-raise unchanged; ``raise e from e`` would make the exception
        # its own cause.
        raise
def merge(input_files, output_file, min_version=None, log=None, max_files=None):
    """Merge the list of input files (all filenames) into the output file.

    The input files may contain one or more pages.  Files are merged in
    batches of at most *max_files*; each batch is written to a temporary
    ``merge-NNNNNN-xxxxxx.pdf`` next to *output_file*, and batches are merged
    again until one file remains.  That file is then linked to *output_file*
    via ``re_symlink``.

    When *max_files* is None or less than 2, the batch size is derived from
    the soft ``RLIMIT_NOFILE`` limit.  When *log* is None, progress messages
    go to the standard ``logging`` module; *log* itself is still passed
    through unchanged to ``_merge_inner`` and ``re_symlink``.

    Raises:
        IndexError: *input_files* is empty.
    """
    import random
    import string

    # Progress messages need a usable logger even when the caller gave none.
    logger = log
    if logger is None:
        import logging as logger

    # qpdf requires that every file that contributes to the output has a file
    # descriptor that remains open. That means, given our approach of one
    # intermediate PDF per page, we can practically hit the limit on the
    # number of open file descriptors.
    if max_files is None or max_files < 2:
        # Find out how many open file descriptors we can get away with
        ulimits = resource.getrlimit(resource.RLIMIT_NOFILE)
        max_open_files = ulimits[0]
        # Conservative guess, but never below 2: a batch size under 2 would
        # not reduce the number of files from one pass to the next.
        max_files = max(2, max_open_files // 2)

    # We'll write things alongside the output file
    output_dir = os.path.dirname(output_file)

    def randstr():
        return ''.join(random.sample(string.ascii_lowercase, 6))

    # How many files to grab at once, merging all their contents
    step_size = max_files

    # Work on a copy so the caller's sequence is never mutated
    workqueue = list(input_files)
    counter = 1
    next_workqueue = []

    while len(workqueue) > 1 or next_workqueue:
        # Take n files out of the queue
        n = min(step_size, len(workqueue))
        job = workqueue[0:n]
        del workqueue[0:n]
        logger.debug('merging ' + repr(job))

        # Merge them into 1 file
        merge_file = os.path.join(
            output_dir, "merge-{:06d}-{}.pdf".format(counter, randstr()))
        counter += 1
        _merge_inner(job, merge_file, min_version=min_version, log=log)

        # The merged file becomes an input of the next pass
        next_workqueue.append(merge_file)
        logger.debug('next_workqueue ' + repr(next_workqueue))

        # If we're out of things to do in this queue, move on to the next
        # queue. On the counter-th pass of the workqueue we can chew through
        # (step_size)**N pages, so on most systems the second pass finishes
        # the job.
        if not workqueue:
            workqueue = next_workqueue
            next_workqueue = []

    re_symlink(workqueue.pop(), output_file, log)