Workaround for GS VMerror -25 bug

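Skip DOCINFO keys whose values are empty instead of emitting them as null
strings. Ghostscript <= 9.21 can fail with "VMerror (-25)" when closing the
pdfwrite device if DOCINFO contains null entries; see
https://bugs.ghostscript.com/show_bug.cgi?id=697684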
This commit is contained in:
James R. Barlow
2017-03-28 11:05:43 -07:00
parent 45e9257d6e
commit e71e8ca3ad
2 changed files with 22 additions and 37 deletions

View File

@@ -26,11 +26,7 @@ pdfa_def_template = u"""%!
/ICCProfile ($icc_profile)
def
[ /Title <$title>
/Author <$author>
/Subject <$subject>
/Keywords <$keywords>
/Creator <$creator>
[$docinfo
/DOCINFO pdfmark
% Define an ICC profile :
@@ -89,16 +85,26 @@ def encode_text_string(s: str) -> str:
def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
    """Fill in ``pdfa_def_template`` and return the resulting text.

    Args:
        icc_profile: value substituted for ``$icc_profile`` in the template.
        icc_identifier: value passed to the template as ``icc_identifier``.
        pdfmark: mapping of DOCINFO keys (``'/Title'``, ``'/Author'``,
            ``'/Subject'``, ``'/Creator'``, ``'/Keywords'``) to text values.
            Keys that are missing, ``None``, or whitespace-only are left out
            of the generated DOCINFO entries.

    Returns:
        The template text with every placeholder substituted.
    """
    # Ghostscript <= 9.21 has a bug where null entries in DOCINFO might produce
    # ERROR: VMerror (-25) on closing pdfwrite device.
    # https://bugs.ghostscript.com/show_bug.cgi?id=697684
    # Work around this by only adding keys that have a nontrivial value
    docinfo_keys = ('/Title', '/Author', '/Subject', '/Creator', '/Keywords')
    docinfo_line_template = ' {key} <{value}>'

    docinfo_lines = []
    for key in docinfo_keys:
        value = pdfmark.get(key)
        # Skip absent, None and blank values: emitting them is exactly what
        # triggers the Ghostscript failure described above.
        if value and value.strip():
            docinfo_lines.append(docinfo_line_template.format(
                key=key, value=encode_text_string(value)))
    docinfo = '\n'.join(docinfo_lines)

    # Only $docinfo carries the metadata now; the per-key placeholders
    # (title, author, ...) are no longer supplied.
    t = Template(pdfa_def_template)
    result = t.substitute(icc_profile=icc_profile,
                          icc_identifier=icc_identifier,
                          docinfo=docinfo)
    return result

View File

@@ -167,52 +167,31 @@ def test_preserve_metadata(spoof_tesseract_noop, output_type,
assert pdfa_info['output'] == output_type
# Runs ocrmypdf with --title/--author/--subject overrides (German, Chinese and
# a non-BMP character) for both 'pdfa' and 'pdf' output, then checks that the
# values appear in the output file's document info and that no Keywords entry
# with content was added.
# NOTE(review): this block appears to be a diff hunk whose +/- markers and
# indentation were lost, so superseded and replacement lines may both be
# present below -- confirm against the repository before relying on it.
@pytest.mark.skipif(
pytest.helpers.is_linux() and not pytest.helpers.running_in_docker(),
reason="likely to fail if Linux locale is not configured correctly")
@pytest.mark.skipif(
pytest.helpers.is_macos() and pytest.helpers.running_in_travis(),
reason="save Travis the trouble of installing poppler")
@pytest.mark.xfail(
ghostscript.version() == '9.21',
reason="gs 9.21 has a regression that affects this"
)
@pytest.mark.parametrize("output_type", [
'pdfa', 'pdf'
])
def test_override_metadata(spoof_tesseract_noop, output_type, resources,
outpdf):
input_file = resources / 'c02-22.pdf'
# Metadata values chosen to exercise non-ASCII text, including a character
# outside the Basic Multilingual Plane.
german = 'Du siehst den Wald vor lauter Bäumen nicht.'
chinese = '孔子'
high_unicode = 'U+1030C is: 𐌌'
p, out, err = run_ocrmypdf(
input_file, outpdf,
'--title', german,
'--author', chinese,
'--subject', high_unicode,
'--output-type', output_type,
env=spoof_tesseract_noop)
# Surface the captured stderr in the assertion message on failure.
assert p.returncode == ExitCode.ok, err
pdf = str(outpdf)
reader = pypdf.PdfFileReader(outpdf)
# Collect the output of the external `pdfinfo` tool into a dict, splitting
# each line at its first ':' (presumably "Key: value" lines -- a line
# without ':' would make the unpacking below fail).
out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True)
lines_pdfinfo = out_pdfinfo.splitlines()
pdfinfo = {}
for line in lines_pdfinfo:
k, v = line.strip().split(':', maxsplit=1)
pdfinfo[k.strip()] = v.strip()
# Document info as read back through pypdf.
assert reader.documentInfo['/Title'] == german
assert reader.documentInfo['/Author'] == chinese
assert reader.documentInfo.get('/Keywords', '') == ''
# The same metadata as reported by pdfinfo.
assert pdfinfo['Title'] == german
assert pdfinfo['Author'] == chinese
assert pdfinfo['Subject'] == high_unicode
assert pdfinfo.get('Keywords', '') == ''
# NOTE(review): the next assignment is immediately overwritten by the one
# after it -- presumably the old and new sides of the same diff line; confirm.
pdfa_info = file_claims_pdfa(pdf)
pdfa_info = file_claims_pdfa(outpdf)
assert pdfa_info['output'] == output_type