mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-08 06:36:20 -04:00
Workaround for GS VMerror -25 bug
Avoid inserting docinfo keys that would be translated to null strings, to avoid running afoul of https://bugs.ghostscript.com/show_bug.cgi?id=697684
This commit is contained in:
@@ -26,11 +26,7 @@ pdfa_def_template = u"""%!
|
||||
/ICCProfile ($icc_profile)
|
||||
def
|
||||
|
||||
[ /Title <$title>
|
||||
/Author <$author>
|
||||
/Subject <$subject>
|
||||
/Keywords <$keywords>
|
||||
/Creator <$creator>
|
||||
[$docinfo
|
||||
/DOCINFO pdfmark
|
||||
|
||||
% Define an ICC profile :
|
||||
@@ -89,16 +85,26 @@ def encode_text_string(s: str) -> str:
|
||||
|
||||
|
||||
def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
    """Fill in ``pdfa_def_template`` and return the resulting text.

    Args:
        icc_profile: value substituted into the template as ``icc_profile``.
        icc_identifier: value substituted into the template as
            ``icc_identifier``.
        pdfmark: dict mapping DOCINFO keys (``'/Title'``, ``'/Author'``,
            ``'/Subject'``, ``'/Creator'``, ``'/Keywords'``) to text values.
            Keys may be absent.

    Returns:
        The template text after substitution, as a ``str``.
    """
    # Ghostscript <= 9.21 has a bug where null entries in DOCINFO might produce
    # ERROR: VMerror (-25) on closing pdfwrite device.
    # https://bugs.ghostscript.com/show_bug.cgi?id=697684
    # Work around this by only adding keys that have a nontrivial value
    docinfo_keys = ('/Title', '/Author', '/Subject', '/Creator', '/Keywords')
    docinfo_line_template = ' {key} <{value}>'

    docinfo_lines = []
    for key in docinfo_keys:
        value = pdfmark.get(key, '')
        # Skip missing, empty and whitespace-only values: they would be
        # written as null strings, which is what triggers the bug above.
        if value.strip() != '':
            docinfo_lines.append(
                docinfo_line_template.format(
                    key=key, value=encode_text_string(value)))

    docinfo = '\n'.join(docinfo_lines)

    t = Template(pdfa_def_template)
    result = t.substitute(icc_profile=icc_profile,
                          icc_identifier=icc_identifier,
                          docinfo=docinfo)
    return result
|
||||
|
||||
|
||||
|
||||
@@ -167,52 +167,31 @@ def test_preserve_metadata(spoof_tesseract_noop, output_type,
|
||||
assert pdfa_info['output'] == output_type
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    pytest.helpers.is_linux() and not pytest.helpers.running_in_docker(),
    reason="likely to fail if Linux locale is not configured correctly")
@pytest.mark.skipif(
    pytest.helpers.is_macos() and pytest.helpers.running_in_travis(),
    reason="save Travis the trouble of installing poppler")
@pytest.mark.xfail(
    ghostscript.version() == '9.21',
    reason="gs 9.21 has a regression that affects this"
)
@pytest.mark.parametrize("output_type", [
    'pdfa', 'pdf'
])
def test_override_metadata(spoof_tesseract_noop, output_type, resources,
                           outpdf):
    """Check that --title/--author/--subject end up in the output metadata.

    Runs ocrmypdf on ``c02-22.pdf`` with non-ASCII metadata values, then
    verifies the result both through the PDF reader's document info and
    through the output of the ``pdfinfo`` command-line tool. Keywords are
    not supplied and are expected to be absent or empty.
    """
    input_file = resources / 'c02-22.pdf'

    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'
    high_unicode = 'U+1030C is: 𐌌'

    p, out, err = run_ocrmypdf(
        input_file, outpdf,
        '--title', german,
        '--author', chinese,
        '--subject', high_unicode,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok, err

    pdf = str(outpdf)
    reader = pypdf.PdfFileReader(outpdf)

    # Parse "Key:   value" lines printed by pdfinfo into a dict. partition()
    # splits on the first colon only and tolerates a line with no colon.
    out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True)
    pdfinfo = {}
    for line in out_pdfinfo.splitlines():
        k, _, v = line.strip().partition(':')
        pdfinfo[k.strip()] = v.strip()

    assert reader.documentInfo['/Title'] == german
    assert reader.documentInfo['/Author'] == chinese
    assert reader.documentInfo.get('/Keywords', '') == ''

    assert pdfinfo['Title'] == german
    assert pdfinfo['Author'] == chinese
    assert pdfinfo['Subject'] == high_unicode
    assert pdfinfo.get('Keywords', '') == ''

    # A single call is enough; its result is the one asserted below.
    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user