From e71e8ca3ada46860a69ff251093cc6566d3ecd09 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 28 Mar 2017 11:05:43 -0700 Subject: [PATCH] Workaround for GS VMerror -25 bug Avoid inserting docinfo keys that would be translated to null strings, to avoid running afoul of https://bugs.ghostscript.com/show_bug.cgi?id=697684 --- ocrmypdf/pdfa.py | 28 +++++++++++++++++----------- tests/test_main.py | 31 +++++-------------------------- 2 files changed, 22 insertions(+), 37 deletions(-) diff --git a/ocrmypdf/pdfa.py b/ocrmypdf/pdfa.py index 09b25cab..b10262f8 100644 --- a/ocrmypdf/pdfa.py +++ b/ocrmypdf/pdfa.py @@ -26,11 +26,7 @@ pdfa_def_template = u"""%! /ICCProfile ($icc_profile) def -[ /Title <$title> - /Author <$author> - /Subject <$subject> - /Keywords <$keywords> - /Creator <$creator> +[$docinfo /DOCINFO pdfmark % Define an ICC profile : @@ -89,16 +85,26 @@ def encode_text_string(s: str) -> str: def _get_pdfa_def(icc_profile, icc_identifier, pdfmark): - pdfmark_utf16 = {k: encode_text_string(v) for k, v in pdfmark.items()} + # Ghostscript <= 9.21 has a bug where null entries in DOCINFO might produce + # ERROR: VMerror (-25) on closing pdfwrite device. + # https://bugs.ghostscript.com/show_bug.cgi?id=697684 + # Work around this by only adding keys that have a nontrivial value + docinfo_keys = ('/Title', '/Author', '/Subject', '/Creator', '/Keywords') + docinfo_line_template = ' {key} <{value}>' + + def docinfo_gen(): + for key in docinfo_keys: + if key in pdfmark and pdfmark[key].strip() != '': + line = docinfo_line_template.format( + key=key, value=encode_text_string(pdfmark[key])) + yield line + + docinfo = '\n'.join(docinfo_gen()) t = Template(pdfa_def_template) result = t.substitute(icc_profile=icc_profile, icc_identifier=icc_identifier, - title=pdfmark_utf16.get('/Title', ''), - author=pdfmark_utf16.get('/Author', ''), - subject=pdfmark_utf16.get('/Subject', ''), - creator=pdfmark_utf16.get('/Creator', ''), - keywords=pdfmark_utf16.get('/Keywords', '')) + docinfo=docinfo) return result diff --git a/tests/test_main.py b/tests/test_main.py index 730a1609..767b4f23 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -167,52 +167,31 @@ def test_preserve_metadata(spoof_tesseract_noop, output_type, assert pdfa_info['output'] == output_type -@pytest.mark.skipif( - pytest.helpers.is_linux() and not pytest.helpers.running_in_docker(), - reason="likely to fail if Linux locale is not configured correctly") -@pytest.mark.skipif( - pytest.helpers.is_macos() and pytest.helpers.running_in_travis(), - reason="save Travis the trouble of installing poppler") -@pytest.mark.xfail( - ghostscript.version() == '9.21', - reason="gs 9.21 has a regression that affects this" - ) @pytest.mark.parametrize("output_type", [ 'pdfa', 'pdf' ]) def test_override_metadata(spoof_tesseract_noop, output_type, resources, outpdf): input_file = resources / 'c02-22.pdf' - german = 'Du siehst den Wald vor lauter Bäumen nicht.' chinese = '孔子' - high_unicode = 'U+1030C is: 𐌌' p, out, err = run_ocrmypdf( input_file, outpdf, '--title', german, '--author', chinese, - '--subject', high_unicode, '--output-type', output_type, env=spoof_tesseract_noop) assert p.returncode == ExitCode.ok, err - pdf = str(outpdf) + reader = pypdf.PdfFileReader(outpdf) - out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True) - lines_pdfinfo = out_pdfinfo.splitlines() - pdfinfo = {} - for line in lines_pdfinfo: - k, v = line.strip().split(':', maxsplit=1) - pdfinfo[k.strip()] = v.strip() + assert reader.documentInfo['/Title'] == german + assert reader.documentInfo['/Author'] == chinese + assert reader.documentInfo.get('/Keywords', '') == '' - assert pdfinfo['Title'] == german - assert pdfinfo['Author'] == chinese - assert pdfinfo['Subject'] == high_unicode - assert pdfinfo.get('Keywords', '') == '' - - pdfa_info = file_claims_pdfa(pdf) + pdfa_info = file_claims_pdfa(outpdf) assert pdfa_info['output'] == output_type