From e71e8ca3ada46860a69ff251093cc6566d3ecd09 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Tue, 28 Mar 2017 11:05:43 -0700
Subject: [PATCH] Workaround for GS VMerror -25 bug

Skip docinfo keys whose values are empty or whitespace-only, since they
would be translated to null strings and can trigger the Ghostscript bug at
https://bugs.ghostscript.com/show_bug.cgi?id=697684
---
 ocrmypdf/pdfa.py   | 28 +++++++++++++++++-----------
 tests/test_main.py | 31 +++++--------------------------
 2 files changed, 22 insertions(+), 37 deletions(-)

diff --git a/ocrmypdf/pdfa.py b/ocrmypdf/pdfa.py
index 09b25cab..b10262f8 100644
--- a/ocrmypdf/pdfa.py
+++ b/ocrmypdf/pdfa.py
@@ -26,11 +26,7 @@ pdfa_def_template = u"""%!
 /ICCProfile ($icc_profile)
 def
 
-[ /Title <$title>
-  /Author <$author>
-  /Subject <$subject>
-  /Keywords <$keywords>
-  /Creator <$creator>
+[$docinfo
   /DOCINFO pdfmark
 
 % Define an ICC profile :
@@ -89,16 +85,26 @@ def encode_text_string(s: str) -> str:
 
 
 def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
-    pdfmark_utf16 = {k: encode_text_string(v) for k, v in pdfmark.items()}
+    # Ghostscript <= 9.21 has a bug where null entries in DOCINFO might produce
+    # ERROR: VMerror (-25) on closing pdfwrite device.
+    # https://bugs.ghostscript.com/show_bug.cgi?id=697684
+    # Work around this by only adding keys that have a nontrivial value
+    docinfo_keys = ('/Title', '/Author', '/Subject', '/Creator', '/Keywords')
+    docinfo_line_template = '  {key} <{value}>'
+
+    def docinfo_gen():
+        for key in docinfo_keys:
+            if key in pdfmark and pdfmark[key].strip() != '':
+                line = docinfo_line_template.format(
+                    key=key, value=encode_text_string(pdfmark[key]))
+                yield line
+
+    docinfo = '\n'.join(docinfo_gen())
 
     t = Template(pdfa_def_template)
     result = t.substitute(icc_profile=icc_profile,
                           icc_identifier=icc_identifier,
-                          title=pdfmark_utf16.get('/Title', ''),
-                          author=pdfmark_utf16.get('/Author', ''),
-                          subject=pdfmark_utf16.get('/Subject', ''),
-                          creator=pdfmark_utf16.get('/Creator', ''),
-                          keywords=pdfmark_utf16.get('/Keywords', ''))
+                          docinfo=docinfo)
     return result
 
 
diff --git a/tests/test_main.py b/tests/test_main.py
index 730a1609..767b4f23 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -167,52 +167,31 @@ def test_preserve_metadata(spoof_tesseract_noop, output_type,
     assert pdfa_info['output'] == output_type
 
 
-@pytest.mark.skipif(
-    pytest.helpers.is_linux() and not pytest.helpers.running_in_docker(),
-    reason="likely to fail if Linux locale is not configured correctly")
-@pytest.mark.skipif(
-    pytest.helpers.is_macos() and pytest.helpers.running_in_travis(),
-    reason="save Travis the trouble of installing poppler")
-@pytest.mark.xfail(
-    ghostscript.version() == '9.21',
-    reason="gs 9.21 has a regression that affects this"
-    )
 @pytest.mark.parametrize("output_type", [
     'pdfa', 'pdf'
     ])
 def test_override_metadata(spoof_tesseract_noop, output_type, resources,
                            outpdf):
     input_file = resources / 'c02-22.pdf'
-
     german = 'Du siehst den Wald vor lauter Bäumen nicht.'
     chinese = '孔子'
-    high_unicode = 'U+1030C is: 𐌌'
 
     p, out, err = run_ocrmypdf(
         input_file, outpdf,
         '--title', german,
         '--author', chinese,
-        '--subject', high_unicode,
         '--output-type', output_type,
         env=spoof_tesseract_noop)
 
     assert p.returncode == ExitCode.ok, err
 
-    pdf = str(outpdf)
+    reader = pypdf.PdfFileReader(outpdf)
 
-    out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True)
-    lines_pdfinfo = out_pdfinfo.splitlines()
-    pdfinfo = {}
-    for line in lines_pdfinfo:
-        k, v = line.strip().split(':', maxsplit=1)
-        pdfinfo[k.strip()] = v.strip()
+    assert reader.documentInfo['/Title'] == german
+    assert reader.documentInfo['/Author'] == chinese
+    assert reader.documentInfo.get('/Keywords', '') == ''
 
-    assert pdfinfo['Title'] == german
-    assert pdfinfo['Author'] == chinese
-    assert pdfinfo['Subject'] == high_unicode
-    assert pdfinfo.get('Keywords', '') == ''
-
-    pdfa_info = file_claims_pdfa(pdf)
+    pdfa_info = file_claims_pdfa(outpdf)
     assert pdfa_info['output'] == output_type