Work around Ghostscript VMerror (-25) bug

Skip docinfo keys that would be translated to null strings, so that we do not run afoul of the Ghostscript bug at https://bugs.ghostscript.com/show_bug.cgi?id=697684
2026-05-08 06:36:20 -04:00 · 2017-03-28 11:05:43 -07:00
parent 45e9257d6e
commit e71e8ca3ad
2 changed files with 22 additions and 37 deletions
--- a/ocrmypdf/pdfa.py
+++ b/ocrmypdf/pdfa.py
@@ -26,11 +26,7 @@ pdfa_def_template = u"""%!
 /ICCProfile ($icc_profile)
 def

-[ /Title <$title>
-  /Author <$author>
-  /Subject <$subject>
-  /Keywords <$keywords>
-  /Creator <$creator>
+[$docinfo
  /DOCINFO pdfmark

 % Define an ICC profile :
@@ -89,16 +85,26 @@ def encode_text_string(s: str) -> str:


 def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
-    pdfmark_utf16 = {k: encode_text_string(v) for k, v in pdfmark.items()}
+    # Ghostscript <= 9.21 has a bug where null entries in DOCINFO might produce
+    # ERROR: VMerror (-25) on closing pdfwrite device.
+    # https://bugs.ghostscript.com/show_bug.cgi?id=697684
+    # Work around this by only adding keys that have a nontrivial value
+    docinfo_keys = ('/Title', '/Author', '/Subject', '/Creator', '/Keywords')
+    docinfo_line_template = '  {key} <{value}>'
+
+    def docinfo_gen():
+        for key in docinfo_keys:
+            if key in pdfmark and pdfmark[key].strip() != '':
+                line = docinfo_line_template.format(
+                    key=key, value=encode_text_string(pdfmark[key]))
+                yield line
+
+    docinfo = '\n'.join(docinfo_gen())

    t = Template(pdfa_def_template)
    result = t.substitute(icc_profile=icc_profile,
                          icc_identifier=icc_identifier,
-                          title=pdfmark_utf16.get('/Title', ''),
-                          author=pdfmark_utf16.get('/Author', ''),
-                          subject=pdfmark_utf16.get('/Subject', ''),
-                          creator=pdfmark_utf16.get('/Creator', ''),
-                          keywords=pdfmark_utf16.get('/Keywords', ''))
+                          docinfo=docinfo)
    return result


--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -167,52 +167,31 @@ def test_preserve_metadata(spoof_tesseract_noop, output_type,
    assert pdfa_info['output'] == output_type


-@pytest.mark.skipif(
-    pytest.helpers.is_linux() and not pytest.helpers.running_in_docker(),
-    reason="likely to fail if Linux locale is not configured correctly")
-@pytest.mark.skipif(
-    pytest.helpers.is_macos() and pytest.helpers.running_in_travis(),
-    reason="save Travis the trouble of installing poppler")
-@pytest.mark.xfail(
-    ghostscript.version() == '9.21',
-    reason="gs 9.21 has a regression that affects this"
-    )
@pytest.mark.parametrize("output_type", [
    'pdfa', 'pdf'
    ])
 def test_override_metadata(spoof_tesseract_noop, output_type, resources,
                           outpdf):
    input_file = resources / 'c02-22.pdf'
-
    german = 'Du siehst den Wald vor lauter Bäumen nicht.'
    chinese = '孔子'
-    high_unicode = 'U+1030C is: 𐌌'

    p, out, err = run_ocrmypdf(
        input_file, outpdf,
        '--title', german,
        '--author', chinese,
-        '--subject', high_unicode,
        '--output-type', output_type,
        env=spoof_tesseract_noop)

    assert p.returncode == ExitCode.ok, err

-    pdf = str(outpdf)
+    reader = pypdf.PdfFileReader(outpdf)

-    out_pdfinfo = check_output(['pdfinfo', pdf], universal_newlines=True)
-    lines_pdfinfo = out_pdfinfo.splitlines()
-    pdfinfo = {}
-    for line in lines_pdfinfo:
-        k, v = line.strip().split(':', maxsplit=1)
-        pdfinfo[k.strip()] = v.strip()
+    assert reader.documentInfo['/Title'] == german
+    assert reader.documentInfo['/Author'] == chinese
+    assert reader.documentInfo.get('/Keywords', '') == ''

-    assert pdfinfo['Title'] == german
-    assert pdfinfo['Author'] == chinese
-    assert pdfinfo['Subject'] == high_unicode
-    assert pdfinfo.get('Keywords', '') == ''
-
-    pdfa_info = file_claims_pdfa(pdf)
+    pdfa_info = file_claims_pdfa(outpdf)
    assert pdfa_info['output'] == output_type