From 7a1cd39b21002b29dccffccab0afeb17743b3e30 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Mon, 2 Apr 2018 17:53:39 -0700
Subject: [PATCH] Fix creation date metadata lost from input

Closes #247
---
 src/ocrmypdf/pdfa.py     | 83 ++++++++++++++++++++++++++++++++++++++--
 src/ocrmypdf/pipeline.py | 14 +++----
 tests/test_metadata.py   | 40 ++++++++++++++++++-
 3 files changed, 125 insertions(+), 12 deletions(-)

diff --git a/src/ocrmypdf/pdfa.py b/src/ocrmypdf/pdfa.py
index 68e0b190..e4947bfa 100644
--- a/src/ocrmypdf/pdfa.py
+++ b/src/ocrmypdf/pdfa.py
@@ -19,6 +19,8 @@
 
 from string import Template
 from binascii import hexlify
+from datetime import datetime
+from xml.parsers.expat import ExpatError
 import pkg_resources
 import PyPDF2 as pypdf
 
@@ -98,7 +100,79 @@ def encode_text_string(s: str) -> str:
     return ascii_hex_str
 
 
+def encode_pdf_date(d: datetime) -> str:
+    """Encode Python datetime object as PDF date string
+
+    From Adobe pdfmark manual:
+    (D:YYYYMMDDHHmmSSOHH'mm')
+    D: is an optional prefix. YYYY is the year. All fields after the year are
+    optional. MM is the month (01-12), DD is the day (01-31), HH is the
+    hour (00-23), mm are the minutes (00-59), and SS are the seconds
+    (00-59). The remainder of the string defines the relation of local
+    time to GMT. O is either + for a positive difference (local time is
+    later than GMT) or - (minus) for a negative difference. HH' is the
+    absolute value of the offset from GMT in hours, and mm' is the
+    absolute value of the offset in minutes. If no GMT information is
+    specified, the relation between the specified time and GMT is
+    considered unknown. Regardless of whether or not GMT
+    information is specified, the remainder of the string should specify
+    the local time.
+
+    The returned string has no D: prefix. A naive datetime (no tzinfo) is
+    encoded without an offset; an aware one gets a trailing +HH'mm' or -HH'mm'.
+    """
+    s = d.strftime(r'%Y%m%d%H%M%S')
+
+    # strftime('%z') is '' for naive datetimes, otherwise sign + HHMM
+    tz = d.strftime('%z')
+    if tz:
+        sign, tz_hours, tz_mins = tz[0], tz[1:3], tz[3:5]
+        s += "{}{}'{}'".format(sign, tz_hours, tz_mins)
+    return s
+
+
+def decode_pdf_date(s: str) -> datetime:
+    """Decode a PDF date string to a datetime, or None if it cannot be parsed.
+
+    Accepts an optional D: prefix, a trailing Z and offsets such as +04'30'.
+    """
+    if s.startswith('D:'):
+        s = s[2:]
+    s = s.replace("'", '')  # strptime's %z wants +0430, not PDF's +04'30'
+    for fmt in (r'%Y%m%d%H%M%S%z', r'%Y%m%d%H%M%S', r'%Y%m%d%H%M%SZ'):
+        try:
+            return datetime.strptime(s, fmt)
+        except ValueError:
+            continue
+    return None
+
+
+def _get_pdfmark_dates(pdfmark):
+    """Yield pdfmark Postscript lines for /CreationDate and /ModDate.
+
+    A blank date is written as null: if the key were omitted Ghostscript would
+    set it to now, erasing the fact that it was unknown, and an empty string
+    breaks 9.22: https://bugs.ghostscript.com/show_bug.cgi?id=699182
+    """
+    for key in ('/CreationDate', '/ModDate'):
+        if key not in pdfmark:
+            continue
+        if pdfmark[key].strip() == '':
+            yield '  {} null'.format(key)
+            continue
+        date_str = pdfmark[key]
+        if date_str.startswith('D:'):
+            date_str = date_str[2:]
+        for ch in '\\()':  # escape Postscript string delimiters in input data
+            date_str = date_str.replace(ch, '\\' + ch)
+        yield '  {} (D:{})'.format(key, date_str)
+
+
 def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
+    """Create a Postscript file for Ghostscript.  pdfmark contains the various
+    objects as strings; these must be encoded in ASCII, and dates have a 
+    special format."""
+
     # Ghostscript <= 9.21 has a bug where null entries in DOCINFO might produce
     # ERROR: VMerror (-25) on closing pdfwrite device.
     # https://bugs.ghostscript.com/show_bug.cgi?id=697684
@@ -107,12 +181,12 @@ def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
     docinfo_line_template = '  {key} <{value}>'
 
     def docinfo_gen():
+        yield from _get_pdfmark_dates(pdfmark)
         for key in docinfo_keys:
             if key in pdfmark and pdfmark[key].strip() != '':
                 line = docinfo_line_template.format(
                     key=key, value=encode_text_string(pdfmark[key]))
                 yield line
-
     docinfo = '\n'.join(docinfo_gen())
 
     t = Template(pdfa_def_template)
@@ -145,9 +219,12 @@ def file_claims_pdfa(filename):
 
     This checks if the XMP metadata contains a PDF/A marker.
     """
-
     pdf = pypdf.PdfFileReader(filename)
-    xmp = pdf.getXmpMetadata()
+    try:
+        xmp = pdf.getXmpMetadata()
+    except ExpatError:
+        return {'pass': False, 'output': 'pdf',
+                'conformance': 'Invalid XML metadata'}
 
     try:
         pdfa_nodes = xmp.getNodesInNamespace(
diff --git a/src/ocrmypdf/pipeline.py b/src/ocrmypdf/pipeline.py
index a9bdb4d5..11459f21 100644
--- a/src/ocrmypdf/pipeline.py
+++ b/src/ocrmypdf/pipeline.py
@@ -18,6 +18,7 @@
 from contextlib import suppress
 from shutil import copyfileobj
 from pathlib import Path
+from datetime import datetime
 import sys
 import os
 import shutil
@@ -31,7 +32,7 @@ from ruffus import formatter, regex, Pipeline, suffix
 
 from .hocrtransform import HocrTransform
 from .pdfinfo import PdfInfo, Encoding, Colorspace
-from .pdfa import generate_pdfa_ps
+from .pdfa import generate_pdfa_ps, encode_pdf_date
 from .helpers import re_symlink, is_iterable_notstr, page_number
 from .exec import ghostscript, tesseract, qpdf
 from .lib import fitz
@@ -871,12 +872,8 @@ def get_pdfmark(base_pdf, options):
         except (KeyError, TypeError):
             return ''
 
-    pdfmark = {
-        '/Title': from_document_info('/Title'),
-        '/Author': from_document_info('/Author'),
-        '/Keywords': from_document_info('/Keywords'),
-        '/Subject': from_document_info('/Subject'),
-    }
+    pdfmark = {k: from_document_info(k) for k in 
+        ('/Title', '/Author', '/Keywords', '/Subject', '/CreationDate')}
     if options.title:
         pdfmark['/Title'] = options.title
     if options.author:
@@ -897,6 +894,7 @@ def get_pdfmark(base_pdf, options):
         PROGRAM_NAME, VERSION,
         renderer_tag,
         tesseract.version())
+    pdfmark['/ModDate'] = encode_pdf_date(datetime.utcnow())
     return pdfmark
 
 
@@ -1030,7 +1028,7 @@ def merge_pages_mupdf(
     reader_metadata = pypdf.PdfFileReader(metadata_file)
     pdfmark = get_pdfmark(reader_metadata, options)
     pdfmark['/Producer'] = 'PyMuPDF ' + fitz.version[0]
-    pymupdf_metadata = {k[1:].lower() : v for k, v in pdfmark.items()}
+    pymupdf_metadata = {(k[1].lower() + k[2:]) : v for k, v in pdfmark.items()}
 
     for pdf_page in pdf_pages:
         page = fitz.open(pdf_page)
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index 64b8afb4..ebcb78d1 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -18,8 +18,9 @@
 
 import pytest
 import PyPDF2 as pypdf
+import datetime
 
-from ocrmypdf.pdfa import file_claims_pdfa
+from ocrmypdf.pdfa import file_claims_pdfa, encode_pdf_date, decode_pdf_date
 from ocrmypdf.exceptions import ExitCode
 from ocrmypdf.lib import fitz
 
@@ -115,3 +116,40 @@ def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option,
     print(before_toc)
     print(after_toc)
     assert before_toc == after_toc
+
+
+def seconds_between_dates(date1, date2):
+    return (date2 - date1).total_seconds()  # signed: < 0 if date2 is earlier
+
+
+@pytest.mark.parametrize('infile', ['trivial.pdf', 'jbig2.pdf'])
+@pytest.mark.parametrize('output_type', ['pdf', 'pdfa'])
+def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources,
+                                 infile, outpdf):
+    """Check the output keeps the input /CreationDate and has a recent /ModDate."""
+    input_file = resources / infile
+
+    before = pypdf.PdfFileReader(str(input_file)).getDocumentInfo()
+    check_ocrmypdf(
+        input_file, outpdf, '--output-type', output_type,
+        env=spoof_tesseract_noop)
+    after = pypdf.PdfFileReader(str(outpdf)).getDocumentInfo()
+
+    if not before:
+        # If there was no input creation date, none should be output;
+        # because of Ghostscript quirks we set it to null.  A test file with
+        # /DocumentInfo but no /CreationDate would be better; we have none.
+        assert not after['/CreationDate'] or \
+                isinstance(after['/CreationDate'], pypdf.generic.NullObject)
+    else:
+        # The creation date must stay the same, in either direction (abs)
+        date_before = decode_pdf_date(before['/CreationDate'])
+        date_after = decode_pdf_date(after['/CreationDate'])
+        assert abs(seconds_between_dates(date_before, date_after)) < 1000
+
+    # We expect that the modified date is quite recent
+    date_after = decode_pdf_date(after['/ModDate'])
+    assert seconds_between_dates(
+        date_after, datetime.datetime.utcnow()) < 1000
+
+