From 7a1cd39b21002b29dccffccab0afeb17743b3e30 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Mon, 2 Apr 2018 17:53:39 -0700 Subject: [PATCH] Fix creation date metadata lost from input Closes #247 --- src/ocrmypdf/pdfa.py | 83 ++++++++++++++++++++++++++++++++++++++-- src/ocrmypdf/pipeline.py | 14 +++---- tests/test_metadata.py | 40 ++++++++++++++++++- 3 files changed, 125 insertions(+), 12 deletions(-) diff --git a/src/ocrmypdf/pdfa.py b/src/ocrmypdf/pdfa.py index 68e0b190..e4947bfa 100644 --- a/src/ocrmypdf/pdfa.py +++ b/src/ocrmypdf/pdfa.py @@ -19,6 +19,8 @@ from string import Template from binascii import hexlify +from datetime import datetime +from xml.parsers.expat import ExpatError import pkg_resources import PyPDF2 as pypdf @@ -98,7 +100,79 @@ def encode_text_string(s: str) -> str: return ascii_hex_str +def encode_pdf_date(d: datetime) -> str: + """Encode Python datetime object as PDF date string + + From Adobe pdfmark manual: + (D:YYYYMMDDHHmmSSOHH'mm') + D: is an optional prefix. YYYY is the year. All fields after the year are + optional. MM is the month (01-12), DD is the day (01-31), HH is the + hour (00-23), mm are the minutes (00-59), and SS are the seconds + (00-59). The remainder of the string defines the relation of local + time to GMT. O is either + for a positive difference (local time is + later than GMT) or - (minus) for a negative difference. HH' is the + absolute value of the offset from GMT in hours, and mm' is the + absolute value of the offset in minutes. If no GMT information is + specified, the relation between the specified time and GMT is + considered unknown. Regardless of whether or not GMT + information is specified, the remainder of the string should specify + the local time. 
+ """ + + pdfmark_date_fmt = r'%Y%m%d%H%M%S' + s = d.strftime(pdfmark_date_fmt) + + tz = d.strftime('%z') + if tz == 'Z': + s += "+00'00'" + elif tz != '': + sign, tz_hours, tz_mins = tz[0], tz[1:3], tz[3:5] + s += "{}{}'{}'".format(sign, tz_hours, tz_mins) + return s + + +def decode_pdf_date(s: str) -> datetime: + pdfmark_date_fmts = ( + r'%Y%m%d%H%M%S%z', # +0430 etc + r'%Y%m%d%H%M%S', # no time zone + r'%Y%m%d%H%M%SZ') # trailing Z + + if s.startswith('D:'): + s = s[2:] + for fmt in pdfmark_date_fmts: + try: + return datetime.strptime(s, fmt) + except ValueError: + continue + return None + + +def _get_pdfmark_dates(pdfmark): + """Encode dates for pdfmark Postscript. The best way to deal with a + missing date entry is set it to null, because if the key is omitted + Ghostscript will set it to now - we do not want to erase the fact that + the value was unknown. Setting to an empty string breaks Ghostscript + 9.22 as reported here: + https://bugs.ghostscript.com/show_bug.cgi?id=699182 + """ + + for key in ('/CreationDate', '/ModDate'): + if key not in pdfmark: + continue + if pdfmark[key].strip() == '': + yield ' {} null'.format(key) + continue + date_str = pdfmark[key] + if date_str.startswith('D:'): + date_str = date_str[2:] + yield ' {} (D:{})'.format(key, date_str) + + def _get_pdfa_def(icc_profile, icc_identifier, pdfmark): + """Create a Postscript file for Ghostscript. pdfmark contains the various + objects as strings; these must be encoded in ASCII, and dates have a + special format.""" + # Ghostscript <= 9.21 has a bug where null entries in DOCINFO might produce # ERROR: VMerror (-25) on closing pdfwrite device. 
# https://bugs.ghostscript.com/show_bug.cgi?id=697684 @@ -107,12 +181,12 @@ def _get_pdfa_def(icc_profile, icc_identifier, pdfmark): docinfo_line_template = ' {key} <{value}>' def docinfo_gen(): + yield from _get_pdfmark_dates(pdfmark) for key in docinfo_keys: if key in pdfmark and pdfmark[key].strip() != '': line = docinfo_line_template.format( key=key, value=encode_text_string(pdfmark[key])) yield line - docinfo = '\n'.join(docinfo_gen()) t = Template(pdfa_def_template) @@ -145,9 +219,12 @@ def file_claims_pdfa(filename): This checks if the XMP metadata contains a PDF/A marker. """ - pdf = pypdf.PdfFileReader(filename) - xmp = pdf.getXmpMetadata() + try: + xmp = pdf.getXmpMetadata() + except ExpatError: + return {'pass': False, 'output': 'pdf', + 'conformance': 'Invalid XML metadata'} try: pdfa_nodes = xmp.getNodesInNamespace( diff --git a/src/ocrmypdf/pipeline.py b/src/ocrmypdf/pipeline.py index a9bdb4d5..11459f21 100644 --- a/src/ocrmypdf/pipeline.py +++ b/src/ocrmypdf/pipeline.py @@ -18,6 +18,7 @@ from contextlib import suppress from shutil import copyfileobj from pathlib import Path +from datetime import datetime import sys import os import shutil @@ -31,7 +32,7 @@ from ruffus import formatter, regex, Pipeline, suffix from .hocrtransform import HocrTransform from .pdfinfo import PdfInfo, Encoding, Colorspace -from .pdfa import generate_pdfa_ps +from .pdfa import generate_pdfa_ps, encode_pdf_date from .helpers import re_symlink, is_iterable_notstr, page_number from .exec import ghostscript, tesseract, qpdf from .lib import fitz @@ -871,12 +872,8 @@ def get_pdfmark(base_pdf, options): except (KeyError, TypeError): return '' - pdfmark = { - '/Title': from_document_info('/Title'), - '/Author': from_document_info('/Author'), - '/Keywords': from_document_info('/Keywords'), - '/Subject': from_document_info('/Subject'), - } + pdfmark = {k: from_document_info(k) for k in + ('/Title', '/Author', '/Keywords', '/Subject', '/CreationDate')} if options.title: 
pdfmark['/Title'] = options.title if options.author: @@ -897,6 +894,7 @@ def get_pdfmark(base_pdf, options): PROGRAM_NAME, VERSION, renderer_tag, tesseract.version()) + pdfmark['/ModDate'] = encode_pdf_date(datetime.utcnow()) return pdfmark @@ -1030,7 +1028,7 @@ def merge_pages_mupdf( reader_metadata = pypdf.PdfFileReader(metadata_file) pdfmark = get_pdfmark(reader_metadata, options) pdfmark['/Producer'] = 'PyMuPDF ' + fitz.version[0] - pymupdf_metadata = {k[1:].lower() : v for k, v in pdfmark.items()} + pymupdf_metadata = {(k[1].lower() + k[2:]) : v for k, v in pdfmark.items()} for pdf_page in pdf_pages: page = fitz.open(pdf_page) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 64b8afb4..ebcb78d1 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -18,8 +18,9 @@ import pytest import PyPDF2 as pypdf +import datetime -from ocrmypdf.pdfa import file_claims_pdfa +from ocrmypdf.pdfa import file_claims_pdfa, encode_pdf_date, decode_pdf_date from ocrmypdf.exceptions import ExitCode from ocrmypdf.lib import fitz @@ -115,3 +116,40 @@ def test_bookmarks_preserved(spoof_tesseract_noop, output_type, ocr_option, print(before_toc) print(after_toc) assert before_toc == after_toc + + +def seconds_between_dates(date1, date2): + return (date2 - date1).total_seconds() + + +@pytest.mark.parametrize('infile', ['trivial.pdf', 'jbig2.pdf']) +@pytest.mark.parametrize('output_type', ['pdf', 'pdfa']) +def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources, + infile, outpdf): + input_file = resources / infile + + before = pypdf.PdfFileReader(str(input_file)).getDocumentInfo() + check_ocrmypdf( + input_file, outpdf, '--output-type', output_type, + env=spoof_tesseract_noop) + after = pypdf.PdfFileReader(str(outpdf)).getDocumentInfo() + + if not before: + # If there was no input creation date, none should be output + # because of Ghostscript quirks we set it to null + # This test would be better if we had a test file with /DocumentInfo but 
+ # no /CreationDate, which we don't + assert not after['/CreationDate'] or \ + isinstance(after['/CreationDate'], pypdf.generic.NullObject) + else: + # We expect that the creation date stayed the same + date_before = decode_pdf_date(before['/CreationDate']) + date_after = decode_pdf_date(after['/CreationDate']) + assert seconds_between_dates(date_before, date_after) < 1000 + + # We expect that the modified date is quite recent + date_after = decode_pdf_date(after['/ModDate']) + assert seconds_between_dates( + date_after, datetime.datetime.utcnow()) < 1000 + +