From 2482296e2bc9065895350a1622c30c4c8827e0a8 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Sat, 14 Apr 2018 17:24:21 -0700
Subject: [PATCH 1/9] hocr: avoid division by zero

Issue #253 - PDF that produces the error is not available, but if font_width
is zero, chances are the text is nonprinting characters, so suppress it.
---
 src/ocrmypdf/hocrtransform.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/ocrmypdf/hocrtransform.py b/src/ocrmypdf/hocrtransform.py
index 4c8bbe90..edd674ff 100755
--- a/src/ocrmypdf/hocrtransform.py
+++ b/src/ocrmypdf/hocrtransform.py
@@ -322,8 +322,11 @@ class HocrTransform():
             dy = baseline_y2 - cursor[1]            
             text.moveCursor(dx, dy)
 
-            text.setHorizScale(100 * box_width / font_width)
-            text.textOut(elemtxt)
+            # If reportlab tells us this word is 0 units wide, our best option
+            # seems to be to suppress this text
+            if font_width > 0:
+                text.setHorizScale(100 * box_width / font_width)
+                text.textOut(elemtxt)
         pdf.drawText(text)
 
 

From 9d28879505f58eaab8c4471eb2d72ac03d889f09 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Sat, 14 Apr 2018 17:30:33 -0700
Subject: [PATCH 2/9] Update Ubuntu 14.04 instructions

Closes #252
---
 docs/installation.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/installation.rst b/docs/installation.rst
index 58da52e3..d858d926 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -287,7 +287,7 @@ If you prefer to not modify your system in this matter, consider using a Docker
     sudo apt-get update
 
     sudo apt-get install \
-        python3.6 \
+        python3.6-dev \
         ghostscript \
         tesseract-ocr \
         tesseract-ocr-eng \
@@ -298,7 +298,7 @@ Now we need to install ``pip`` and let it install ocrmypdf:
 
 .. code-block:: bash
 
-    wget -O - -o /dev/null https://bootstrap.pypa.io/get-pip.py | python3.6
+    curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.6 && python3.6 -m easy_install pip
     pip3.6 install ocrmypdf[fitz]
 
 The ``wget`` command will download a program and run it.

From 34c78a892ae8cb93f5a2ac8d10e442e4abb4a5a2 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Sun, 15 Apr 2018 23:52:19 -0700
Subject: [PATCH 3/9] Fix list table for tests/resources

[ci skip]
---
 tests/resources/README.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/resources/README.rst b/tests/resources/README.rst
index 55280246..3827c328 100644
--- a/tests/resources/README.rst
+++ b/tests/resources/README.rst
@@ -110,6 +110,7 @@ licensed under the specified license.
         - Wikipedia authors
         - CC-BY-SA 3.0
     *   - missing_docinfo.pdf
+        - @jbarlow83
         - @jbarlow83
         - PDF file with no /DocumentInfo section 
         - CC-BY-SA 4.0

From 7368399f8bea48aaf8bb4be3de46af78ca03de3b Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Mon, 16 Apr 2018 09:56:37 -0700
Subject: [PATCH 4/9] Clarify license of two test files -
 https://github.com/jbarlow83/OCRmyPDF/issues/254

---
 tests/resources/README.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/resources/README.rst b/tests/resources/README.rst
index 3827c328..2b6e5683 100644
--- a/tests/resources/README.rst
+++ b/tests/resources/README.rst
@@ -118,7 +118,7 @@ licensed under the specified license.
         - PDF file generated by PDFPen pro that triggered content stream parse errors
         - @maxandersen
         - @maxandersen
-        - MIT
+        - CC-BY-SA 4.0
     *   - negzero.pdf
         - copy of formxobject.pdf with token that qpdf doesn't like
         - @jbarlow83
@@ -138,7 +138,7 @@ licensed under the specified license.
         - a PDF with vector art and text rendered as curves with no fonts
         - @Catscratch
         - @Catscratch
-        - MIT
+        - CC-BY-SA 4.0
 
 
 Assemblies

From a620724d6ae5867f347f0b2acd0d36063401f2c6 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Tue, 17 Apr 2018 14:55:32 -0700
Subject: [PATCH 5/9] Fix PDF/A validation failure due to timezone being
 omitted from /ModDate

---
 src/ocrmypdf/pdfa.py     | 4 +++-
 src/ocrmypdf/pipeline.py | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/ocrmypdf/pdfa.py b/src/ocrmypdf/pdfa.py
index ffd0ff77..67a02d87 100644
--- a/src/ocrmypdf/pdfa.py
+++ b/src/ocrmypdf/pdfa.py
@@ -129,7 +129,9 @@ def encode_pdf_date(d: datetime) -> str:
         s += "+00'00'"
     elif tz != '':
         sign, tz_hours, tz_mins = tz[0], tz[1:3], tz[3:5]
-        s += "{}{}'{tz}'".format(sign, tz_hours, tz_mins)
+        s += "{}{}'{}'".format(sign, tz_hours, tz_mins)
+    else:
+        raise ValueError("Naive timezone not supported")
     return s
 
 
diff --git a/src/ocrmypdf/pipeline.py b/src/ocrmypdf/pipeline.py
index cfa94e98..c11c06bd 100644
--- a/src/ocrmypdf/pipeline.py
+++ b/src/ocrmypdf/pipeline.py
@@ -18,7 +18,7 @@
 from contextlib import suppress
 from shutil import copyfileobj
 from pathlib import Path
-from datetime import datetime
+from datetime import datetime, timezone
 import sys
 import os
 import shutil
@@ -900,7 +900,7 @@ def get_pdfmark(base_pdf, options):
         PROGRAM_NAME, VERSION,
         renderer_tag,
         tesseract.version())
-    pdfmark['/ModDate'] = encode_pdf_date(datetime.utcnow())
+    pdfmark['/ModDate'] = encode_pdf_date(datetime.now(timezone.utc))
     return pdfmark
 
 

From 5fde2142904a8bd4f9d169100da4fd03717fcabf Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Tue, 17 Apr 2018 15:23:35 -0700
Subject: [PATCH 6/9] Update notes for v6.1.5

---
 docs/release_notes.rst | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/docs/release_notes.rst b/docs/release_notes.rst
index f9ab4676..5bc73c3a 100644
--- a/docs/release_notes.rst
+++ b/docs/release_notes.rst
@@ -9,10 +9,18 @@ The OCRmyPDF package itself does not contain a public API, although it is fairly
    find:    [^`]\#([0-9]{1,3})[^0-9]  
    replace: `#$1 <https://github.com/jbarlow83/OCRmyPDF/issues/$1>`_
 
+v6.1.5
+------
+
+-   Fix issue `#253 <https://github.com/jbarlow83/OCRmyPDF/issues/253>`_, a possible division by zero when using the ``hocr`` renderer.
+
+-   Fix incorrectly formatted ``<xmp:ModifyDate>`` field inside XMP metadata for PDF/As.  veraPDF flags this as a PDF/A validation failure. The error caused the timezone and final digit of the seconds of the modification time to be omitted, so at worst the modification time stamp is rounded to the nearest 10 seconds.
+
+
 v6.1.4
 ------
 
--   Fix issue #248, ``--clean`` argument may remove OCR from left column of text on certain documents. We now set ``--layout none`` to suppress this.  
+-   Fix issue `#248 <https://github.com/jbarlow83/OCRmyPDF/issues/248>`_, ``--clean`` argument may remove OCR from left column of text on certain documents. We now set ``--layout none`` to suppress this.
 
 -   The test cache was updated to reflect the change above.
 

From 076363d78eace78d2f5864979a72f799edd22d21 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Tue, 17 Apr 2018 13:54:34 -0700
Subject: [PATCH 7/9] Disable JPEG passthrough for Ghostscript 9.23

Seems to corrupt JPEGs involved in image masks?
---
 src/ocrmypdf/exec/ghostscript.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/ocrmypdf/exec/ghostscript.py b/src/ocrmypdf/exec/ghostscript.py
index 7292034f..e87f7193 100644
--- a/src/ocrmypdf/exec/ghostscript.py
+++ b/src/ocrmypdf/exec/ghostscript.py
@@ -132,6 +132,11 @@ def generate_pdfa(pdf_pages, output_file, compression, log,
     # git commit fe1c025d.
     strategy = 'RGB' if version() >= '9.19' else '/RGB'
 
+    if version() == '9.23':
+        # 9.23: JPEG passthrough broken for image masks?
+        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
+        compression_args.append('-dPassThroughJPEGImages=false')
+
     with NamedTemporaryFile(delete=True) as gs_pdf:
         args_gs = [
             "gs",

From 1a516b2af9bd996729482bece6fcf1e5a3f091c5 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Tue, 17 Apr 2018 16:59:21 -0700
Subject: [PATCH 8/9] Fix regression: time stamp test suite failures

---
 src/ocrmypdf/pdfa.py   | 1 +
 tests/test_metadata.py | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/ocrmypdf/pdfa.py b/src/ocrmypdf/pdfa.py
index 67a02d87..405a4d4a 100644
--- a/src/ocrmypdf/pdfa.py
+++ b/src/ocrmypdf/pdfa.py
@@ -143,6 +143,7 @@ def decode_pdf_date(s: str) -> datetime:
 
     if s.startswith('D:'):
         s = s[2:]
+    s = s.replace("'", "")  # Remove apos from PDF time strings
     for fmt in pdfmark_date_fmts:
         try:
             return datetime.strptime(s, fmt)
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index ebcb78d1..4c6b4f94 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -19,6 +19,7 @@
 import pytest
 import PyPDF2 as pypdf
 import datetime
+from datetime import timezone
 
 from ocrmypdf.pdfa import file_claims_pdfa, encode_pdf_date, decode_pdf_date
 from ocrmypdf.exceptions import ExitCode
@@ -150,6 +151,6 @@ def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources,
     # We expect that the modified date is quite recent
     date_after = decode_pdf_date(after['/ModDate'])
     assert seconds_between_dates(
-        date_after, datetime.datetime.utcnow()) < 1000
+        date_after, datetime.datetime.now(timezone.utc)) < 1000
 
 

From 0b10db91beceb5056d28b756064a5a82c3cd3502 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Tue, 17 Apr 2018 17:00:24 -0700
Subject: [PATCH 9/9] Fix regression: Disable Ghostscript JPEG passthrough
 entirely

---
 src/ocrmypdf/exec/ghostscript.py | 15 ++++++++++++++-
 tests/test_main.py               |  6 +-----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/ocrmypdf/exec/ghostscript.py b/src/ocrmypdf/exec/ghostscript.py
index e87f7193..b314c65f 100644
--- a/src/ocrmypdf/exec/ghostscript.py
+++ b/src/ocrmypdf/exec/ghostscript.py
@@ -32,6 +32,18 @@ def version():
     return get_version('gs')
 
 
+def jpeg_passthrough_available():
+    """
+    Ghostscript 9.23 introduced JPEG passthrough, but it seems to corrupt the
+    last two bytes of certain images. For now we disable it for 9.23 and
+    do not pass the option at all for versions < 9.23.
+
+    https://bugs.ghostscript.com/show_bug.cgi?id=699216
+
+    """
+    return False
+
+
 def _gs_error_reported(stream):
     return re.search(r'error', stream, flags=re.IGNORECASE)
 
@@ -133,7 +145,8 @@ def generate_pdfa(pdf_pages, output_file, compression, log,
     strategy = 'RGB' if version() >= '9.19' else '/RGB'
 
     if version() == '9.23':
-        # 9.23: JPEG passthrough broken for image masks?
+        # 9.23: new feature JPEG passthrough is broken in some cases, best to
+        # disable it always
         # https://bugs.ghostscript.com/show_bug.cgi?id=699216
         compression_args.append('-dPassThroughJPEGImages=false')
 
diff --git a/tests/test_main.py b/tests/test_main.py
index 3d8b03d6..94ecb466 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -610,10 +610,6 @@ def test_masks(spoof_tesseract_noop, resources, outpdf):
     p, out, err = run_ocrmypdf(
         resources / 'masks.pdf', outpdf, env=spoof_tesseract_noop)
 
-    if ghostscript.version() == '9.23' and \
-            p.returncode == ExitCode.invalid_output_pdf:
-        pytest.xfail('https://bugs.ghostscript.com/show_bug.cgi?id=699216')
-
     assert p.returncode == ExitCode.ok
 
 
@@ -906,7 +902,7 @@ def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec,
     if compression == "jpeg":
         assert pdfimage.enc == Encoding.jpeg
     else:
-        if ghostscript.version() >= '9.23':
+        if ghostscript.jpeg_passthrough_available():
             # Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be
             # copied without transcoding - so report
             if image.endswith('jpg'):