From 3957a0606c48ebe7ca760c3255f81c7afb2b63bb Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Fri, 26 Feb 2016 18:19:39 -0800
Subject: [PATCH] Compute image pixel density without performing rectangle
 intersection (+5 squashed commits) Squashed commits: [0e27904] Partially
 implement DPI calculation with rotation of the image

Fixes test suite
[a64f662] pageinfo: all tests pass
[c5b811a] Fix typos
[cdd2286] Can now find inline images more efficiently
[60dde8d] First cut at implementing intelligent DPI detection based on content stream

Broke many of the test cases
---
 ocrmypdf/pageinfo.py   | 154 +++++++++++++++++++++++++++++++++--------
 tests/test_pageinfo.py |   5 +-
 2 files changed, 130 insertions(+), 29 deletions(-)

diff --git a/ocrmypdf/pageinfo.py b/ocrmypdf/pageinfo.py
index 340d9f22..b9f93dc0 100644
--- a/ocrmypdf/pageinfo.py
+++ b/ocrmypdf/pageinfo.py
@@ -6,7 +6,9 @@ from decimal import Decimal, getcontext
 import re
 import sys
 import PyPDF2 as pypdf
+from collections import namedtuple
 
+matrix_mult = pypdf.pdf.utils.matrixMultiply
 
 FRIENDLY_COLORSPACE = {
     '/DeviceGray': 'gray',
@@ -38,32 +40,66 @@ FRIENDLY_COMP = {
 }
 
 
-def _page_has_inline_images(page):
-    # PDF always uses \r\n for separator regardless of platform
-    # Really basic heuristic that might trigger the odd false positive
-    # This is only finds the first image and is not quite spec compliant
-    try:
-        contents = page.getContents()
-        data = contents.getData()
-    except AttributeError:
-        # If we can't access the contents or data (empty page?) then there
-        # are no inline images
-        return False
+def _matrix_from_shorthand(shorthand):
+    """Convert from PDF matrix shorthand to full matrix
 
-    begin_image, image_data, end_image = False, False, False
-    for data in re.split(b'\s+', data):
-        if data == b'BI':
-            begin_image = True
-        elif data == b'ID':
-            image_data = True
-        elif data == b'EI':
-            end_image = True
-        if all((begin_image, image_data, end_image)):
-            return True
-    return False
+    PDF 1.7 spec defines a shorthand for describing the entries of a matrix
+    since the last column is always (0, 0, 1).
+    """
+
+    a, b, c, d, e, f = map(float, shorthand)
+    return ((a, b, 0),
+            (c, d, 0),
+            (e, f, 1))
 
 
-def _find_page_images(page, pageinfo):
+def _shorthand_from_matrix(matrix):
+    """Convert from transformation matrix to PDF shorthand."""
+    a, b = matrix[0][0], matrix[0][1]
+    c, d = matrix[1][0], matrix[1][1]
+    e, f = matrix[2][0], matrix[2][1]
+    return tuple(map(float, (a, b, c, d, e, f)))
+
+
+def euclidean_distance(rowvec1, rowvec2):
+    return ((rowvec1[0] - rowvec2[0]) ** 2
+        + (rowvec1[1] - rowvec2[1]) ** 2) ** 0.5
+
+
+ContentsInfo = namedtuple('ContentsInfo',
+    ['raster_settings', 'has_inline_images'])
+
+def _interpret_contents(contentstream):
+    operations = contentstream.operations
+    stack = []
+    ctm = _matrix_from_shorthand((1, 0, 0, 1, 0, 0))
+    image_raster_settings = []
+    has_inline_images = False
+
+    print(operations)
+    for op in operations:
+        operands, command = op
+        if command == b'q':
+            stack.append(ctm)
+        elif command == b'Q':
+            ctm = stack.pop()
+        elif command == b'cm':
+            ctm = matrix_mult(
+                ctm, _matrix_from_shorthand(operands))
+        elif command == b'Do':
+            image_name = operands[0]
+            image_raster_settings.append(
+                (image_name, _shorthand_from_matrix(ctm)))
+        elif command == b'INLINE IMAGE':
+            # {'settings': {'/BPC': 8, '/H': 8, '/CS': '/RGB', '/F': ['/A85', '/Fl'], '/W': 8}, 'data': b'...'},
+            has_inline_images = True
+
+    return ContentsInfo(
+        raster_settings=image_raster_settings,
+        has_inline_images=has_inline_images)
+
+
+def _find_page_images(page, pageinfo, contentsinfo):
     try:
         page['/Resources']['/XObject']
     except KeyError:
@@ -79,6 +115,7 @@ def _find_page_images(page, pageinfo):
             if pdfimage['/ImageMask']:
                 continue
         image = {}
+        image['name'] = str(xobj)
         image['width'] = pdfimage['/Width']
         image['height'] = pdfimage['/Height']
         image['bpc'] = pdfimage['/BitsPerComponent']
@@ -98,8 +135,62 @@ def _find_page_images(page, pageinfo):
             image['color'] = 'jpx' if image['enc'] == 'jpx' else '?'
 
         image['comp'] = FRIENDLY_COMP.get(image['color'], '?')
-        image['dpi_w'] = image['width'] / pageinfo['width_inches']
-        image['dpi_h'] = image['height'] / pageinfo['height_inches']
+        image['dpi_w'] = image['dpi_h'] = 0
+
+        for raster in contentsinfo.raster_settings:
+            # Loop in case the same image is displayed multiple times on a page
+            if raster[0] != image['name']:
+                continue
+            shorthand = raster[1]
+            matrix = _matrix_from_shorthand(shorthand)
+
+            # Corners of the image in untranslated square image space; last
+            # column is a dummy
+            corners = [[0, 0, 1],
+                       [1, 0, 1],
+                       [0, 1, 1],
+                       [1, 1, 1]]
+
+            # Rotate/translate/scale the corners into PDF coords (1/72")
+            # ordering of points may change, e.g. if rotation is 180 then
+            # the point (0, 0) may become the top right
+            # The row vectors can all be transformed together here by building
+            # a matrix of them
+            page_unit_corners = matrix_mult(corners, matrix)
+            print(matrix)
+            print(page_unit_corners)
+
+            # Calculate the width and height of the rotated image; the
+            # transformation is affine, so the corner that was originally
+            # (1, 1) can be ignored
+            image_drawn_width = euclidean_distance(
+                page_unit_corners[0], page_unit_corners[1])
+            image_drawn_height = euclidean_distance(
+                page_unit_corners[0], page_unit_corners[2])
+
+            print((image_drawn_width, image_drawn_height))
+
+            # The scale of the image is pixels per PDF unit (1/72")
+            scale_w = image['width'] / image_drawn_width
+            scale_h = image['height'] / image_drawn_height
+
+            # DPI = scale * 72
+            dpi_w = scale_w * 72.0
+            dpi_h = scale_h * 72.0
+
+            print((dpi_w, dpi_h))
+
+            # If the image is drawn skewed or rotated, analyzing its actual
+            # bounding box is a bit more of a headache. This is allowed, but
+            # rare.
+            if shorthand[1] != 0 or shorthand[2] != 0:
+                print('image was rotated')
+
+            # When image is used multiple times take the highest DPI it is
+            # rendered at
+            image['dpi_w'] = Decimal(max(dpi_w, image.get('dpi_w', 0)))
+            image['dpi_h'] = Decimal(max(dpi_h, image.get('dpi_h', 0)))
+
         image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** Decimal(0.5)
         yield image
 
@@ -141,10 +232,19 @@ def _pdf_get_pageinfo(infile, pageno: int):
     pageinfo['width_inches'] = width_pt / Decimal(72.0)
     pageinfo['height_inches'] = height_pt / Decimal(72.0)
 
-    pageinfo['images'] = [im for im in _find_page_images(page, pageinfo)]
+    try:
+        contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
+    except AttributeError as e:
+        return pageinfo
+
+    contentsinfo = _interpret_contents(contentstream)
+
+
+    pageinfo['images'] = [im for im in _find_page_images(
+                                page, pageinfo, contentsinfo)]
 
     # Look for inline images
-    if _page_has_inline_images(page):
+    if contentsinfo.has_inline_images:
         raise NotImplementedError(
             "Warning: input PDF contains inline images - not supported")
 
diff --git a/tests/test_pageinfo.py b/tests/test_pageinfo.py
index 8968c322..7615a268 100644
--- a/tests/test_pageinfo.py
+++ b/tests/test_pageinfo.py
@@ -103,8 +103,8 @@ def test_single_page_image():
     assert pdfimage['bpc'] == 8
 
     # DPI in a 1"x1" is the image width
-    assert pdfimage['dpi_w'] == 8
-    assert pdfimage['dpi_h'] == 8
+    assert abs(pdfimage['dpi_w'] - 8) < 1e-5
+    assert abs(pdfimage['dpi_h'] - 8) < 1e-5
 
 
 def test_single_page_inline_image():
@@ -131,4 +131,5 @@ def test_jpeg():
 
     pdfimage = pdfinfo[0]['images'][0]
     assert pdfimage['enc'] == 'jpeg'
+    assert (pdfimage['dpi_w'] - 150) < 1e-5