Compute image pixel density without performing rectangle intersection (+5 squashed commits)

Squashed commits:
[0e27904] Partially implement DPI calculation with rotation of the image

Fixes test suite
[a64f662] pageinfo: all tests pass
[c5b811a] Fix typos
[cdd2286] Can now find inline images more efficiently
[60dde8d] First cut at implementing intelligent DPI detection based on content stream

Broke many of the test cases
This commit is contained in:
James R. Barlow
2016-02-26 18:19:39 -08:00
parent 11a561dbce
commit 3957a0606c
2 changed files with 130 additions and 29 deletions

View File

@@ -6,7 +6,9 @@ from decimal import Decimal, getcontext
import re
import sys
import PyPDF2 as pypdf
from collections import namedtuple
matrix_mult = pypdf.pdf.utils.matrixMultiply
FRIENDLY_COLORSPACE = {
'/DeviceGray': 'gray',
@@ -38,32 +40,66 @@ FRIENDLY_COMP = {
}
def _page_has_inline_images(page):
    """Heuristically report whether a page contains an inline image.

    The page's content data is split on runs of whitespace and scanned for
    the ``BI``, ``ID`` and ``EI`` tokens.  The function returns True as soon
    as all three have been seen (in any order) and False otherwise.

    If the page has no accessible contents or data (an AttributeError is
    raised while fetching them, e.g. for an empty page) the page is treated
    as having no inline images.
    """
    # Really basic heuristic that might trigger the odd false positive.
    # It only detects the first image and is not quite spec compliant.
    try:
        contents = page.getContents()
        data = contents.getData()
    except AttributeError:
        # If we can't access the contents or data (empty page?) then there
        # are no inline images
        return False

    markers = {b'BI', b'ID', b'EI'}
    seen = set()
    # Raw bytes pattern: b'\s+' relies on an invalid escape sequence.
    for token in re.split(rb'\s+', data):
        if token in markers:
            seen.add(token)
            if seen == markers:
                return True
    return False


def _matrix_from_shorthand(shorthand):
    """Convert from PDF matrix shorthand to full matrix

    PDF 1.7 spec defines a shorthand for describing the entries of a matrix
    since the last column is always (0, 0, 1).

    ``shorthand`` must yield exactly six values convertible to float; the
    result is a 3x3 tuple of row tuples.
    """
    a, b, c, d, e, f = map(float, shorthand)
    return ((a, b, 0),
            (c, d, 0),
            (e, f, 1))
def _find_page_images(page, pageinfo):
def _shorthand_from_matrix(matrix):
    """Flatten a transformation matrix into the six-number PDF shorthand.

    Takes the first two entries of each of the first three rows, in row
    order, and returns them as a tuple of floats ``(a, b, c, d, e, f)``.
    The third column of the matrix is not read.
    """
    rows = (matrix[0], matrix[1], matrix[2])
    return tuple(float(row[col]) for row in rows for col in (0, 1))
def euclidean_distance(rowvec1, rowvec2):
    """Return the straight-line distance between two points.

    Only the first two components (index 0 and 1) of each row vector are
    used; any further components are ignored.
    """
    delta_x = rowvec1[0] - rowvec2[0]
    delta_y = rowvec1[1] - rowvec2[1]
    return (delta_x ** 2 + delta_y ** 2) ** 0.5
# Result record for a scan of a page content stream:
#   raster_settings   - as populated in this file, a list of
#                       (image name, matrix shorthand) pairs
#   has_inline_images - whether an inline image operation was encountered
ContentsInfo = namedtuple(
    'ContentsInfo', 'raster_settings has_inline_images')
def _interpret_contents(contentstream):
    """Walk a content stream and record how each XObject is drawn.

    The current transformation matrix (CTM) is tracked through the ``q``
    (save), ``Q`` (restore) and ``cm`` (concatenate) operators.  Each time a
    ``Do`` operator is met, the name being drawn and the CTM in effect at
    that moment (as six-number shorthand) are recorded.

    ``contentstream`` must expose ``operations``, an iterable of
    ``(operands, command)`` pairs with ``command`` given as bytes
    (presumably a PyPDF2 ContentStream -- confirm against the caller).

    Returns a ContentsInfo whose ``raster_settings`` is a list of
    ``(name, shorthand)`` tuples, one per ``Do``, and whose
    ``has_inline_images`` is True if an ``INLINE IMAGE`` operation was seen.
    """
    operations = contentstream.operations
    stack = []
    ctm = _matrix_from_shorthand((1, 0, 0, 1, 0, 0))
    image_raster_settings = []
    has_inline_images = False

    for operands, command in operations:
        if command == b'q':
            stack.append(ctm)
        elif command == b'Q':
            # An unbalanced Q (nothing saved) is ignored rather than
            # aborting the whole page with an IndexError.
            if stack:
                ctm = stack.pop()
        elif command == b'cm':
            # Points are row vectors (p' = p x M), so the newly specified
            # matrix must be applied first: CTM' = M x CTM.
            ctm = matrix_mult(
                _matrix_from_shorthand(operands), ctm)
        elif command == b'Do':
            image_name = operands[0]
            image_raster_settings.append(
                (image_name, _shorthand_from_matrix(ctm)))
        elif command == b'INLINE IMAGE':
            # Example operands:
            # {'settings': {'/BPC': 8, '/H': 8, '/CS': '/RGB',
            #               '/F': ['/A85', '/Fl'], '/W': 8}, 'data': b'...'}
            has_inline_images = True

    return ContentsInfo(
        raster_settings=image_raster_settings,
        has_inline_images=has_inline_images)
def _find_page_images(page, pageinfo, contentsinfo):
try:
page['/Resources']['/XObject']
except KeyError:
@@ -79,6 +115,7 @@ def _find_page_images(page, pageinfo):
if pdfimage['/ImageMask']:
continue
image = {}
image['name'] = str(xobj)
image['width'] = pdfimage['/Width']
image['height'] = pdfimage['/Height']
image['bpc'] = pdfimage['/BitsPerComponent']
@@ -98,8 +135,62 @@ def _find_page_images(page, pageinfo):
image['color'] = 'jpx' if image['enc'] == 'jpx' else '?'
image['comp'] = FRIENDLY_COMP.get(image['color'], '?')
image['dpi_w'] = image['width'] / pageinfo['width_inches']
image['dpi_h'] = image['height'] / pageinfo['height_inches']
image['dpi_w'] = image['dpi_h'] = 0
for raster in contentsinfo.raster_settings:
# Loop in case the same image is displayed multiple times on a page
if raster[0] != image['name']:
continue
shorthand = raster[1]
matrix = _matrix_from_shorthand(shorthand)
# Corners of the image in untranslated square image space; last
# column is a dummy
corners = [[0, 0, 1],
[1, 0, 1],
[0, 1, 1],
[1, 1, 1]]
# Rotate/translate/scale the corners into PDF coords (1/72")
# ordering of points may change, e.g. if rotation is 180 then
# the point (0, 0) may become the top right
# The row vectors can all be transformed together here by building
# a matrix of them
page_unit_corners = matrix_mult(corners, matrix)
print(matrix)
print(page_unit_corners)
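# Calculate the drawn width and height of the (possibly rotated) image.
# The transformation is affine, so the two edges leaving the corner
# that was originally (0, 0) determine both; the corner that was
# originally (1, 1) can be ignored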
image_drawn_width = euclidean_distance(
page_unit_corners[0], page_unit_corners[1])
image_drawn_height = euclidean_distance(
page_unit_corners[0], page_unit_corners[2])
print((image_drawn_width, image_drawn_height))
# The scale of the image is pixels per PDF unit (1/72")
scale_w = image['width'] / image_drawn_width
scale_h = image['height'] / image_drawn_height
# DPI = scale * 72
dpi_w = scale_w * 72.0
dpi_h = scale_h * 72.0
print((dpi_w, dpi_h))
# If the image is drawn skewed or rotated, analyzing its actual
# bounding box is a bit more of a headache. This is allowed, but
# rare.
if shorthand[1] != 0 or shorthand[2] != 0:
print('image was rotated')
# When image is used multiple times take the highest DPI it is
# rendered at
image['dpi_w'] = Decimal(max(dpi_w, image.get('dpi_w', 0)))
image['dpi_h'] = Decimal(max(dpi_h, image.get('dpi_h', 0)))
image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** Decimal(0.5)
yield image
@@ -141,10 +232,19 @@ def _pdf_get_pageinfo(infile, pageno: int):
pageinfo['width_inches'] = width_pt / Decimal(72.0)
pageinfo['height_inches'] = height_pt / Decimal(72.0)
pageinfo['images'] = [im for im in _find_page_images(page, pageinfo)]
try:
contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
except AttributeError as e:
return pageinfo
contentsinfo = _interpret_contents(contentstream)
pageinfo['images'] = [im for im in _find_page_images(
page, pageinfo, contentsinfo)]
# Look for inline images
if _page_has_inline_images(page):
if contentsinfo.has_inline_images:
raise NotImplementedError(
"Warning: input PDF contains inline images - not supported")

View File

@@ -103,8 +103,8 @@ def test_single_page_image():
assert pdfimage['bpc'] == 8
# DPI in a 1"x1" is the image width
assert pdfimage['dpi_w'] == 8
assert pdfimage['dpi_h'] == 8
assert abs(pdfimage['dpi_w'] - 8) < 1e-5
assert abs(pdfimage['dpi_h'] - 8) < 1e-5
def test_single_page_inline_image():
@@ -131,4 +131,5 @@ def test_jpeg():
pdfimage = pdfinfo[0]['images'][0]
assert pdfimage['enc'] == 'jpeg'
assert (pdfimage['dpi_w'] - 150) < 1e-5