mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-19 12:04:44 -04:00
Compute image pixel density without performing rectangle intersection (+5 squashed commits)
Squashed commits: [0e27904] Partially implement DPI calculation with rotation of the image Fixes test suite [a64f662] pageinfo: all tests pass [c5b811a] Fix typos [cdd2286] Can now find inline images for efficiently [60dde8d] First cut at implementing intelligent DPI detection based on content stream Broke many of the test cases
This commit is contained in:
@@ -6,7 +6,9 @@ from decimal import Decimal, getcontext
|
||||
import re
|
||||
import sys
|
||||
import PyPDF2 as pypdf
|
||||
from collections import namedtuple
|
||||
|
||||
# Alias for PyPDF2's matrix multiplication helper; used below to compose
# PDF transformation matrices and to apply them to row vectors.
matrix_mult = pypdf.pdf.utils.matrixMultiply
|
||||
|
||||
FRIENDLY_COLORSPACE = {
|
||||
'/DeviceGray': 'gray',
|
||||
@@ -38,32 +40,66 @@ FRIENDLY_COMP = {
|
||||
}
|
||||
|
||||
|
||||
def _page_has_inline_images(page):
|
||||
# PDF always uses \r\n for separator regardless of platform
|
||||
# Really basic heuristic that might trigger the odd false positive
|
||||
# This is only finds the first image and is not quite spec compliant
|
||||
try:
|
||||
contents = page.getContents()
|
||||
data = contents.getData()
|
||||
except AttributeError:
|
||||
# If we can't access the contents or data (empty page?) then there
|
||||
# are no inline images
|
||||
return False
|
||||
def _matrix_from_shorthand(shorthand):
|
||||
"""Convert from PDF matrix shorthand to full matrix
|
||||
|
||||
begin_image, image_data, end_image = False, False, False
|
||||
for data in re.split(b'\s+', data):
|
||||
if data == b'BI':
|
||||
begin_image = True
|
||||
elif data == b'ID':
|
||||
image_data = True
|
||||
elif data == b'EI':
|
||||
end_image = True
|
||||
if all((begin_image, image_data, end_image)):
|
||||
return True
|
||||
return False
|
||||
PDF 1.7 spec defines a shorthand for describing the entries of a matrix
|
||||
since the last column is always (0, 0, 1).
|
||||
"""
|
||||
|
||||
a, b, c, d, e, f = map(float, shorthand)
|
||||
return ((a, b, 0),
|
||||
(c, d, 0),
|
||||
(e, f, 1))
|
||||
|
||||
|
||||
def _find_page_images(page, pageinfo):
|
||||
def _shorthand_from_matrix(matrix):
|
||||
"""Convert from transformation matrix to PDF shorthand."""
|
||||
a, b = matrix[0][0], matrix[0][1]
|
||||
c, d = matrix[1][0], matrix[1][1]
|
||||
e, f = matrix[2][0], matrix[2][1]
|
||||
return tuple(map(float, (a, b, c, d, e, f)))
|
||||
|
||||
|
||||
def euclidean_distance(rowvec1, rowvec2):
    """Return the straight-line distance between two points in the plane.

    Only the first two entries (x, y) of each row vector are used, so
    vectors carrying extra trailing entries can be passed unchanged.
    """
    delta_x = rowvec1[0] - rowvec2[0]
    delta_y = rowvec1[1] - rowvec2[1]
    squared_length = delta_x ** 2 + delta_y ** 2
    return squared_length ** 0.5
|
||||
|
||||
|
||||
# Summary of one page content stream:
#   raster_settings   - list of (XObject name, matrix shorthand) pairs
#   has_inline_images - whether an inline image operator was encountered
ContentsInfo = namedtuple(
    'ContentsInfo',
    ('raster_settings', 'has_inline_images'),
)
|
||||
|
||||
def _interpret_contents(contentstream):
    """Walk a content stream and record the matrix in effect at each ``Do``.

    The current transformation matrix (CTM) starts as the identity and is
    tracked through the ``q`` / ``Q`` (save / restore) and ``cm``
    (concatenate matrix) operators. Every ``Do`` operator is recorded
    together with the CTM in effect at that point, and the presence of an
    inline image operation is flagged.

    :param contentstream: object exposing ``operations``, an iterable of
        ``(operands, command)`` pairs where ``command`` is a bytes operator
    :return: ContentsInfo whose ``raster_settings`` is a list of
        ``(name, ctm_shorthand)`` tuples in drawing order and whose
        ``has_inline_images`` is True if an inline image was encountered
    """
    import warnings

    stack = []
    ctm = _matrix_from_shorthand((1, 0, 0, 1, 0, 0))  # identity matrix
    image_raster_settings = []
    has_inline_images = False

    for operands, command in contentstream.operations:
        if command == b'q':
            stack.append(ctm)
        elif command == b'Q':
            try:
                ctm = stack.pop()
            except IndexError:
                # Unbalanced Q in a malformed file: there is nothing to
                # restore, so keep the current matrix and carry on rather
                # than aborting the whole page analysis.
                warnings.warn(
                    "PDF graphics stack underflowed - PDF may be malformed")
        elif command == b'cm':
            # The operand matrix is premultiplied onto the CTM
            # (CTM' = M x CTM) because points are row vectors and the most
            # recently specified transformation is applied first.
            ctm = matrix_mult(_matrix_from_shorthand(operands), ctm)
        elif command == b'Do':
            image_name = operands[0]
            image_raster_settings.append(
                (image_name, _shorthand_from_matrix(ctm)))
        elif command == b'INLINE IMAGE':
            # operands look like {'settings': {'/BPC': 8, '/H': 8, ...},
            # 'data': b'...'}; only the presence of the image is recorded
            has_inline_images = True

    return ContentsInfo(
        raster_settings=image_raster_settings,
        has_inline_images=has_inline_images)
|
||||
|
||||
|
||||
def _find_page_images(page, pageinfo, contentsinfo):
|
||||
try:
|
||||
page['/Resources']['/XObject']
|
||||
except KeyError:
|
||||
@@ -79,6 +115,7 @@ def _find_page_images(page, pageinfo):
|
||||
if pdfimage['/ImageMask']:
|
||||
continue
|
||||
image = {}
|
||||
image['name'] = str(xobj)
|
||||
image['width'] = pdfimage['/Width']
|
||||
image['height'] = pdfimage['/Height']
|
||||
image['bpc'] = pdfimage['/BitsPerComponent']
|
||||
@@ -98,8 +135,62 @@ def _find_page_images(page, pageinfo):
|
||||
image['color'] = 'jpx' if image['enc'] == 'jpx' else '?'
|
||||
|
||||
image['comp'] = FRIENDLY_COMP.get(image['color'], '?')
|
||||
image['dpi_w'] = image['width'] / pageinfo['width_inches']
|
||||
image['dpi_h'] = image['height'] / pageinfo['height_inches']
|
||||
image['dpi_w'] = image['dpi_h'] = 0
|
||||
|
||||
for raster in contentsinfo.raster_settings:
|
||||
# Loop in case the same image is displayed multiple times on a page
|
||||
if raster[0] != image['name']:
|
||||
continue
|
||||
shorthand = raster[1]
|
||||
matrix = _matrix_from_shorthand(shorthand)
|
||||
|
||||
# Corners of the image in untranslated square image space; last
|
||||
# column is a dummy
|
||||
corners = [[0, 0, 1],
|
||||
[1, 0, 1],
|
||||
[0, 1, 1],
|
||||
[1, 1, 1]]
|
||||
|
||||
# Rotate/translate/scale the corners into PDF coords (1/72")
|
||||
# ordering of points may change, e.g. if rotation is 180 then
|
||||
# the point (0, 0) may become the top right
|
||||
# The row vectors can all be transformed together here by building
|
||||
# a matrix of them
|
||||
page_unit_corners = matrix_mult(corners, matrix)
|
||||
print(matrix)
|
||||
print(page_unit_corners)
|
||||
|
||||
# Calculate the width and height of the drawn image. The edges leaving the
|
||||
# transformed origin corner give both, so the corner that was originally
|
||||
# (1, 1) can be ignored
|
||||
image_drawn_width = euclidean_distance(
|
||||
page_unit_corners[0], page_unit_corners[1])
|
||||
image_drawn_height = euclidean_distance(
|
||||
page_unit_corners[0], page_unit_corners[2])
|
||||
|
||||
print((image_drawn_width, image_drawn_height))
|
||||
|
||||
# The scale of the image is pixels per PDF unit (1/72")
|
||||
scale_w = image['width'] / image_drawn_width
|
||||
scale_h = image['height'] / image_drawn_height
|
||||
|
||||
# DPI = scale * 72
|
||||
dpi_w = scale_w * 72.0
|
||||
dpi_h = scale_h * 72.0
|
||||
|
||||
print((dpi_w, dpi_h))
|
||||
|
||||
# If the image is drawn skewed or rotated analyzing its actual
|
||||
# bounding box is a bit more of a headache. This is allowed, but
|
||||
# rare.
|
||||
if shorthand[1] != 0 or shorthand[2] != 0:
|
||||
print('image was rotated')
|
||||
|
||||
# When image is used multiple times take the highest DPI it is
|
||||
# rendered at
|
||||
image['dpi_w'] = Decimal(max(dpi_w, image.get('dpi_w', 0)))
|
||||
image['dpi_h'] = Decimal(max(dpi_h, image.get('dpi_h', 0)))
|
||||
|
||||
image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** Decimal(0.5)
|
||||
yield image
|
||||
|
||||
@@ -141,10 +232,19 @@ def _pdf_get_pageinfo(infile, pageno: int):
|
||||
pageinfo['width_inches'] = width_pt / Decimal(72.0)
|
||||
pageinfo['height_inches'] = height_pt / Decimal(72.0)
|
||||
|
||||
pageinfo['images'] = [im for im in _find_page_images(page, pageinfo)]
|
||||
try:
|
||||
contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
|
||||
except AttributeError as e:
|
||||
return pageinfo
|
||||
|
||||
contentsinfo = _interpret_contents(contentstream)
|
||||
|
||||
|
||||
pageinfo['images'] = [im for im in _find_page_images(
|
||||
page, pageinfo, contentsinfo)]
|
||||
|
||||
# Look for inline images
|
||||
if _page_has_inline_images(page):
|
||||
if contentsinfo.has_inline_images:
|
||||
raise NotImplementedError(
|
||||
"Warning: input PDF contains inline images - not supported")
|
||||
|
||||
|
||||
@@ -103,8 +103,8 @@ def test_single_page_image():
|
||||
assert pdfimage['bpc'] == 8
|
||||
|
||||
# DPI in a 1"x1" is the image width
|
||||
assert pdfimage['dpi_w'] == 8
|
||||
assert pdfimage['dpi_h'] == 8
|
||||
assert abs(pdfimage['dpi_w'] - 8) < 1e-5
|
||||
assert abs(pdfimage['dpi_h'] - 8) < 1e-5
|
||||
|
||||
|
||||
def test_single_page_inline_image():
|
||||
@@ -131,4 +131,5 @@ def test_jpeg():
|
||||
|
||||
pdfimage = pdfinfo[0]['images'][0]
|
||||
assert pdfimage['enc'] == 'jpeg'
|
||||
assert (pdfimage['dpi_w'] - 150) < 1e-5
|
||||
|
||||
|
||||
Reference in New Issue
Block a user