From 3957a0606c48ebe7ca760c3255f81c7afb2b63bb Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Fri, 26 Feb 2016 18:19:39 -0800 Subject: [PATCH] Compute image pixel density without performing rectangle intersection (+5 squashed commits) Squashed commits: [0e27904] Partially implement DPI calculation with rotation of the image Fixes test suite [a64f662] pageinfo: all tests pass [c5b811a] Fix typos [cdd2286] Can now find inline images more efficiently [60dde8d] First cut at implementing intelligent DPI detection based on content stream Broke many of the test cases --- ocrmypdf/pageinfo.py | 154 +++++++++++++++++++++++++++++++++-------- tests/test_pageinfo.py | 5 +- 2 files changed, 130 insertions(+), 29 deletions(-) diff --git a/ocrmypdf/pageinfo.py b/ocrmypdf/pageinfo.py index 340d9f22..b9f93dc0 100644 --- a/ocrmypdf/pageinfo.py +++ b/ocrmypdf/pageinfo.py @@ -6,7 +6,9 @@ from decimal import Decimal, getcontext import re import sys import PyPDF2 as pypdf +from collections import namedtuple +matrix_mult = pypdf.pdf.utils.matrixMultiply FRIENDLY_COLORSPACE = { '/DeviceGray': 'gray', @@ -38,32 +40,66 @@ FRIENDLY_COMP = { } -def _page_has_inline_images(page): - # PDF always uses \r\n for separator regardless of platform - # Really basic heuristic that might trigger the odd false positive - # This is only finds the first image and is not quite spec compliant - try: - contents = page.getContents() - data = contents.getData() - except AttributeError: - # If we can't access the contents or data (empty page?) 
then there - # are no inline images - return False +def _matrix_from_shorthand(shorthand): + """Convert from PDF matrix shorthand to full matrix - begin_image, image_data, end_image = False, False, False - for data in re.split(b'\s+', data): - if data == b'BI': - begin_image = True - elif data == b'ID': - image_data = True - elif data == b'EI': - end_image = True - if all((begin_image, image_data, end_image)): - return True - return False + PDF 1.7 spec defines a shorthand for describing the entries of a matrix + since the last column is always (0, 0, 1). + """ + + a, b, c, d, e, f = map(float, shorthand) + return ((a, b, 0), + (c, d, 0), + (e, f, 1)) -def _find_page_images(page, pageinfo): +def _shorthand_from_matrix(matrix): + """Convert from transformation matrix to PDF shorthand.""" + a, b = matrix[0][0], matrix[0][1] + c, d = matrix[1][0], matrix[1][1] + e, f = matrix[2][0], matrix[2][1] + return tuple(map(float, (a, b, c, d, e, f))) + + +def euclidean_distance(rowvec1, rowvec2): + return ((rowvec1[0] - rowvec2[0]) ** 2 + + (rowvec1[1] - rowvec2[1]) ** 2) ** 0.5 + + +ContentsInfo = namedtuple('ContentsInfo', + ['raster_settings', 'has_inline_images']) + +def _interpret_contents(contentstream): + operations = contentstream.operations + stack = [] + ctm = _matrix_from_shorthand((1, 0, 0, 1, 0, 0)) + image_raster_settings = [] + has_inline_images = False + + print(operations) + for op in operations: + operands, command = op + if command == b'q': + stack.append(ctm) + elif command == b'Q': + ctm = stack.pop() + elif command == b'cm': + ctm = matrix_mult( + ctm, _matrix_from_shorthand(operands)) + elif command == b'Do': + image_name = operands[0] + image_raster_settings.append( + (image_name, _shorthand_from_matrix(ctm))) + elif command == b'INLINE IMAGE': + # {'settings': {'/BPC': 8, '/H': 8, '/CS': '/RGB', '/F': ['/A85', '/Fl'], '/W': 8}, 'data': b'...'}, + has_inline_images = True + + return ContentsInfo( + raster_settings=image_raster_settings, + 
has_inline_images=has_inline_images) + + +def _find_page_images(page, pageinfo, contentsinfo): try: page['/Resources']['/XObject'] except KeyError: @@ -79,6 +115,7 @@ def _find_page_images(page, pageinfo): if pdfimage['/ImageMask']: continue image = {} + image['name'] = str(xobj) image['width'] = pdfimage['/Width'] image['height'] = pdfimage['/Height'] image['bpc'] = pdfimage['/BitsPerComponent'] @@ -98,8 +135,62 @@ def _find_page_images(page, pageinfo): image['color'] = 'jpx' if image['enc'] == 'jpx' else '?' image['comp'] = FRIENDLY_COMP.get(image['color'], '?') - image['dpi_w'] = image['width'] / pageinfo['width_inches'] - image['dpi_h'] = image['height'] / pageinfo['height_inches'] + image['dpi_w'] = image['dpi_h'] = 0 + + for raster in contentsinfo.raster_settings: + # Loop in case the same image is display multiple times on a page + if raster[0] != image['name']: + continue + shorthand = raster[1] + matrix = _matrix_from_shorthand(shorthand) + + # Corners of the image in untranslated square image space; last + # column is a dummy + corners = [[0, 0, 1], + [1, 0, 1], + [0, 1, 1], + [1, 1, 1]] + + # Rotate/translate/scale the corners into PDF coords (1/72") + # ordering of points may change, e.g. 
if rotation is 180 then + # the point (0, 0) may become the top right + # The row vectors can all be transformed together here by building + # a matrix of them + page_unit_corners = matrix_mult(corners, matrix) + print(matrix) + print(page_unit_corners) + + # Calculate the width and height of the rotated image + # the transformation matrix so the corner that was originally + # (1, 1) can be ignored + image_drawn_width = euclidean_distance( + page_unit_corners[0], page_unit_corners[1]) + image_drawn_height = euclidean_distance( + page_unit_corners[0], page_unit_corners[2]) + + print((image_drawn_width, image_drawn_height)) + + # The scale of the image is pixels per PDF unit (1/72") + scale_w = image['width'] / image_drawn_width + scale_h = image['height'] / image_drawn_height + + # DPI = scale * 72 + dpi_w = scale_w * 72.0 + dpi_h = scale_h * 72.0 + + print((dpi_w, dpi_h)) + + # If the image is drawn skewed or rotated analyzing its actual + # bounding box is a bit more of headache. This is allowed, but + # rare. 
+ if shorthand[1] != 0 or shorthand[2] != 0: + print('image was rotated') + + # When image is used multiple times take the highest DPI it is + # rendered at + image['dpi_w'] = Decimal(max(dpi_w, image.get('dpi_w', 0))) + image['dpi_h'] = Decimal(max(dpi_h, image.get('dpi_h', 0))) + image['dpi'] = (image['dpi_w'] * image['dpi_h']) ** Decimal(0.5) yield image @@ -141,10 +232,19 @@ def _pdf_get_pageinfo(infile, pageno: int): pageinfo['width_inches'] = width_pt / Decimal(72.0) pageinfo['height_inches'] = height_pt / Decimal(72.0) - pageinfo['images'] = [im for im in _find_page_images(page, pageinfo)] + try: + contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf) + except AttributeError as e: + return pageinfo + + contentsinfo = _interpret_contents(contentstream) + + + pageinfo['images'] = [im for im in _find_page_images( + page, pageinfo, contentsinfo)] # Look for inline images - if _page_has_inline_images(page): + if contentsinfo.has_inline_images: raise NotImplementedError( "Warning: input PDF contains inline images - not supported") diff --git a/tests/test_pageinfo.py b/tests/test_pageinfo.py index 8968c322..7615a268 100644 --- a/tests/test_pageinfo.py +++ b/tests/test_pageinfo.py @@ -103,8 +103,8 @@ def test_single_page_image(): assert pdfimage['bpc'] == 8 # DPI in a 1"x1" is the image width - assert pdfimage['dpi_w'] == 8 - assert pdfimage['dpi_h'] == 8 + assert abs(pdfimage['dpi_w'] - 8) < 1e-5 + assert abs(pdfimage['dpi_h'] - 8) < 1e-5 def test_single_page_inline_image(): @@ -131,4 +131,5 @@ def test_jpeg(): pdfimage = pdfinfo[0]['images'][0] assert pdfimage['enc'] == 'jpeg' + assert (pdfimage['dpi_w'] - 150) < 1e-5