Files
OCRmyPDF/src/ocrmypdf/pdfinfo.py
2018-07-04 00:59:32 -07:00

767 lines
24 KiB
Python

#!/usr/bin/env python3
# © 2015 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
from decimal import Decimal
from math import hypot, isclose
import re
from collections import namedtuple
from pathlib import Path
from enum import Enum
from .exec import ghostscript
from .helpers import fspath
from pikepdf import PdfMatrix
import pikepdf
# Colour spaces an image can be reported as. Members are numbered from 1 in
# the order listed.
Colorspace = Enum(
    'Colorspace',
    ['gray', 'rgb', 'cmyk', 'lab', 'icc', 'index', 'sep', 'devn', 'pattern',
     'jpeg2000'])

# Stream encodings (PDF filters) an image can be reported as. Members are
# numbered from 1 in the order listed.
Encoding = Enum(
    'Encoding',
    ['ccitt', 'jpeg', 'jpeg2000', 'jbig2', 'asciihex', 'ascii85', 'lzw',
     'flate', 'runlength'])
# Bytes pattern matching one self-closing <char .../> element in the XML text
# output produced by Ghostscript, including the case where an attribute value
# is the literal ">". It is used to strip those elements before the XML is
# handed to a real parser.
# Parsing XML with a regular expression is normally a bad idea; it is tolerable
# here only because the XML generated by Ghostscript is self-consistent enough
# to be matched this way.
regex_remove_char_tags = re.compile(br"""
<char\b
(?: [^>] # anything single character but >
| \">\" # special case: trap ">"
)*
/> # terminate with '/>'
""", re.VERBOSE)
# PDF colour space name -> Colorspace member used to report it.
FRIENDLY_COLORSPACE = {
    # Full names
    '/DeviceGray': Colorspace.gray,
    '/CalGray': Colorspace.gray,
    '/DeviceRGB': Colorspace.rgb,
    '/CalRGB': Colorspace.rgb,
    '/DeviceCMYK': Colorspace.cmyk,
    '/Lab': Colorspace.lab,
    '/ICCBased': Colorspace.icc,
    '/Indexed': Colorspace.index,
    '/Separation': Colorspace.sep,
    '/DeviceN': Colorspace.devn,
    '/Pattern': Colorspace.pattern,
    # Abbreviations permitted in inline images
    '/G': Colorspace.gray,
    '/RGB': Colorspace.rgb,
    '/CMYK': Colorspace.cmyk,
    '/I': Colorspace.index,
}

# PDF filter name -> Encoding member used to report it.
FRIENDLY_ENCODING = {
    # Full names
    '/CCITTFaxDecode': Encoding.ccitt,
    '/DCTDecode': Encoding.jpeg,
    '/JPXDecode': Encoding.jpeg2000,
    '/JBIG2Decode': Encoding.jbig2,
    # Abbreviations permitted in inline images
    '/CCF': Encoding.ccitt,
    '/DCT': Encoding.jpeg,
    '/AHx': Encoding.asciihex,
    '/A85': Encoding.ascii85,
    '/LZW': Encoding.lzw,
    '/Fl': Encoding.flate,
    '/RL': Encoding.runlength,
}

# Component count for the colour spaces where it is known up front.
FRIENDLY_COMP = {
    Colorspace.gray: 1,
    Colorspace.rgb: 3,
    Colorspace.cmyk: 4,
    Colorspace.lab: 3,
    Colorspace.index: 1,
}

# Matrix shorthand (a, b, c, d, e, f) of the identity transformation.
UNIT_SQUARE = (1.0, 0.0, 0.0, 1.0, 0.0, 0.0)
def _matrix_from_shorthand(shorthand):
"""Convert from PDF matrix shorthand to full matrix
PDF 1.7 spec defines a shorthand for describing the entries of a matrix
since the last column is always (0, 0, 1).
"""
a, b, c, d, e, f = map(float, shorthand)
return ((a, b, 0),
(c, d, 0),
(e, f, 1))
def _shorthand_from_matrix(matrix):
"""Convert from transformation matrix to PDF shorthand."""
a, b = matrix[0][0], matrix[0][1]
c, d = matrix[1][0], matrix[1][1]
e, f = matrix[2][0], matrix[2][1]
return tuple(map(float, (a, b, c, d, e, f)))
def _is_unit_square(shorthand):
    """Return True if *shorthand* equals UNIT_SQUARE within tolerance.

    Each entry of the matrix shorthand is converted to float and compared
    with the corresponding entry of UNIT_SQUARE.
    """
    values = map(float, shorthand)
    # A purely relative tolerance can never accept a nonzero value as close
    # to 0.0, so the four zero entries would have to match exactly. The
    # absolute tolerance lets tiny rounding residue (e.g. 1e-12) pass.
    return all(
        isclose(value, expected, rel_tol=1e-3, abs_tol=1e-3)
        for value, expected in zip(values, UNIT_SQUARE))
# One "Do" operation: the XObject name, the CTM shorthand in effect, and the
# graphics stack depth at that point.
XobjectSettings = namedtuple(
    'XobjectSettings', 'name shorthand stack_depth')

# One inline image: the image object, the CTM shorthand in effect, and the
# graphics stack depth at that point.
InlineSettings = namedtuple(
    'InlineSettings', 'iimage shorthand stack_depth')

# Everything collected from interpreting one content stream.
ContentsInfo = namedtuple(
    'ContentsInfo', 'xobject_settings inline_images found_text')
def _normalize_stack(operations):
"""Convert runs of qQ's in the stack into single operations"""
for operands, command in operations:
command = str(command)
if re.match(r'Q*q+$', command): # Zero or more Q, one or more q
for char in command: # Split into individual
yield ([], char) # Yield individual
else:
yield (operands, command)
def _interpret_contents(contentstream, initial_shorthand=UNIT_SQUARE):
    """Walk a content stream and record image draws and text usage.

    Of the PDF graphics state, only the current transformation matrix (CTM)
    is tracked, together with the stack that 'q' saves it on and 'Q' restores
    it from. A full interpreter would track much more.

    The CTM starts as *initial_shorthand*. PDF units are 1/72", and a viewer
    or printer would start from the transformation to device space. For
    example, (1/72, 0, 0, 1/72, 0, 0) would make all units inches. PDF units
    are what we want, so the default is the identity matrix.

    An image always occupies (0, 0) -> (1, 1) in its own coordinates, so a
    'cm' is expected before it is drawn to map that square onto the page.

    Returns a ContentsInfo holding one XobjectSettings per 'Do', one
    InlineSettings per inline image, and whether a text-showing operator
    was seen.

    Raises RuntimeError if more than 32 states are saved at once, or if 'Q'
    is encountered with nothing saved.
    """
    text_showing = {'Tj', 'TJ', '"', "'"}
    wanted_operators = """q Q Do cm TJ Tj " ' BI ID EI"""

    saved = []
    ctm = PdfMatrix(initial_shorthand)
    drawn_xobjects = []
    drawn_inlines = []
    saw_text = False

    parsed = pikepdf.parse_content_stream(contentstream, wanted_operators)
    for n, (operands, command) in enumerate(_normalize_stack(parsed)):
        if command == 'q':
            saved.append(ctm)
            if len(saved) > 32:
                raise RuntimeError(
                    "PDF graphics stack overflow, command %i" % n)
        elif command == 'Q':
            try:
                ctm = saved.pop()
            except IndexError:
                raise RuntimeError(
                    "PDF graphics stack underflow, command %i" % n)
        elif command == 'cm':
            # Concatenate the new matrix onto the current one
            ctm = PdfMatrix(operands) @ ctm
        elif command == 'Do':
            drawn_xobjects.append(XobjectSettings(
                name=operands[0],
                shorthand=ctm.shorthand,
                stack_depth=len(saved)))
        elif command == 'INLINE IMAGE':
            drawn_inlines.append(InlineSettings(
                iimage=operands[0],
                shorthand=ctm.shorthand,
                stack_depth=len(saved)))
        elif command in text_showing:
            saw_text = True

    return ContentsInfo(
        xobject_settings=drawn_xobjects,
        inline_images=drawn_inlines,
        found_text=saw_text)
def _get_dpi(ctm_shorthand, image_size):
"""Given the transformation matrix and image size, find the image DPI.
PDFs do not include image resolution information within image data.
Instead, the PDF page content stream describes the location where the
image will be rasterized, and the effective resolution is the ratio of the
pixel size to raster target size.
Normally a scanned PDF has the paper size set appropriately but this is
not guaranteed. The most common case is a cropped image will change the
page size (/CropBox) without altering the page content stream. That means
it is not sufficient to assume that the image fills the page, even though
that is the most common case.
A PDF image may be scaled (always), cropped, translated, rotated in place
to an arbitrary angle (rarely) and skewed. Only equal area mappings can
be expressed, that is, it is not necessary to consider distortions where
the effective DPI varies with position.
To determine the image scale, transform an offset axis vector v0 (0, 0),
width-axis vector v0 (1, 0), height-axis vector vh (0, 1) with the matrix,
which gives the dimensions of the image in PDF units. From there we can
compare to actual image dimensions. PDF uses
row vector * matrix_tranposed unlike the traditional
matrix * column vector.
The offset, width and height vectors can be combined in a matrix and
multiplied by the transform matrix. Then we want to calculated
magnitude(width_vector - offset_vector)
and
magnitude(height_vector - offset_vector)
When the above is worked out algebraically, the effect of translation
cancels out, and the vector magnitudes become functions of the nonzero
transformation matrix indices. The results of the derivation are used
in this code.
pdfimages -list does calculate the DPI in some way that is not completely
naive, but it does not get the DPI of rotated images right, so cannot be
used anymore to validate this. Photoshop works, or using Acrobat to
rotate the image back to normal.
It does not matter if the image is partially cropped, or even out of the
/MediaBox.
"""
a, b, c, d, _, _ = ctm_shorthand
# Calculate the width and height of the image in PDF units
image_drawn_width = hypot(a, b)
image_drawn_height = hypot(c, d)
# The scale of the image is pixels per unit of default user space (1/72")
scale_w = image_size[0] / image_drawn_width
scale_h = image_size[1] / image_drawn_height
# DPI = scale * 72
dpi_w = scale_w * 72.0
dpi_h = scale_h * 72.0
return dpi_w, dpi_h
class ImageInfo:
    """Properties of one instance of an image being drawn.

    Built either from an inline image found in a content stream or from an
    image XObject, together with the transformation matrix shorthand in
    effect where it is drawn. The resolution is derived from that shorthand.
    """

    DPI_PREC = Decimal('1.000')

    def __init__(self, *, name='', pdfimage=None, inline=None,
                 shorthand=None):
        """Collect the image's properties.

        Args:
            name: label for the image; stored as str.
            pdfimage: image XObject, wrapped in pikepdf.PdfImage. Used only
                when *inline* is None.
            inline: an InlineSettings-like object whose ``iimage`` attribute
                is the inline image. Takes precedence over *pdfimage*.
            shorthand: matrix shorthand (a, b, c, d, e, f) in effect when
                the image is drawn; used by ``xres`` and ``yres``.

        Raises:
            ValueError: if neither *inline* nor *pdfimage* is given.
        """
        self._name = str(name)
        self._shorthand = shorthand

        if inline is not None:
            self._origin = 'inline'
            pim = inline.iimage
        elif pdfimage is not None:
            self._origin = 'xobject'
            pim = pikepdf.PdfImage(pdfimage)
        else:
            # Without this, the code below failed on an unbound local.
            raise ValueError("either inline or pdfimage must be provided")

        self._width = pim.width
        self._height = pim.height

        # If /ImageMask is true, then this image is a stencil mask
        # (Images that draw with this stencil mask will have a reference to
        # it in their /Mask, but we don't actually need that information)
        if pim.image_mask:
            self._type = 'stencil'
        else:
            self._type = 'image'

        self._bpc = int(pim.bits_per_component)

        # Only the first filter is considered. A filter missing from the
        # table is reported as 'image'; no filters at all as '?'.
        try:
            self._enc = FRIENDLY_ENCODING.get(pim.filters[0], 'image')
        except IndexError:
            self._enc = '?'

        try:
            self._color = FRIENDLY_COLORSPACE.get(pim.colorspace, '?')
        except NotImplementedError:
            self._color = '?'
        if self._enc == Encoding.jpeg2000:
            self._color = Colorspace.jpeg2000

        self._comp = FRIENDLY_COMP.get(self._color, '?')

        # Bit of a hack... infer grayscale if component count is uncertain
        # but encoding must be monochrome. This happens if a monochrome image
        # has an ICC profile attached. Better solution would be to examine
        # the ICC profile.
        # self._enc holds an Encoding member, so it must be compared against
        # Encoding.jbig2; the string 'jbig2' could never match.
        if self._comp == '?' and self._enc in (Encoding.ccitt,
                                               Encoding.jbig2):
            self._comp = FRIENDLY_COMP[Colorspace.gray]

    @property
    def name(self):
        """Label given to this image, as a str."""
        return self._name

    @property
    def type_(self):
        """'stencil' for an image mask, otherwise 'image'."""
        return self._type

    @property
    def width(self):
        """Image width as reported by the image object."""
        return self._width

    @property
    def height(self):
        """Image height as reported by the image object."""
        return self._height

    @property
    def bpc(self):
        """Bits per component, as an int."""
        return self._bpc

    @property
    def color(self):
        """Colorspace member, or '?' if it could not be determined."""
        return self._color

    @property
    def comp(self):
        """Number of colour components, or '?' if unknown."""
        return self._comp

    @property
    def enc(self):
        """Encoding member, 'image' for an unlisted filter, '?' for none."""
        return self._enc

    @property
    def xres(self):
        """Horizontal resolution computed by _get_dpi from the shorthand."""
        return _get_dpi(self._shorthand, (self._width, self._height))[0]

    @property
    def yres(self):
        """Vertical resolution computed by _get_dpi from the shorthand."""
        return _get_dpi(self._shorthand, (self._width, self._height))[1]

    def __repr__(self):
        # Gather every public attribute (including the properties above) so
        # the format string can refer to them by name.
        class_locals = {attr: getattr(self, attr, None) for attr in dir(self)
                        if not attr.startswith('_')}
        return (
            "<ImageInfo '{name}' {type_} {width}x{height} {color} "
            "{comp} {bpc} {enc} {xres}x{yres}>").format(**class_locals)
def _find_inline_images(contentsinfo):
"Find inline images in the contentstream"
for n, inline in enumerate(contentsinfo.inline_images):
yield ImageInfo(name='inline-%02d' % n, shorthand=inline.shorthand,
inline=inline)
def _image_xobjects(container):
"""Search for all XObject-based images in the container
Usually the container is a page, but it could also be a Form XObject
that contains images. Filter out the Form XObjects which are dealt with
elsewhere.
Generate a sequence of tuples (image, xobj container), where container,
where xobj is the name of the object and image is the object itself,
since the object does not know its own name.
"""
if '/Resources' not in container:
return
resources = container['/Resources']
if '/XObject' not in resources:
return
xobjs = resources['/XObject'].as_dict()
for xobj in xobjs:
candidate = xobjs[xobj]
if candidate['/Subtype'] == '/Image':
pdfimage = candidate
yield (pdfimage, xobj)
def _find_regular_images(container, contentsinfo):
    """Yield an ImageInfo for each drawing of an image XObject.

    The images are those in the container's /Resources /XObject. The
    container is usually a page but may be a Form XObject. Every draw
    recorded in ``contentsinfo.xobject_settings`` that names one of those
    images produces its own ImageInfo carrying the matrix shorthand in
    effect for that draw.
    """
    for pdfimage, xobj in _image_xobjects(container):
        # Pairing every image with every draw is O(n^2), but there is almost
        # always exactly one image.
        for draw in contentsinfo.xobject_settings:
            if draw.name != xobj:
                continue
            # At least one PDF in the wild (and in the test suite) draws an
            # image at graphics stack depth 0, i.e. into a square of 1x1 PDF
            # units (1/72", or 0.35 mm). The equivalent DPI would be
            # >100,000, so such draws are left out of the page's DPI
            # calculation.
            drawn_untransformed = (
                draw.stack_depth == 0 and _is_unit_square(draw.shorthand))
            if not drawn_untransformed:
                yield ImageInfo(
                    name=draw.name,
                    pdfimage=pdfimage,
                    shorthand=draw.shorthand)
def _find_form_xobject_images(pdf, container, contentsinfo):
"""Find any images that are in Form XObjects in the container
The container may be a page, or a parent Form XObject.
"""
if '/Resources' not in container:
return
resources = container['/Resources']
if '/XObject' not in resources:
return
xobjs = resources['/XObject'].as_dict()
for xobj in xobjs:
candidate = xobjs[xobj]
if candidate['/Subtype'] != '/Form':
continue
form_xobject = candidate
for settings in contentsinfo.xobject_settings:
if settings.name != xobj:
continue
# Find images once for each time this Form XObject is drawn.
# This could be optimized to cache the multiple drawing events
# but in practice both Form XObjects and multiple drawing of the
# same object are both very rare.
ctm_shorthand = settings.shorthand
yield from _find_images(
pdf=pdf, container=form_xobject, shorthand=ctm_shorthand)
def _find_images(*, pdf, container, shorthand=None):
    """Find all individual instances of images drawn in the container.

    Usually the container is a page, but it may also be a Form XObject.

    On a typical page images are stored inline or as regular images
    in an XObject.

    Form XObjects may include inline images, XObject images,
    and recursively, other Form XObjects; and also vector drawing commands.

    Every instance of an image being drawn somewhere is flattened and
    treated as a unique image, since if the same image is drawn multiple
    times on one page it may be drawn at differing resolutions, and our
    objective is to find the resolution at which the page can be rastered
    without downsampling.

    Args:
        pdf: the open PDF; passed through to nested Form XObject searches.
        container: a page with /Contents, or a Form XObject. Anything else
            yields nothing.
        shorthand: matrix shorthand in effect for this container, or None
            for the identity.
    """
    if container.get('/Type') == '/Page' and '/Contents' in container:
        initial_shorthand = shorthand or UNIT_SQUARE
    elif (container.get('/Subtype') == '/Form'
            and container.get('/Type', '/XObject') == '/XObject'):
        # /Type is optional on a Form XObject, so its absence must not
        # exclude the form; only a conflicting /Type does. /Subtype is read
        # with .get() so a malformed object yields nothing instead of
        # raising KeyError.

        # Set the CTM to the state it was when the "Do" operator was
        # encountered that is drawing this instance of the Form XObject
        ctm = PdfMatrix(shorthand) if shorthand else PdfMatrix.identity()

        # A Form XObject may provide its own matrix to map form space into
        # user space. Get this if one exists
        form_shorthand = container.get('/Matrix', PdfMatrix.identity())
        form_matrix = PdfMatrix(form_shorthand)

        # Concatenate form matrix with CTM to ensure CTM is correct for
        # drawing this instance of the XObject
        ctm = form_matrix @ ctm
        initial_shorthand = ctm.shorthand
    else:
        return

    contentsinfo = _interpret_contents(container, initial_shorthand)

    yield from _find_inline_images(contentsinfo)
    yield from _find_regular_images(container, contentsinfo)
    yield from _find_form_xobject_images(pdf, container, contentsinfo)
def _page_get_textblocks(infile, pageno):
    """Return bounding boxes of the text Ghostscript finds on a page.

    Args:
        infile: path of the PDF, passed to ghostscript.extract_text.
        pageno: page index; Ghostscript is asked for page ``pageno + 1``.

    Returns:
        A list of 4-tuples of ints, one per block of text. Each comes from
        the ``bbox`` attribute of a <span> element, with its second
        coordinate reduced by the rounded font size. Spans whose second and
        fourth coordinates are equal are merged into one block when the gap
        test below passes.
    """
    import xml.etree.ElementTree as ET

    gstext = ghostscript.extract_text(infile, pageno+1)

    # Remove all <char /> tags, because they might contain invalid XML
    # entities like <char bbox="348 596 348 596" c="&#x1;"/> which chokes on
    # the inclusion of U+0001. Understandably.
    # Just remove the whole <char /> tag since we don't use it at all, and
    # they are only generated as innermost self-closing tags.
    gstext = regex_remove_char_tags.sub(b' ', gstext)

    root = ET.fromstring(gstext)

    def blocks():
        for span in root.findall('.//span'):
            bbox_str = span.attrib['bbox']
            font_size = span.attrib['size']
            pts = [int(pt) for pt in bbox_str.split()]
            pts[1] = pts[1] - int(float(font_size) + 0.5)
            yield tuple(pts)

    def joined_blocks():
        prev = None
        for bbox in blocks():
            if prev is None:
                # First span: there is nothing to join it to yet. Comparing
                # it with itself (as was done before) emitted it twice
                # whenever it was at least as wide as it was tall.
                prev = bbox
                continue
            if bbox[1] == prev[1] and bbox[3] == prev[3]:
                gap = prev[2] - bbox[0]
                height = bbox[3] - bbox[1]
                if gap < height:
                    # Join boxes
                    prev = (prev[0], prev[1], bbox[2], bbox[3])
                    continue
            # yield previously joined bboxes and start anew
            yield prev
            prev = bbox
        if prev is not None:
            yield prev

    return list(joined_blocks())
def _page_has_text(text_blocks, page_width, page_height):
"""Smarter text detection that ignores text in margins"""
pw, ph = float(page_width), float(page_height)
margin_ratio = 0.125
interior_bbox = (
margin_ratio * pw, margin_ratio * ph,
(1 - margin_ratio) * pw, (1 - margin_ratio) * ph
)
def rects_intersect(a, b):
"""
Where (a,b) are 4-tuple rects (left-0, top-1, right-2, bottom-3)
https://stackoverflow.com/questions/306316/determine-if-two-rectangles-overlap-each-other
Negative signs to account for our coordinates being in the fourth quadrant
and the formula assuming the first
"""
return a[0] < b[2] and a[2] > b[0] and -a[1] > -b[3] and -a[3] < -b[1]
has_text = False
for bbox in text_blocks:
if rects_intersect(bbox, interior_bbox):
has_text = True
break
return has_text
def _pdf_get_pageinfo(pdf, pageno: int, infile):
    """Collect the facts about one page into a dict.

    Keys always present: 'pageno', 'images', 'textinfo', 'has_text',
    'userunit', 'width_inches', 'height_inches' and 'rotate'. When the page
    draws at least one image, 'xres', 'yres', 'width_pixels' and
    'height_pixels' are added, using the highest resolution found among
    the images on each axis.
    """
    pageinfo = {'pageno': pageno, 'images': []}

    page = pdf.pages[pageno]

    text_blocks = _page_get_textblocks(fspath(infile), pageno)
    pageinfo['textinfo'] = text_blocks

    mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
    width_pt = mediabox[2] - mediabox[0]
    height_pt = mediabox[3] - mediabox[1]

    pageinfo['has_text'] = _page_has_text(text_blocks, width_pt, height_pt)

    # /UserUnit scales the page's units; absent means 1
    userunit = page.get('/UserUnit', Decimal(1.0))
    if not isinstance(userunit, Decimal):
        userunit = Decimal(userunit)
    pageinfo['userunit'] = userunit

    points_per_inch = Decimal(72.0)
    pageinfo['width_inches'] = width_pt * userunit / points_per_inch
    pageinfo['height_inches'] = height_pt * userunit / points_per_inch

    try:
        rotate = int(page['/Rotate'])
    except KeyError:
        rotate = 0
    pageinfo['rotate'] = rotate

    # Start the image search with the user unit scaling already applied
    userunit_shorthand = (userunit, 0, 0, userunit, 0, 0)
    images = list(_find_images(
        pdf=pdf, container=page, shorthand=userunit_shorthand))
    pageinfo['images'] = images

    if images:
        xres = Decimal(max(image.xres for image in images))
        yres = Decimal(max(image.yres for image in images))
        pageinfo['xres'], pageinfo['yres'] = xres, yres
        pageinfo['width_pixels'] = int(
            round(xres * pageinfo['width_inches']))
        pageinfo['height_pixels'] = int(
            round(yres * pageinfo['height_inches']))

    return pageinfo
def _pdf_get_all_pageinfo(infile):
    """Open *infile* with pikepdf and return a PageInfo for each page, in order."""
    # NOTE(review): the Pdf object opened here is never explicitly closed.
    pdf = pikepdf.open(infile)
    pages = []
    for pageno in range(len(pdf.pages)):
        pages.append(PageInfo(pdf, pageno, infile))
    return pages
class PageInfo:
    """Read-mostly view of the facts gathered about one page.

    The facts are collected once, at construction, by _pdf_get_pageinfo and
    kept in a dict; the properties below expose them. Only ``rotation`` can
    be changed afterwards.
    """

    def __init__(self, pdf, pageno, infile):
        self._pageno = pageno
        self._infile = infile
        self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile)

    @property
    def pageno(self):
        """Page number this object was created with."""
        return self._pageno

    @property
    def has_text(self):
        """Whether text was found on the page."""
        return self._pageinfo['has_text']

    @property
    def width_inches(self):
        """Page width in inches."""
        return self._pageinfo['width_inches']

    @property
    def height_inches(self):
        """Page height in inches."""
        return self._pageinfo['height_inches']

    @property
    def width_pixels(self):
        """Page width in pixels at ``xres``, or None if ``xres`` is unknown.

        The resolution is only known when the page contains images; without
        this check the multiplication by None raised TypeError.
        """
        xres = self.xres
        if xres is None:
            return None
        return int(round(self.width_inches * xres))

    @property
    def height_pixels(self):
        """Page height in pixels at ``yres``, or None if ``yres`` is unknown.

        The resolution is only known when the page contains images; without
        this check the multiplication by None raised TypeError.
        """
        yres = self.yres
        if yres is None:
            return None
        return int(round(self.height_inches * yres))

    @property
    def rotation(self):
        """Page rotation in degrees, or None if not recorded."""
        return self._pageinfo.get('rotate', None)

    @rotation.setter
    def rotation(self, value):
        """Set the rotation; raises ValueError unless a multiple of 90 in
        the accepted set."""
        if value in (0, 90, 180, 270, 360, -90, -180, -270):
            self._pageinfo['rotate'] = value
        else:
            raise ValueError("rotation must be a cardinal angle")

    @property
    def images(self):
        """List of the images found on the page."""
        return self._pageinfo['images']

    def get_textareas(self):
        """Yield the text bounding boxes recorded for the page."""
        yield from self._pageinfo['textinfo']

    @property
    def xres(self):
        """Horizontal resolution of the page, or None if not recorded."""
        return self._pageinfo.get('xres', None)

    @property
    def yres(self):
        """Vertical resolution of the page, or None if not recorded."""
        return self._pageinfo.get('yres', None)

    @property
    def userunit(self):
        """The page's user unit, or None if not recorded."""
        return self._pageinfo.get('userunit', None)

    @property
    def min_version(self):
        """Minimum PDF version string this page needs: '1.6' or '1.5'."""
        # NOTE(review): _pdf_get_pageinfo always stores a 'userunit'
        # (defaulting to 1), so for pages built through __init__ this
        # returns '1.6'. Confirm whether a comparison against 1 was
        # intended.
        if self.userunit is not None:
            return '1.6'
        else:
            return '1.5'

    def __repr__(self):
        return (
            '<PageInfo '
            'pageno={} {}"x{}" rotation={} res={}x{} has_text={}>').format(
            self.pageno, self.width_inches, self.height_inches,
            self.rotation,
            self.xres, self.yres, self.has_text
        )
class PdfInfo:
    """Summary information about a PDF.

    Holds one PageInfo per page, built at construction. Supports ``len()``
    and indexing over those pages.
    """

    def __init__(self, infile):
        self._infile = infile
        self._pages = _pdf_get_all_pageinfo(infile)

    @property
    def pages(self):
        """List of PageInfo objects, one per page."""
        return self._pages

    @property
    def min_version(self):
        """Minimum PDF version for the file: the highest any page needs."""
        page_versions = [page.min_version for page in self.pages]
        return max(page_versions)

    @property
    def has_userunit(self):
        """True if any page has a user unit other than 1."""
        for page in self.pages:
            if page.userunit != 1.0:
                return True
        return False

    @property
    def filename(self):
        """The input as given, provided it is a str or Path.

        Raises NotImplementedError for any other kind of input.
        """
        if isinstance(self._infile, (str, Path)):
            return self._infile
        raise NotImplementedError("can't get filename from stream")

    def __getitem__(self, item):
        return self._pages[item]

    def __len__(self):
        return len(self._pages)

    def __repr__(self):
        return "<PdfInfo('...'), page count={}>".format(len(self))

    # Pickling hooks, currently disabled:
    #
    # def __getstate__(self):
    #     state = {'_infile': self._infile}
    #     return state
    #
    # def __setstate__(self, state):
    #     self._infile = state['_infile']
    #     self._pages = _pdf_get_all_pageinfo(self._infile)
def main():
    """Command line entry point: pretty-print the page info of one PDF."""
    import argparse
    from pprint import pprint

    parser = argparse.ArgumentParser()
    parser.add_argument('infile')
    options = parser.parse_args()

    pprint(_pdf_get_all_pageinfo(options.infile))


if __name__ == '__main__':
    main()