Access PageInfo instance variables instead of dictionary

This commit is contained in:
James R. Barlow
2017-05-18 17:12:04 -07:00
parent 6c12e7e944
commit caee5b1428
4 changed files with 60 additions and 53 deletions

View File

@@ -551,6 +551,14 @@ class PageInfo(MutableMapping):
def height_inches(self):
return self._pageinfo['height_inches']
@property
def width_pixels(self):
return int(round(self.width_inches * self.xres))
@property
def height_pixels(self):
return int(round(self.height_inches * self.yres))
@property
def rotation(self):
return self._pageinfo['rotate']
@@ -594,7 +602,6 @@ class PdfInfo:
return self._pages[index]
def __getitem__(self, item):
warnings.warn("pageinfo[item] is deprecated", DeprecationWarning)
return self._pages[item]
def __len__(self):

View File

@@ -210,9 +210,9 @@ def get_page_square_dpi(pageinfo, options):
def is_ocr_required(pageinfo, log, options):
page = pageinfo['pageno'] + 1
page = pageinfo.pageno + 1
ocr_required = True
if not pageinfo['images']:
if not pageinfo.images:
if options.force_ocr and options.oversample:
# The user really wants to reprocess this file
log.info(
@@ -234,7 +234,7 @@ def is_ocr_required(pageinfo, log, options):
"skipping all processing on this page".format(page))
ocr_required = False
elif pageinfo['has_text']:
elif pageinfo.has_text:
msg = "{0:4d}: page already has text! {1}"
if not options.force_ocr and not options.skip_text:
@@ -250,8 +250,8 @@ def is_ocr_required(pageinfo, log, options):
"skipping all processing on this page"))
ocr_required = False
if ocr_required and options.skip_big and pageinfo['images']:
pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
if ocr_required and options.skip_big and pageinfo.images:
pixel_count = pageinfo.width_pixels * pageinfo.height_pixels
if pixel_count > (options.skip_big * 1000000):
ocr_required = False
log.warning(
@@ -396,15 +396,15 @@ def rasterize_with_ghostscript(
pageinfo = get_pageinfo(input_file, context)
device = 'png16m' # 24-bit
if pageinfo['images']:
if all(image['comp'] == 1 for image in pageinfo['images']):
if all(image['bpc'] == 1 for image in pageinfo['images']):
if pageinfo.images:
if all(image['comp'] == 1 for image in pageinfo.images):
if all(image['bpc'] == 1 for image in pageinfo.images):
device = 'pngmono'
elif all(image['bpc'] > 1 and image['color'] == 'index'
for image in pageinfo['images']):
for image in pageinfo.images):
device = 'png256'
elif all(image['bpc'] > 1 and image['color'] == 'gray'
for image in pageinfo['images']):
for image in pageinfo.images):
device = 'pnggray'
log.debug("Rasterize {0} with {1}".format(
@@ -430,11 +430,11 @@ def preprocess_remove_background(
pageinfo = get_pageinfo(input_file, context)
if any(image['bpc'] > 1 for image in pageinfo['images']):
if any(image['bpc'] > 1 for image in pageinfo.images):
leptonica.remove_background(input_file, output_file)
else:
log.info("{0:4d}: background removal skipped on mono page".format(
pageinfo['pageno']))
pageinfo.pageno))
re_symlink(input_file, output_file, log)
@@ -521,8 +521,8 @@ def select_visible_page_image(
image = next(ii for ii in infiles if ii.endswith(image_suffix))
pageinfo = get_pageinfo(image, context)
if pageinfo['images'] and \
all(im['enc'] == 'jpeg' for im in pageinfo['images']):
if pageinfo.images and \
all(im['enc'] == 'jpeg' for im in pageinfo.images):
log.debug('{:4d}: JPEG input -> JPEG output'.format(
page_number(image)))
# If all images were JPEGs originally, produce a JPEG as output

View File

@@ -232,8 +232,8 @@ def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):
pdfinfo = PdfInfo(oversampled_pdf)
print(pdfinfo[0]['xres'])
assert abs(pdfinfo[0]['xres'] - 350) < 1
print(pdfinfo[0].xres)
assert abs(pdfinfo[0].xres - 350) < 1
def test_repeat_ocr(resources, no_outpdf):
@@ -245,7 +245,7 @@ def test_force_ocr(spoof_tesseract_cache, resources, outpdf):
out = check_ocrmypdf(resources / 'graph_ocred.pdf', outpdf, '-f',
env=spoof_tesseract_cache)
pdfinfo = PdfInfo(out)
assert pdfinfo[0]['has_text']
assert pdfinfo[0].has_text
def test_skip_ocr(spoof_tesseract_cache, resources, outpdf):
@@ -359,14 +359,14 @@ def test_ocr_timeout(renderer, resources, outpdf):
out = check_ocrmypdf(resources / 'skew.pdf', outpdf,
'--tesseract-timeout', '1.0')
pdfinfo = PdfInfo(out)
assert not pdfinfo[0]['has_text']
assert not pdfinfo[0].has_text
def test_skip_big(spoof_tesseract_cache, resources, outpdf):
out = check_ocrmypdf(resources / 'enormous.pdf', outpdf,
'--skip-big', '10', env=spoof_tesseract_cache)
pdfinfo = PdfInfo(out)
assert not pdfinfo[0]['has_text']
assert not pdfinfo[0].has_text
@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
@@ -563,7 +563,7 @@ def test_non_square_resolution(renderer, spoof_tesseract_cache,
resources, outpdf):
# Confirm input image is non-square resolution
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']
assert in_pageinfo[0].xres != in_pageinfo[0].yres
check_ocrmypdf(
resources / 'aspect.pdf', outpdf,
@@ -572,8 +572,8 @@ def test_non_square_resolution(renderer, spoof_tesseract_cache,
out_pageinfo = PdfInfo(outpdf)
# Confirm resolution was kept the same
assert in_pageinfo[0]['xres'] == out_pageinfo[0]['xres']
assert in_pageinfo[0]['yres'] == out_pageinfo[0]['yres']
assert in_pageinfo[0].xres == out_pageinfo[0].xres
assert in_pageinfo[0].yres == out_pageinfo[0].yres
@pytest.mark.parametrize('renderer', [
@@ -586,7 +586,7 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
# Confirm input image is non-square resolution
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']
assert in_pageinfo[0].xres != in_pageinfo[0].yres
# --force-ocr means forced conversion to square resolution
check_ocrmypdf(
@@ -599,20 +599,20 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
in_p0, out_p0 = in_pageinfo[0], out_pageinfo[0]
# Resolution should now be equal
assert out_p0['xres'] == out_p0['yres']
assert out_p0.xres == out_p0.yres
# Page size should match input page size
assert isclose(in_p0['width_inches'],
out_p0['width_inches'])
assert isclose(in_p0['height_inches'],
out_p0['height_inches'])
assert isclose(in_p0.width_inches,
out_p0.width_inches)
assert isclose(in_p0.height_inches,
out_p0.height_inches)
# Because we rasterized the page to produce a new image, it should occupy
# the entire page
out_im_w = out_p0['images'][0]['width'] / out_p0['images'][0]['dpi_w']
out_im_h = out_p0['images'][0]['height'] / out_p0['images'][0]['dpi_h']
assert isclose(out_p0['width_inches'], out_im_w)
assert isclose(out_p0['height_inches'], out_im_h)
out_im_w = out_p0.images[0]['width'] / out_p0.images[0]['dpi_w']
out_im_h = out_p0.images[0]['height'] / out_p0.images[0]['dpi_h']
assert isclose(out_p0.width_inches, out_im_w)
assert isclose(out_p0.height_inches, out_im_h)
def test_image_to_pdf(spoof_tesseract_noop, resources, outpdf):
@@ -629,7 +629,7 @@ def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf):
env=spoof_tesseract_cache)
out_pageinfo = PdfInfo(out)
assert out_pageinfo[0]['images'][0]['enc'] == 'jbig2'
assert out_pageinfo[0].images[0]['enc'] == 'jbig2'
def test_stdin(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
@@ -704,9 +704,9 @@ def test_rotated_skew_timeout(resources, outpdf):
input_file = str(resources / 'rotated_skew.pdf')
in_pageinfo = PdfInfo(input_file)[0]
assert in_pageinfo['height_pixels'] < in_pageinfo['width_pixels'], \
assert in_pageinfo.height_pixels < in_pageinfo.width_pixels, \
"Expected the input page to be landscape"
assert in_pageinfo['rotate'] == 90, "Expected a rotated page"
assert in_pageinfo.rotation == 90, "Expected a rotated page"
out = check_ocrmypdf(
input_file, outpdf,
@@ -715,14 +715,14 @@ def test_rotated_skew_timeout(resources, outpdf):
out_pageinfo = PdfInfo(out)[0]
assert out_pageinfo['height_pixels'] > out_pageinfo['width_pixels'], \
assert out_pageinfo.height_pixels > out_pageinfo.width_pixels, \
"Expected the output page to be portrait"
assert out_pageinfo['rotate'] == 0, \
assert out_pageinfo.rotation == 0, \
"Expected no page rotation for output"
assert in_pageinfo['width_pixels'] == out_pageinfo['height_pixels'] and \
in_pageinfo['height_pixels'] == out_pageinfo['width_pixels'], \
assert in_pageinfo.width_pixels == out_pageinfo.height_pixels and \
in_pageinfo.height_pixels == out_pageinfo.width_pixels, \
"Expected page rotation to be baked in"
@@ -745,7 +745,7 @@ def test_very_high_dpi(spoof_tesseract_cache, resources, outpdf):
env=spoof_tesseract_cache)
pdfinfo = PdfInfo(outpdf)
image = pdfinfo[0]['images'][0]
image = pdfinfo[0].images[0]
assert image['dpi_w'] == image['dpi_h']
assert image['dpi_w'] == 2400
@@ -902,7 +902,7 @@ def test_compression_preserved(spoof_tesseract_noop, ocrmypdf_exec,
pdfinfo = PdfInfo(output_file)
pdfimage = pdfinfo[0]['images'][0]
pdfimage = pdfinfo[0].images[0]
if input_file.endswith('.png'):
assert pdfimage['enc'] != 'jpeg', \
@@ -947,7 +947,7 @@ def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec,
pdfinfo = PdfInfo(output_file)
pdfimage = pdfinfo[0]['images'][0]
pdfimage = pdfinfo[0].images[0]
if compression == 'jpeg':
assert pdfimage['enc'] == 'jpeg'

View File

@@ -31,8 +31,8 @@ def test_single_page_text(outdir):
assert len(pdfinfo) == 1
page = pdfinfo[0]
assert page['has_text']
assert len(page['images']) == 0
assert page.has_text
assert len(page.images) == 0
def test_single_page_image(outdir):
@@ -58,10 +58,10 @@ def test_single_page_image(outdir):
assert len(pdfinfo) == 1
page = pdfinfo[0]
assert not page['has_text']
assert len(page['images']) == 1
assert not page.has_text
assert len(page.images) == 1
pdfimage = page['images'][0]
pdfimage = page.images[0]
assert pdfimage['width'] == 8
assert pdfimage['color'] == 'gray'
@@ -85,7 +85,7 @@ def test_single_page_inline_image(outdir):
pdfinfo = pageinfo.PdfInfo(filename)
print(pdfinfo)
pdfimage = pdfinfo[0]['images'][0]
pdfimage = pdfinfo[0].images[0]
assert (pdfimage['dpi_w'] - 8) < 1e-5
assert pdfimage['color'] != '-'
assert pdfimage['width'] == 8
@@ -96,7 +96,7 @@ def test_jpeg(resources, outdir):
pdfinfo = pageinfo.PdfInfo(filename)
pdfimage = pdfinfo[0]['images'][0]
pdfimage = pdfinfo[0].images[0]
assert pdfimage['enc'] == 'jpeg'
assert (pdfimage['dpi_w'] - 150) < 1e-5
@@ -105,7 +105,7 @@ def test_form_xobject(resources):
filename = resources / 'formxobject.pdf'
pdfinfo = pageinfo.PdfInfo(filename)
pdfimage = pdfinfo[0]['images'][0]
pdfimage = pdfinfo[0].images[0]
assert pdfimage['width'] == 50
@@ -113,5 +113,5 @@ def test_no_contents(resources):
filename = resources / 'no_contents.pdf'
pdfinfo = pageinfo.PdfInfo(filename)
assert len(pdfinfo[0]['images']) == 0
assert pdfinfo[0]['has_text'] == False
assert len(pdfinfo[0].images) == 0
assert pdfinfo[0].has_text == False