mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-07 14:14:41 -04:00
Access PageInfo instance variables instead of dictionary
This commit is contained in:
@@ -551,6 +551,14 @@ class PageInfo(MutableMapping):
|
||||
def height_inches(self):
|
||||
return self._pageinfo['height_inches']
|
||||
|
||||
@property
def width_pixels(self):
    """Return the page width in pixels as an int.

    Computed as ``width_inches * xres`` and passed through ``round()``
    before conversion to ``int``.
    """
    exact_width = self.width_inches * self.xres
    return int(round(exact_width))
|
||||
|
||||
@property
def height_pixels(self):
    """Return the page height in pixels as an int.

    Computed as ``height_inches * yres`` and passed through ``round()``
    before conversion to ``int``.
    """
    exact_height = self.height_inches * self.yres
    return int(round(exact_height))
|
||||
|
||||
@property
def rotation(self):
    """Return the value stored under the ``'rotate'`` key of the page info."""
    page_rotation = self._pageinfo['rotate']
    return page_rotation
|
||||
@@ -594,7 +602,6 @@ class PdfInfo:
|
||||
return self._pages[index]
|
||||
|
||||
def __getitem__(self, item):
|
||||
warnings.warn("pageinfo[item] is deprecated", DeprecationWarning)
|
||||
return self._pages[item]
|
||||
|
||||
def __len__(self):
|
||||
|
||||
@@ -210,9 +210,9 @@ def get_page_square_dpi(pageinfo, options):
|
||||
|
||||
|
||||
def is_ocr_required(pageinfo, log, options):
|
||||
page = pageinfo['pageno'] + 1
|
||||
page = pageinfo.pageno + 1
|
||||
ocr_required = True
|
||||
if not pageinfo['images']:
|
||||
if not pageinfo.images:
|
||||
if options.force_ocr and options.oversample:
|
||||
# The user really wants to reprocess this file
|
||||
log.info(
|
||||
@@ -234,7 +234,7 @@ def is_ocr_required(pageinfo, log, options):
|
||||
"skipping all processing on this page".format(page))
|
||||
ocr_required = False
|
||||
|
||||
elif pageinfo['has_text']:
|
||||
elif pageinfo.has_text:
|
||||
msg = "{0:4d}: page already has text! – {1}"
|
||||
|
||||
if not options.force_ocr and not options.skip_text:
|
||||
@@ -250,8 +250,8 @@ def is_ocr_required(pageinfo, log, options):
|
||||
"skipping all processing on this page"))
|
||||
ocr_required = False
|
||||
|
||||
if ocr_required and options.skip_big and pageinfo['images']:
|
||||
pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
|
||||
if ocr_required and options.skip_big and pageinfo.images:
|
||||
pixel_count = pageinfo.width_pixels * pageinfo.height_pixels
|
||||
if pixel_count > (options.skip_big * 1000000):
|
||||
ocr_required = False
|
||||
log.warning(
|
||||
@@ -396,15 +396,15 @@ def rasterize_with_ghostscript(
|
||||
pageinfo = get_pageinfo(input_file, context)
|
||||
|
||||
device = 'png16m' # 24-bit
|
||||
if pageinfo['images']:
|
||||
if all(image['comp'] == 1 for image in pageinfo['images']):
|
||||
if all(image['bpc'] == 1 for image in pageinfo['images']):
|
||||
if pageinfo.images:
|
||||
if all(image['comp'] == 1 for image in pageinfo.images):
|
||||
if all(image['bpc'] == 1 for image in pageinfo.images):
|
||||
device = 'pngmono'
|
||||
elif all(image['bpc'] > 1 and image['color'] == 'index'
|
||||
for image in pageinfo['images']):
|
||||
for image in pageinfo.images):
|
||||
device = 'png256'
|
||||
elif all(image['bpc'] > 1 and image['color'] == 'gray'
|
||||
for image in pageinfo['images']):
|
||||
for image in pageinfo.images):
|
||||
device = 'pnggray'
|
||||
|
||||
log.debug("Rasterize {0} with {1}".format(
|
||||
@@ -430,11 +430,11 @@ def preprocess_remove_background(
|
||||
|
||||
pageinfo = get_pageinfo(input_file, context)
|
||||
|
||||
if any(image['bpc'] > 1 for image in pageinfo['images']):
|
||||
if any(image['bpc'] > 1 for image in pageinfo.images):
|
||||
leptonica.remove_background(input_file, output_file)
|
||||
else:
|
||||
log.info("{0:4d}: background removal skipped on mono page".format(
|
||||
pageinfo['pageno']))
|
||||
pageinfo.pageno))
|
||||
re_symlink(input_file, output_file, log)
|
||||
|
||||
|
||||
@@ -521,8 +521,8 @@ def select_visible_page_image(
|
||||
image = next(ii for ii in infiles if ii.endswith(image_suffix))
|
||||
|
||||
pageinfo = get_pageinfo(image, context)
|
||||
if pageinfo['images'] and \
|
||||
all(im['enc'] == 'jpeg' for im in pageinfo['images']):
|
||||
if pageinfo.images and \
|
||||
all(im['enc'] == 'jpeg' for im in pageinfo.images):
|
||||
log.debug('{:4d}: JPEG input -> JPEG output'.format(
|
||||
page_number(image)))
|
||||
# If all images were JPEGs originally, produce a JPEG as output
|
||||
|
||||
@@ -232,8 +232,8 @@ def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):
|
||||
|
||||
pdfinfo = PdfInfo(oversampled_pdf)
|
||||
|
||||
print(pdfinfo[0]['xres'])
|
||||
assert abs(pdfinfo[0]['xres'] - 350) < 1
|
||||
print(pdfinfo[0].xres)
|
||||
assert abs(pdfinfo[0].xres - 350) < 1
|
||||
|
||||
|
||||
def test_repeat_ocr(resources, no_outpdf):
|
||||
@@ -245,7 +245,7 @@ def test_force_ocr(spoof_tesseract_cache, resources, outpdf):
|
||||
out = check_ocrmypdf(resources / 'graph_ocred.pdf', outpdf, '-f',
|
||||
env=spoof_tesseract_cache)
|
||||
pdfinfo = PdfInfo(out)
|
||||
assert pdfinfo[0]['has_text']
|
||||
assert pdfinfo[0].has_text
|
||||
|
||||
|
||||
def test_skip_ocr(spoof_tesseract_cache, resources, outpdf):
|
||||
@@ -359,14 +359,14 @@ def test_ocr_timeout(renderer, resources, outpdf):
|
||||
out = check_ocrmypdf(resources / 'skew.pdf', outpdf,
|
||||
'--tesseract-timeout', '1.0')
|
||||
pdfinfo = PdfInfo(out)
|
||||
assert not pdfinfo[0]['has_text']
|
||||
assert not pdfinfo[0].has_text
|
||||
|
||||
|
||||
def test_skip_big(spoof_tesseract_cache, resources, outpdf):
|
||||
out = check_ocrmypdf(resources / 'enormous.pdf', outpdf,
|
||||
'--skip-big', '10', env=spoof_tesseract_cache)
|
||||
pdfinfo = PdfInfo(out)
|
||||
assert not pdfinfo[0]['has_text']
|
||||
assert not pdfinfo[0].has_text
|
||||
|
||||
|
||||
@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
|
||||
@@ -563,7 +563,7 @@ def test_non_square_resolution(renderer, spoof_tesseract_cache,
|
||||
resources, outpdf):
|
||||
# Confirm input image is non-square resolution
|
||||
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
|
||||
assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']
|
||||
assert in_pageinfo[0].xres != in_pageinfo[0].yres
|
||||
|
||||
check_ocrmypdf(
|
||||
resources / 'aspect.pdf', outpdf,
|
||||
@@ -572,8 +572,8 @@ def test_non_square_resolution(renderer, spoof_tesseract_cache,
|
||||
out_pageinfo = PdfInfo(outpdf)
|
||||
|
||||
# Confirm resolution was kept the same
|
||||
assert in_pageinfo[0]['xres'] == out_pageinfo[0]['xres']
|
||||
assert in_pageinfo[0]['yres'] == out_pageinfo[0]['yres']
|
||||
assert in_pageinfo[0].xres == out_pageinfo[0].xres
|
||||
assert in_pageinfo[0].yres == out_pageinfo[0].yres
|
||||
|
||||
|
||||
@pytest.mark.parametrize('renderer', [
|
||||
@@ -586,7 +586,7 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
|
||||
|
||||
# Confirm input image is non-square resolution
|
||||
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
|
||||
assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']
|
||||
assert in_pageinfo[0].xres != in_pageinfo[0].yres
|
||||
|
||||
# --force-ocr forces conversion to square resolution
|
||||
check_ocrmypdf(
|
||||
@@ -599,20 +599,20 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
|
||||
in_p0, out_p0 = in_pageinfo[0], out_pageinfo[0]
|
||||
|
||||
# Resolution should now be equal
|
||||
assert out_p0['xres'] == out_p0['yres']
|
||||
assert out_p0.xres == out_p0.yres
|
||||
|
||||
# Page size should match input page size
|
||||
assert isclose(in_p0['width_inches'],
|
||||
out_p0['width_inches'])
|
||||
assert isclose(in_p0['height_inches'],
|
||||
out_p0['height_inches'])
|
||||
assert isclose(in_p0.width_inches,
|
||||
out_p0.width_inches)
|
||||
assert isclose(in_p0.height_inches,
|
||||
out_p0.height_inches)
|
||||
|
||||
# Because we rasterized the page to produce a new image, it should occupy
|
||||
# the entire page
|
||||
out_im_w = out_p0['images'][0]['width'] / out_p0['images'][0]['dpi_w']
|
||||
out_im_h = out_p0['images'][0]['height'] / out_p0['images'][0]['dpi_h']
|
||||
assert isclose(out_p0['width_inches'], out_im_w)
|
||||
assert isclose(out_p0['height_inches'], out_im_h)
|
||||
out_im_w = out_p0.images[0]['width'] / out_p0.images[0]['dpi_w']
|
||||
out_im_h = out_p0.images[0]['height'] / out_p0.images[0]['dpi_h']
|
||||
assert isclose(out_p0.width_inches, out_im_w)
|
||||
assert isclose(out_p0.height_inches, out_im_h)
|
||||
|
||||
|
||||
def test_image_to_pdf(spoof_tesseract_noop, resources, outpdf):
|
||||
@@ -629,7 +629,7 @@ def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf):
|
||||
env=spoof_tesseract_cache)
|
||||
|
||||
out_pageinfo = PdfInfo(out)
|
||||
assert out_pageinfo[0]['images'][0]['enc'] == 'jbig2'
|
||||
assert out_pageinfo[0].images[0]['enc'] == 'jbig2'
|
||||
|
||||
|
||||
def test_stdin(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
|
||||
@@ -704,9 +704,9 @@ def test_rotated_skew_timeout(resources, outpdf):
|
||||
input_file = str(resources / 'rotated_skew.pdf')
|
||||
in_pageinfo = PdfInfo(input_file)[0]
|
||||
|
||||
assert in_pageinfo['height_pixels'] < in_pageinfo['width_pixels'], \
|
||||
assert in_pageinfo.height_pixels < in_pageinfo.width_pixels, \
|
||||
"Expected the input page to be landscape"
|
||||
assert in_pageinfo['rotate'] == 90, "Expected a rotated page"
|
||||
assert in_pageinfo.rotation == 90, "Expected a rotated page"
|
||||
|
||||
out = check_ocrmypdf(
|
||||
input_file, outpdf,
|
||||
@@ -715,14 +715,14 @@ def test_rotated_skew_timeout(resources, outpdf):
|
||||
|
||||
out_pageinfo = PdfInfo(out)[0]
|
||||
|
||||
assert out_pageinfo['height_pixels'] > out_pageinfo['width_pixels'], \
|
||||
assert out_pageinfo.height_pixels > out_pageinfo.width_pixels, \
|
||||
"Expected the output page to be portrait"
|
||||
|
||||
assert out_pageinfo['rotate'] == 0, \
|
||||
assert out_pageinfo.rotation == 0, \
|
||||
"Expected no page rotation for output"
|
||||
|
||||
assert in_pageinfo['width_pixels'] == out_pageinfo['height_pixels'] and \
|
||||
in_pageinfo['height_pixels'] == out_pageinfo['width_pixels'], \
|
||||
assert in_pageinfo.width_pixels == out_pageinfo.height_pixels and \
|
||||
in_pageinfo.height_pixels == out_pageinfo.width_pixels, \
|
||||
"Expected page rotation to be baked in"
|
||||
|
||||
|
||||
@@ -745,7 +745,7 @@ def test_very_high_dpi(spoof_tesseract_cache, resources, outpdf):
|
||||
env=spoof_tesseract_cache)
|
||||
pdfinfo = PdfInfo(outpdf)
|
||||
|
||||
image = pdfinfo[0]['images'][0]
|
||||
image = pdfinfo[0].images[0]
|
||||
assert image['dpi_w'] == image['dpi_h']
|
||||
assert image['dpi_w'] == 2400
|
||||
|
||||
@@ -902,7 +902,7 @@ def test_compression_preserved(spoof_tesseract_noop, ocrmypdf_exec,
|
||||
|
||||
pdfinfo = PdfInfo(output_file)
|
||||
|
||||
pdfimage = pdfinfo[0]['images'][0]
|
||||
pdfimage = pdfinfo[0].images[0]
|
||||
|
||||
if input_file.endswith('.png'):
|
||||
assert pdfimage['enc'] != 'jpeg', \
|
||||
@@ -947,7 +947,7 @@ def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec,
|
||||
|
||||
pdfinfo = PdfInfo(output_file)
|
||||
|
||||
pdfimage = pdfinfo[0]['images'][0]
|
||||
pdfimage = pdfinfo[0].images[0]
|
||||
|
||||
if compression == 'jpeg':
|
||||
assert pdfimage['enc'] == 'jpeg'
|
||||
|
||||
@@ -31,8 +31,8 @@ def test_single_page_text(outdir):
|
||||
assert len(pdfinfo) == 1
|
||||
page = pdfinfo[0]
|
||||
|
||||
assert page['has_text']
|
||||
assert len(page['images']) == 0
|
||||
assert page.has_text
|
||||
assert len(page.images) == 0
|
||||
|
||||
|
||||
def test_single_page_image(outdir):
|
||||
@@ -58,10 +58,10 @@ def test_single_page_image(outdir):
|
||||
assert len(pdfinfo) == 1
|
||||
page = pdfinfo[0]
|
||||
|
||||
assert not page['has_text']
|
||||
assert len(page['images']) == 1
|
||||
assert not page.has_text
|
||||
assert len(page.images) == 1
|
||||
|
||||
pdfimage = page['images'][0]
|
||||
pdfimage = page.images[0]
|
||||
assert pdfimage['width'] == 8
|
||||
assert pdfimage['color'] == 'gray'
|
||||
|
||||
@@ -85,7 +85,7 @@ def test_single_page_inline_image(outdir):
|
||||
|
||||
pdfinfo = pageinfo.PdfInfo(filename)
|
||||
print(pdfinfo)
|
||||
pdfimage = pdfinfo[0]['images'][0]
|
||||
pdfimage = pdfinfo[0].images[0]
|
||||
assert (pdfimage['dpi_w'] - 8) < 1e-5
|
||||
assert pdfimage['color'] != '-'
|
||||
assert pdfimage['width'] == 8
|
||||
@@ -96,7 +96,7 @@ def test_jpeg(resources, outdir):
|
||||
|
||||
pdfinfo = pageinfo.PdfInfo(filename)
|
||||
|
||||
pdfimage = pdfinfo[0]['images'][0]
|
||||
pdfimage = pdfinfo[0].images[0]
|
||||
assert pdfimage['enc'] == 'jpeg'
|
||||
assert (pdfimage['dpi_w'] - 150) < 1e-5
|
||||
|
||||
@@ -105,7 +105,7 @@ def test_form_xobject(resources):
|
||||
filename = resources / 'formxobject.pdf'
|
||||
|
||||
pdfinfo = pageinfo.PdfInfo(filename)
|
||||
pdfimage = pdfinfo[0]['images'][0]
|
||||
pdfimage = pdfinfo[0].images[0]
|
||||
assert pdfimage['width'] == 50
|
||||
|
||||
|
||||
@@ -113,5 +113,5 @@ def test_no_contents(resources):
|
||||
filename = resources / 'no_contents.pdf'
|
||||
|
||||
pdfinfo = pageinfo.PdfInfo(filename)
|
||||
assert len(pdfinfo[0]['images']) == 0
|
||||
assert pdfinfo[0]['has_text'] == False
|
||||
assert len(pdfinfo[0].images) == 0
|
||||
assert pdfinfo[0].has_text == False
|
||||
Reference in New Issue
Block a user