Refactor PdfInfo(str(filename)) -> PdfInfo(filename)

This commit is contained in:
James R. Barlow
2017-05-18 16:43:50 -07:00
parent 6a0b68298f
commit cd04ae6949
5 changed files with 23 additions and 20 deletions

View File

@@ -9,6 +9,7 @@ import sys
import PyPDF2 as pypdf
from collections import namedtuple
import warnings
from pathlib import Path
matrix_mult = pypdf.pdf.utils.matrixMultiply
@@ -517,6 +518,8 @@ def _pdf_get_pageinfo(infile, pageno: int):
def pdf_get_all_pageinfo(infile):
    """Collect per-page information for every page of a PDF.

    A ``pathlib.Path`` argument is converted to ``str`` first; any other
    value is passed through unchanged to ``pypdf.PdfFileReader`` and to
    ``_pdf_get_pageinfo``.

    Returns a list holding the result of ``_pdf_get_pageinfo(infile, n)``
    for each page index ``n`` in ``range(numPages)``, in page order.
    """
    # NOTE(review): presumably PyPDF2 does not accept Path objects
    # directly, hence the str() conversion -- confirm.
    infile = str(infile) if isinstance(infile, Path) else infile

    reader = pypdf.PdfFileReader(infile)
    pages = []
    for pageno in range(reader.numPages):
        pages.append(_pdf_get_pageinfo(infile, pageno))
    return pages

View File

@@ -130,6 +130,6 @@ def run_ocrmypdf(input_file, output_file, *args, env=None):
@pytest.helpers.register
def first_page_dimensions(pdf):
from ocrmypdf import pageinfo
info = pageinfo.pdf_get_all_pageinfo(str(pdf))
info = pageinfo.PdfInfo(pdf)
page0 = info[0]
return (page0['width_inches'], page0['height_inches'])

View File

@@ -230,7 +230,7 @@ def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):
'-f',
'--pdf-renderer', renderer, env=spoof_tesseract_cache)
pdfinfo = PdfInfo(str(oversampled_pdf))
pdfinfo = PdfInfo(oversampled_pdf)
print(pdfinfo[0]['xres'])
assert abs(pdfinfo[0]['xres'] - 350) < 1
@@ -358,14 +358,14 @@ def test_autorotate_threshold(
def test_ocr_timeout(renderer, resources, outpdf):
out = check_ocrmypdf(resources / 'skew.pdf', outpdf,
'--tesseract-timeout', '1.0')
pdfinfo = PdfInfo(str(out))
pdfinfo = PdfInfo(out)
assert not pdfinfo[0]['has_text']
def test_skip_big(spoof_tesseract_cache, resources, outpdf):
out = check_ocrmypdf(resources / 'enormous.pdf', outpdf,
'--skip-big', '10', env=spoof_tesseract_cache)
pdfinfo = PdfInfo(str(out))
pdfinfo = PdfInfo(out)
assert not pdfinfo[0]['has_text']
@@ -562,14 +562,14 @@ def test_algo4(resources, no_outpdf):
def test_non_square_resolution(renderer, spoof_tesseract_cache,
resources, outpdf):
# Confirm input image is non-square resolution
in_pageinfo = PdfInfo(str(resources / 'aspect.pdf'))
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']
check_ocrmypdf(
resources / 'aspect.pdf', outpdf,
'--pdf-renderer', renderer, env=spoof_tesseract_cache)
out_pageinfo = PdfInfo(str(outpdf))
out_pageinfo = PdfInfo(outpdf)
# Confirm resolution was kept the same
assert in_pageinfo[0]['xres'] == out_pageinfo[0]['xres']
@@ -585,7 +585,7 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
from math import isclose
# Confirm input image is non-square resolution
in_pageinfo = PdfInfo(str(resources / 'aspect.pdf'))
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']
# --force-ocr means forced conversion to square resolution
@@ -594,7 +594,7 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
'--force-ocr',
'--pdf-renderer', renderer, env=spoof_tesseract_cache)
out_pageinfo = PdfInfo(str(outpdf))
out_pageinfo = PdfInfo(outpdf)
in_p0, out_p0 = in_pageinfo[0], out_pageinfo[0]
@@ -628,7 +628,7 @@ def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf):
'--pdf-renderer', 'hocr',
env=spoof_tesseract_cache)
out_pageinfo = PdfInfo(str(out))
out_pageinfo = PdfInfo(out)
assert out_pageinfo[0]['images'][0]['enc'] == 'jbig2'
@@ -713,7 +713,7 @@ def test_rotated_skew_timeout(resources, outpdf):
'--pdf-renderer', 'hocr',
'--deskew', '--tesseract-timeout', '0')
out_pageinfo = PdfInfo(str(out))[0]
out_pageinfo = PdfInfo(out)[0]
assert out_pageinfo['height_pixels'] > out_pageinfo['width_pixels'], \
"Expected the output page to be portrait"
@@ -970,7 +970,7 @@ def test_sidecar_pagecount(spoof_tesseract_cache, resources, outpdf):
'--sidecar', sidecar,
env=spoof_tesseract_cache)
pdfinfo = PdfInfo(str(resources / 'multipage.pdf'))
pdfinfo = PdfInfo(resources / 'multipage.pdf')
num_pages = len(pdfinfo)
with open(sidecar, 'r') as f:

View File

@@ -26,7 +26,7 @@ def test_single_page_text(outdir):
pdf.showPage()
pdf.save()
pdfinfo = pageinfo.PdfInfo(str(filename))
pdfinfo = pageinfo.PdfInfo(filename)
assert len(pdfinfo) == 1
page = pdfinfo[0]
@@ -53,7 +53,7 @@ def test_single_page_image(outdir):
layout_fun=layout_fun)
filename.write_bytes(pdf_bytes)
pdfinfo = pageinfo.PdfInfo(str(filename))
pdfinfo = pageinfo.PdfInfo(filename)
assert len(pdfinfo) == 1
page = pdfinfo[0]
@@ -83,7 +83,7 @@ def test_single_page_inline_image(outdir):
pdf.showPage()
pdf.save()
pdfinfo = pageinfo.PdfInfo(str(filename))
pdfinfo = pageinfo.PdfInfo(filename)
print(pdfinfo)
pdfimage = pdfinfo[0]['images'][0]
assert (pdfimage['dpi_w'] - 8) < 1e-5
@@ -94,7 +94,7 @@ def test_single_page_inline_image(outdir):
def test_jpeg(resources, outdir):
filename = resources / 'c02-22.pdf'
pdfinfo = pageinfo.PdfInfo(str(filename))
pdfinfo = pageinfo.PdfInfo(filename)
pdfimage = pdfinfo[0]['images'][0]
assert pdfimage['enc'] == 'jpeg'
@@ -104,7 +104,7 @@ def test_jpeg(resources, outdir):
def test_form_xobject(resources):
filename = resources / 'formxobject.pdf'
pdfinfo = pageinfo.PdfInfo(str(filename))
pdfinfo = pageinfo.PdfInfo(filename)
pdfimage = pdfinfo[0]['images'][0]
assert pdfimage['width'] == 50
@@ -112,6 +112,6 @@ def test_form_xobject(resources):
def test_no_contents(resources):
filename = resources / 'no_contents.pdf'
pdfinfo = pageinfo.PdfInfo(str(filename))
pdfinfo = pageinfo.PdfInfo(filename)
assert len(pdfinfo[0]['images']) == 0
assert pdfinfo[0]['has_text'] == False

View File

@@ -96,9 +96,9 @@ def test_skip_pages_does_not_replicate(
env=ensure_tess4
)
info_in = pageinfo.pdf_get_all_pageinfo(str(infile))
info_in = pageinfo.PdfInfo(infile)
info = pageinfo.pdf_get_all_pageinfo(str(outpdf))
info = pageinfo.PdfInfo(outpdf)
for page in info:
assert len(page['images']) == 1, "skipped page was replicated"
@@ -115,6 +115,6 @@ def test_content_preservation(ensure_tess4, resources, outpdf):
env=ensure_tess4
)
info = pageinfo.pdf_get_all_pageinfo(str(outpdf))
info = pageinfo.PdfInfo(outpdf)
page = info[0]
assert len(page['images']) > 1, "masked were rasterized"