mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-19 03:58:06 -04:00
Refactor PdfInfo(str(filename)) -> PdfInfo(filename)
This commit is contained in:
@@ -9,6 +9,7 @@ import sys
|
||||
import PyPDF2 as pypdf
|
||||
from collections import namedtuple
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
matrix_mult = pypdf.pdf.utils.matrixMultiply
|
||||
@@ -517,6 +518,8 @@ def _pdf_get_pageinfo(infile, pageno: int):
|
||||
|
||||
|
||||
def pdf_get_all_pageinfo(infile):
    """Collect the page info for every page of a PDF.

    A ``pathlib.Path`` argument is converted to ``str`` first; any other
    value is handed on unchanged to ``pypdf.PdfFileReader`` and
    ``_pdf_get_pageinfo``.

    Returns a list holding one ``_pdf_get_pageinfo`` result per page, in
    page order (page numbers are zero-based, taken from ``numPages``).
    """
    # Downstream calls receive a plain string rather than a Path object.
    source = str(infile) if isinstance(infile, Path) else infile

    # The reader is only used here to learn how many pages there are.
    page_count = pypdf.PdfFileReader(source).numPages

    pages = []
    for pageno in range(page_count):
        pages.append(_pdf_get_pageinfo(source, pageno))
    return pages
|
||||
|
||||
|
||||
@@ -130,6 +130,6 @@ def run_ocrmypdf(input_file, output_file, *args, env=None):
|
||||
@pytest.helpers.register
|
||||
def first_page_dimensions(pdf):
|
||||
from ocrmypdf import pageinfo
|
||||
info = pageinfo.pdf_get_all_pageinfo(str(pdf))
|
||||
info = pageinfo.PdfInfo(pdf)
|
||||
page0 = info[0]
|
||||
return (page0['width_inches'], page0['height_inches'])
|
||||
|
||||
@@ -230,7 +230,7 @@ def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):
|
||||
'-f',
|
||||
'--pdf-renderer', renderer, env=spoof_tesseract_cache)
|
||||
|
||||
pdfinfo = PdfInfo(str(oversampled_pdf))
|
||||
pdfinfo = PdfInfo(oversampled_pdf)
|
||||
|
||||
print(pdfinfo[0]['xres'])
|
||||
assert abs(pdfinfo[0]['xres'] - 350) < 1
|
||||
@@ -358,14 +358,14 @@ def test_autorotate_threshold(
|
||||
def test_ocr_timeout(renderer, resources, outpdf):
|
||||
out = check_ocrmypdf(resources / 'skew.pdf', outpdf,
|
||||
'--tesseract-timeout', '1.0')
|
||||
pdfinfo = PdfInfo(str(out))
|
||||
pdfinfo = PdfInfo(out)
|
||||
assert not pdfinfo[0]['has_text']
|
||||
|
||||
|
||||
def test_skip_big(spoof_tesseract_cache, resources, outpdf):
|
||||
out = check_ocrmypdf(resources / 'enormous.pdf', outpdf,
|
||||
'--skip-big', '10', env=spoof_tesseract_cache)
|
||||
pdfinfo = PdfInfo(str(out))
|
||||
pdfinfo = PdfInfo(out)
|
||||
assert not pdfinfo[0]['has_text']
|
||||
|
||||
|
||||
@@ -562,14 +562,14 @@ def test_algo4(resources, no_outpdf):
|
||||
def test_non_square_resolution(renderer, spoof_tesseract_cache,
|
||||
resources, outpdf):
|
||||
# Confirm input image is non-square resolution
|
||||
in_pageinfo = PdfInfo(str(resources / 'aspect.pdf'))
|
||||
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
|
||||
assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']
|
||||
|
||||
check_ocrmypdf(
|
||||
resources / 'aspect.pdf', outpdf,
|
||||
'--pdf-renderer', renderer, env=spoof_tesseract_cache)
|
||||
|
||||
out_pageinfo = PdfInfo(str(outpdf))
|
||||
out_pageinfo = PdfInfo(outpdf)
|
||||
|
||||
# Confirm resolution was kept the same
|
||||
assert in_pageinfo[0]['xres'] == out_pageinfo[0]['xres']
|
||||
@@ -585,7 +585,7 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
|
||||
from math import isclose
|
||||
|
||||
# Confirm input image is non-square resolution
|
||||
in_pageinfo = PdfInfo(str(resources / 'aspect.pdf'))
|
||||
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
|
||||
assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']
|
||||
|
||||
# --force-ocr means forced conversion to square resolution
|
||||
@@ -594,7 +594,7 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
|
||||
'--force-ocr',
|
||||
'--pdf-renderer', renderer, env=spoof_tesseract_cache)
|
||||
|
||||
out_pageinfo = PdfInfo(str(outpdf))
|
||||
out_pageinfo = PdfInfo(outpdf)
|
||||
|
||||
in_p0, out_p0 = in_pageinfo[0], out_pageinfo[0]
|
||||
|
||||
@@ -628,7 +628,7 @@ def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf):
|
||||
'--pdf-renderer', 'hocr',
|
||||
env=spoof_tesseract_cache)
|
||||
|
||||
out_pageinfo = PdfInfo(str(out))
|
||||
out_pageinfo = PdfInfo(out)
|
||||
assert out_pageinfo[0]['images'][0]['enc'] == 'jbig2'
|
||||
|
||||
|
||||
@@ -713,7 +713,7 @@ def test_rotated_skew_timeout(resources, outpdf):
|
||||
'--pdf-renderer', 'hocr',
|
||||
'--deskew', '--tesseract-timeout', '0')
|
||||
|
||||
out_pageinfo = PdfInfo(str(out))[0]
|
||||
out_pageinfo = PdfInfo(out)[0]
|
||||
|
||||
assert out_pageinfo['height_pixels'] > out_pageinfo['width_pixels'], \
|
||||
"Expected the output page to be portrait"
|
||||
@@ -970,7 +970,7 @@ def test_sidecar_pagecount(spoof_tesseract_cache, resources, outpdf):
|
||||
'--sidecar', sidecar,
|
||||
env=spoof_tesseract_cache)
|
||||
|
||||
pdfinfo = PdfInfo(str(resources / 'multipage.pdf'))
|
||||
pdfinfo = PdfInfo(resources / 'multipage.pdf')
|
||||
num_pages = len(pdfinfo)
|
||||
|
||||
with open(sidecar, 'r') as f:
|
||||
|
||||
@@ -26,7 +26,7 @@ def test_single_page_text(outdir):
|
||||
pdf.showPage()
|
||||
pdf.save()
|
||||
|
||||
pdfinfo = pageinfo.PdfInfo(str(filename))
|
||||
pdfinfo = pageinfo.PdfInfo(filename)
|
||||
|
||||
assert len(pdfinfo) == 1
|
||||
page = pdfinfo[0]
|
||||
@@ -53,7 +53,7 @@ def test_single_page_image(outdir):
|
||||
layout_fun=layout_fun)
|
||||
filename.write_bytes(pdf_bytes)
|
||||
|
||||
pdfinfo = pageinfo.PdfInfo(str(filename))
|
||||
pdfinfo = pageinfo.PdfInfo(filename)
|
||||
|
||||
assert len(pdfinfo) == 1
|
||||
page = pdfinfo[0]
|
||||
@@ -83,7 +83,7 @@ def test_single_page_inline_image(outdir):
|
||||
pdf.showPage()
|
||||
pdf.save()
|
||||
|
||||
pdfinfo = pageinfo.PdfInfo(str(filename))
|
||||
pdfinfo = pageinfo.PdfInfo(filename)
|
||||
print(pdfinfo)
|
||||
pdfimage = pdfinfo[0]['images'][0]
|
||||
assert (pdfimage['dpi_w'] - 8) < 1e-5
|
||||
@@ -94,7 +94,7 @@ def test_single_page_inline_image(outdir):
|
||||
def test_jpeg(resources, outdir):
|
||||
filename = resources / 'c02-22.pdf'
|
||||
|
||||
pdfinfo = pageinfo.PdfInfo(str(filename))
|
||||
pdfinfo = pageinfo.PdfInfo(filename)
|
||||
|
||||
pdfimage = pdfinfo[0]['images'][0]
|
||||
assert pdfimage['enc'] == 'jpeg'
|
||||
@@ -104,7 +104,7 @@ def test_jpeg(resources, outdir):
|
||||
def test_form_xobject(resources):
|
||||
filename = resources / 'formxobject.pdf'
|
||||
|
||||
pdfinfo = pageinfo.PdfInfo(str(filename))
|
||||
pdfinfo = pageinfo.PdfInfo(filename)
|
||||
pdfimage = pdfinfo[0]['images'][0]
|
||||
assert pdfimage['width'] == 50
|
||||
|
||||
@@ -112,6 +112,6 @@ def test_form_xobject(resources):
|
||||
def test_no_contents(resources):
|
||||
filename = resources / 'no_contents.pdf'
|
||||
|
||||
pdfinfo = pageinfo.PdfInfo(str(filename))
|
||||
pdfinfo = pageinfo.PdfInfo(filename)
|
||||
assert len(pdfinfo[0]['images']) == 0
|
||||
assert pdfinfo[0]['has_text'] == False
|
||||
@@ -96,9 +96,9 @@ def test_skip_pages_does_not_replicate(
|
||||
env=ensure_tess4
|
||||
)
|
||||
|
||||
info_in = pageinfo.pdf_get_all_pageinfo(str(infile))
|
||||
info_in = pageinfo.PdfInfo(infile)
|
||||
|
||||
info = pageinfo.pdf_get_all_pageinfo(str(outpdf))
|
||||
info = pageinfo.PdfInfo(outpdf)
|
||||
for page in info:
|
||||
assert len(page['images']) == 1, "skipped page was replicated"
|
||||
|
||||
@@ -115,6 +115,6 @@ def test_content_preservation(ensure_tess4, resources, outpdf):
|
||||
env=ensure_tess4
|
||||
)
|
||||
|
||||
info = pageinfo.pdf_get_all_pageinfo(str(outpdf))
|
||||
info = pageinfo.PdfInfo(outpdf)
|
||||
page = info[0]
|
||||
assert len(page['images']) > 1, "masked were rasterized"
|
||||
Reference in New Issue
Block a user