mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-04 12:48:02 -04:00
pdfinfo: Refactor pageinfo dictionary into a class
This commit is contained in:
@@ -150,7 +150,7 @@ def get_pdfinfo(
|
||||
progbar=False,
|
||||
max_workers=None,
|
||||
check_pages=None,
|
||||
):
|
||||
) -> PdfInfo:
|
||||
try:
|
||||
return PdfInfo(
|
||||
input_file,
|
||||
|
||||
@@ -556,12 +556,27 @@ def simplify_textboxes(miner, textbox_getter):
|
||||
yield TextboxInfo(box.bbox, visible, corrupt)
|
||||
|
||||
|
||||
class RawPageInfo:
|
||||
def __init__(self, pageno):
|
||||
self.pageno: int = pageno
|
||||
self.images: List = []
|
||||
self.textboxes: List = []
|
||||
self.has_text: Optional[bool] = False
|
||||
self.userunit: Decimal = Decimal(1.0)
|
||||
self.width_inches: Optional[Decimal] = None
|
||||
self.height_inches: Optional[Decimal] = None
|
||||
self.rotate: int = 0
|
||||
self.has_vector = False
|
||||
self.has_text = False
|
||||
self.dpi: Optional[Decimal] = None
|
||||
self.width_pixels: Optional[int] = None
|
||||
self.height_pixels: Optional[int] = None
|
||||
|
||||
|
||||
def _pdf_get_pageinfo(
|
||||
pdf, pageno: int, infile: PathLike, check_pages, detailed_analysis: bool
|
||||
):
|
||||
pageinfo: Dict[str, Any] = {}
|
||||
pageinfo['pageno'] = pageno
|
||||
pageinfo['images'] = []
|
||||
pageinfo = RawPageInfo(pageno)
|
||||
|
||||
page = pdf.pages[pageno]
|
||||
mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
|
||||
@@ -573,53 +588,53 @@ def _pdf_get_pageinfo(
|
||||
if check_this_page and detailed_analysis:
|
||||
pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5')
|
||||
miner = get_page_analysis(infile, pageno, pscript5_mode)
|
||||
pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
|
||||
bboxes = (box.bbox for box in pageinfo['textboxes'])
|
||||
pageinfo.textboxes = list(simplify_textboxes(miner, get_text_boxes))
|
||||
bboxes = (box.bbox for box in pageinfo.textboxes)
|
||||
|
||||
pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt)
|
||||
pageinfo.has_text = _page_has_text(bboxes, width_pt, height_pt)
|
||||
else:
|
||||
pageinfo['textboxes'] = []
|
||||
pageinfo['has_text'] = None # i.e. "no information"
|
||||
pageinfo.textboxes = []
|
||||
pageinfo.has_text = None # i.e. "no information"
|
||||
|
||||
userunit = page.get('/UserUnit', Decimal(1.0))
|
||||
if not isinstance(userunit, Decimal):
|
||||
userunit = Decimal(userunit)
|
||||
pageinfo['userunit'] = userunit
|
||||
pageinfo['width_inches'] = width_pt * userunit / Decimal(72.0)
|
||||
pageinfo['height_inches'] = height_pt * userunit / Decimal(72.0)
|
||||
pageinfo.userunit = userunit
|
||||
pageinfo.width_inches = width_pt * userunit / Decimal(72.0)
|
||||
pageinfo.height_inches = height_pt * userunit / Decimal(72.0)
|
||||
|
||||
try:
|
||||
pageinfo['rotate'] = int(page['/Rotate'])
|
||||
pageinfo.rotate = int(page['/Rotate'])
|
||||
except KeyError:
|
||||
pageinfo['rotate'] = 0
|
||||
pageinfo.rotate = 0
|
||||
|
||||
userunit_shorthand = (userunit, 0, 0, userunit, 0, 0)
|
||||
|
||||
if check_this_page:
|
||||
pageinfo['has_vector'] = False
|
||||
pageinfo['has_text'] = False
|
||||
pageinfo['images'] = []
|
||||
pageinfo.has_vector = False
|
||||
pageinfo.has_text = False
|
||||
pageinfo.images = []
|
||||
for ci in _process_content_streams(
|
||||
pdf=pdf, container=page, shorthand=userunit_shorthand
|
||||
):
|
||||
if isinstance(ci, VectorMarker):
|
||||
pageinfo['has_vector'] = True
|
||||
pageinfo.has_vector = True
|
||||
elif isinstance(ci, TextMarker):
|
||||
pageinfo['has_text'] = True
|
||||
pageinfo.has_text = True
|
||||
elif isinstance(ci, ImageInfo):
|
||||
pageinfo['images'].append(ci)
|
||||
pageinfo.images.append(ci)
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
else:
|
||||
pageinfo['has_vector'] = None # i.e. "no information"
|
||||
pageinfo['has_text'] = None
|
||||
pageinfo['images'] = None
|
||||
pageinfo.has_vector = None # i.e. "no information"
|
||||
pageinfo.has_text = None
|
||||
pageinfo.images = None
|
||||
|
||||
if pageinfo['images']:
|
||||
dpi = Resolution(0.0, 0.0).take_max(image.dpi for image in pageinfo['images'])
|
||||
pageinfo['dpi'] = dpi
|
||||
pageinfo['width_pixels'] = int(round(dpi.x * float(pageinfo['width_inches'])))
|
||||
pageinfo['height_pixels'] = int(round(dpi.y * float(pageinfo['height_inches'])))
|
||||
if pageinfo.images:
|
||||
dpi = Resolution(0.0, 0.0).take_max(image.dpi for image in pageinfo.images)
|
||||
pageinfo.dpi = dpi
|
||||
pageinfo.width_pixels = int(round(dpi.x * float(pageinfo.width_inches)))
|
||||
pageinfo.height_pixels = int(round(dpi.y * float(pageinfo.height_inches)))
|
||||
|
||||
return pageinfo
|
||||
|
||||
@@ -707,25 +722,25 @@ class PageInfo:
|
||||
|
||||
@property
|
||||
def has_text(self) -> bool:
|
||||
return self._pageinfo['has_text']
|
||||
return self._pageinfo.has_text
|
||||
|
||||
@property
|
||||
def has_corrupt_text(self) -> bool:
|
||||
if not self._detailed_analysis:
|
||||
raise NotImplementedError('Did not do detailed analysis')
|
||||
return any(tbox.is_corrupt for tbox in self._pageinfo['textboxes'])
|
||||
return any(tbox.is_corrupt for tbox in self._pageinfo.textboxes)
|
||||
|
||||
@property
|
||||
def has_vector(self) -> bool:
|
||||
return self._pageinfo['has_vector']
|
||||
return self._pageinfo.has_vector
|
||||
|
||||
@property
|
||||
def width_inches(self) -> Decimal:
|
||||
return self._pageinfo['width_inches']
|
||||
return self._pageinfo.width_inches
|
||||
|
||||
@property
|
||||
def height_inches(self) -> Decimal:
|
||||
return self._pageinfo['height_inches']
|
||||
return self._pageinfo.height_inches
|
||||
|
||||
@property
|
||||
def width_pixels(self) -> int:
|
||||
@@ -737,18 +752,18 @@ class PageInfo:
|
||||
|
||||
@property
|
||||
def rotation(self) -> int:
|
||||
return self._pageinfo.get('rotate', None)
|
||||
return self._pageinfo.rotate
|
||||
|
||||
@rotation.setter
|
||||
def rotation(self, value):
|
||||
if value in (0, 90, 180, 270, 360, -90, -180, -270):
|
||||
self._pageinfo['rotate'] = value
|
||||
self._pageinfo.rotate = value
|
||||
else:
|
||||
raise ValueError("rotation must be a cardinal angle")
|
||||
|
||||
@property
|
||||
def images(self):
|
||||
return self._pageinfo['images']
|
||||
return self._pageinfo.images
|
||||
|
||||
def get_textareas(
|
||||
self, visible: Optional[bool] = None, corrupt: Optional[bool] = None
|
||||
@@ -763,24 +778,26 @@ class PageInfo:
|
||||
result = False
|
||||
return result
|
||||
|
||||
if 'textboxes' not in self._pageinfo:
|
||||
if not self._pageinfo.textboxes:
|
||||
if visible is not None and corrupt is not None:
|
||||
raise NotImplementedError('Incomplete information on textboxes')
|
||||
return self._pageinfo['bboxes']
|
||||
return self._pageinfo.textboxes
|
||||
|
||||
return (
|
||||
obj.bbox
|
||||
for obj in self._pageinfo['textboxes']
|
||||
for obj in self._pageinfo.textboxes
|
||||
if predicate(obj, visible, corrupt)
|
||||
)
|
||||
|
||||
@property
|
||||
def dpi(self) -> Resolution:
|
||||
return self._pageinfo.get('dpi', Resolution(0.0, 0.0))
|
||||
if self._pageinfo.dpi is None:
|
||||
return Resolution(0.0, 0.0)
|
||||
return self._pageinfo.dpi
|
||||
|
||||
@property
|
||||
def userunit(self) -> Decimal:
|
||||
return self._pageinfo.get('userunit', None)
|
||||
return self._pageinfo.userunit
|
||||
|
||||
@property
|
||||
def min_version(self) -> str:
|
||||
|
||||
@@ -92,10 +92,10 @@ def test_skip_ocr(resources, outpdf):
|
||||
|
||||
def test_redo_ocr(resources, outpdf):
|
||||
in_ = resources / 'graph_ocred.pdf'
|
||||
before = PdfInfo(in_)
|
||||
before = PdfInfo(in_, detailed_analysis=True)
|
||||
out = outpdf
|
||||
out = check_ocrmypdf(in_, out, '--redo-ocr')
|
||||
after = PdfInfo(out)
|
||||
after = PdfInfo(out, detailed_analysis=True)
|
||||
assert before[0].has_text and after[0].has_text
|
||||
assert (
|
||||
before[0].get_textareas() != after[0].get_textareas()
|
||||
|
||||
Reference in New Issue
Block a user