Merge branch 'pr/pajowu/1448'

This commit is contained in:
James R. Barlow
2025-01-01 18:00:52 -08:00
2 changed files with 84 additions and 3 deletions

View File

@@ -61,19 +61,31 @@ def strip_invisible_text(pdf: Pdf, page: Page):
stream = []
in_text_obj = False
render_mode = 0
render_mode_stack = []
text_objects = []
for operands, operator in parse_content_stream(page, ''):
if operator == Operator('Tr'):
render_mode = operands[0]
if operator == Operator('q'):
render_mode_stack.append(render_mode)
if operator == Operator('Q'):
try:
render_mode = render_mode_stack.pop()
except IndexError:
# Stack underflow: content stream is malformed
# but try to carry on
pass
if not in_text_obj:
if operator == Operator('BT'):
in_text_obj = True
render_mode = 0
text_objects.append((operands, operator))
else:
stream.append((operands, operator))
else:
if operator == Operator('Tr'):
render_mode = operands[0]
text_objects.append((operands, operator))
if operator == Operator('ET'):
in_text_obj = False

View File

@@ -40,3 +40,72 @@ def test_links(resources, outpdf):
p2 = pdf.pages[1]
assert p1.Annots[0].A.D[0].objgen == p2.objgen
assert p2.Annots[0].A.D[0].objgen == p1.objgen
def test_strip_invisble_text():
    """Check that strip_invisible_text removes invisible text objects only.

    The page content stream built here is laid out as:

    1. ``q``, ``3 Tr``, a text object showing ``invisible``, ``Q`` -- the
       render mode is set outside the text object, inside a saved
       graphics state.
    2. A text object showing ``visible``, placed after the ``Q``.
    3. A text object showing ``invisible`` that issues ``3 Tr`` itself,
       between ``BT`` and ``ET``.
    4. A text object showing ``invisible`` with no ``Tr`` of its own
       (presumably still invisible because the render mode set in step 3
       is not reset by ``ET``/``BT`` -- confirm against the PDF spec).

    After stripping, the number of ``visible`` show-text operations must
    be unchanged and no ``invisible`` ones may remain.
    """
    pdf = pikepdf.Pdf.new()
    page = pdf.add_blank_page()

    def text_object(text, *, set_render_mode=False):
        # Build one BT ... ET text object that shows ``text``; optionally
        # emit "3 Tr" right after BT so the mode is set inside the object.
        instructions = [
            pikepdf.ContentStreamInstruction((), pikepdf.Operator('BT')),
        ]
        if set_render_mode:
            instructions.append(
                pikepdf.ContentStreamInstruction([3], pikepdf.Operator('Tr'))
            )
        instructions.extend(
            [
                pikepdf.ContentStreamInstruction(
                    (pikepdf.Name('/F0'), 12), pikepdf.Operator('Tf')
                ),
                pikepdf.ContentStreamInstruction(
                    (288, 720), pikepdf.Operator('Td')
                ),
                pikepdf.ContentStreamInstruction(
                    (pikepdf.String(text),), pikepdf.Operator('Tj')
                ),
                pikepdf.ContentStreamInstruction((), pikepdf.Operator('ET')),
            ]
        )
        return instructions

    visible_text = text_object('visible')
    invisible_text = text_object('invisible')
    invisible_text_setting_tr = text_object('invisible', set_render_mode=True)

    stream = [
        pikepdf.ContentStreamInstruction([], pikepdf.Operator('q')),
        pikepdf.ContentStreamInstruction([3], pikepdf.Operator('Tr')),
        *invisible_text,
        pikepdf.ContentStreamInstruction([], pikepdf.Operator('Q')),
        *visible_text,
        *invisible_text_setting_tr,
        *invisible_text,
    ]
    content_stream = pikepdf.unparse_content_stream(stream)
    page.Contents = pikepdf.Stream(pdf, content_stream)

    def count(string, page):
        # Number of Tj operations on ``page`` whose operand equals ``string``.
        show_text = pikepdf.Operator('Tj')
        wanted = pikepdf.String(string)
        return sum(
            1
            for operands, operator in pikepdf.parse_content_stream(page)
            if operator == show_text and operands[0] == wanted
        )

    nr_visible_pre = count('visible', page)
    ocrmypdf._graft.strip_invisible_text(pdf, page)
    nr_visible_post = count('visible', page)

    # Assertion messages are shown on failure, so they describe what went wrong.
    assert (
        nr_visible_pre == nr_visible_post
    ), 'Number of visible text elements changed'
    assert count('invisible', page) == 0, 'Invisible text elements were left'