mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-07 06:07:58 -04:00
Merge branch 'pr/pajowu/1448'
This commit is contained in:
@@ -61,19 +61,31 @@ def strip_invisible_text(pdf: Pdf, page: Page):
|
||||
stream = []
|
||||
in_text_obj = False
|
||||
render_mode = 0
|
||||
render_mode_stack = []
|
||||
text_objects = []
|
||||
|
||||
for operands, operator in parse_content_stream(page, ''):
|
||||
if operator == Operator('Tr'):
|
||||
render_mode = operands[0]
|
||||
|
||||
if operator == Operator('q'):
|
||||
render_mode_stack.append(render_mode)
|
||||
|
||||
if operator == Operator('Q'):
|
||||
try:
|
||||
render_mode = render_mode_stack.pop()
|
||||
except IndexError:
|
||||
# Stack underflow: content stream is malformed
|
||||
# but try to carry on
|
||||
pass
|
||||
|
||||
if not in_text_obj:
|
||||
if operator == Operator('BT'):
|
||||
in_text_obj = True
|
||||
render_mode = 0
|
||||
text_objects.append((operands, operator))
|
||||
else:
|
||||
stream.append((operands, operator))
|
||||
else:
|
||||
if operator == Operator('Tr'):
|
||||
render_mode = operands[0]
|
||||
text_objects.append((operands, operator))
|
||||
if operator == Operator('ET'):
|
||||
in_text_obj = False
|
||||
|
||||
@@ -40,3 +40,72 @@ def test_links(resources, outpdf):
|
||||
p2 = pdf.pages[1]
|
||||
assert p1.Annots[0].A.D[0].objgen == p2.objgen
|
||||
assert p2.Annots[0].A.D[0].objgen == p1.objgen
|
||||
|
||||
|
||||
def test_strip_invisble_text():
    """Check that ``strip_invisible_text`` removes only invisible text.

    The page content stream is assembled from four text objects:

    1. ``invisible`` drawn after ``3 Tr`` inside a ``q``/``Q`` pair,
    2. ``visible`` drawn after the ``Q``, which the test expects to survive,
    3. ``invisible`` drawn in a text object that itself issues ``3 Tr``,
    4. ``invisible`` drawn in a following text object with no ``Tr`` of its
       own, which the test still expects to be removed.

    After stripping, the number of ``visible`` strings must be unchanged and
    no ``invisible`` string may remain.
    """
    pdf = pikepdf.Pdf.new()
    page = pdf.add_blank_page()

    def instruction(operands, operator_name):
        # Shorthand for one content stream instruction.
        return pikepdf.ContentStreamInstruction(
            operands, pikepdf.Operator(operator_name)
        )

    def text_object(label, *, set_render_mode_3=False):
        # Build a BT ... ET text object that shows ``label`` once, optionally
        # issuing ``3 Tr`` as the first operator inside the text object.
        instructions = [instruction((), 'BT')]
        if set_render_mode_3:
            instructions.append(instruction([3], 'Tr'))
        instructions.extend(
            [
                instruction((pikepdf.Name('/F0'), 12), 'Tf'),
                instruction((288, 720), 'Td'),
                instruction((pikepdf.String(label),), 'Tj'),
                instruction((), 'ET'),
            ]
        )
        return instructions

    stream = [
        instruction([], 'q'),
        instruction([3], 'Tr'),
        *text_object('invisible'),
        instruction([], 'Q'),
        *text_object('visible'),
        *text_object('invisible', set_render_mode_3=True),
        *text_object('invisible'),
    ]
    content_stream = pikepdf.unparse_content_stream(stream)
    page.Contents = pikepdf.Stream(pdf, content_stream)

    def count(string, page):
        # Number of Tj operators on ``page`` whose operand equals ``string``.
        target = pikepdf.String(string)
        show_text = pikepdf.Operator('Tj')
        return sum(
            1
            for operands, operator in pikepdf.parse_content_stream(page)
            if operator == show_text and operands[0] == target
        )

    nr_visible_pre = count('visible', page)
    nr_invisible_pre = count('invisible', page)
    # Guard against a vacuous pass: the fixture must really contain the text
    # this test reasons about before anything is stripped.
    assert nr_visible_pre == 1, (
        f'Fixture should contain 1 visible text element, found {nr_visible_pre}'
    )
    assert nr_invisible_pre == 3, (
        f'Fixture should contain 3 invisible text elements, '
        f'found {nr_invisible_pre}'
    )

    ocrmypdf._graft.strip_invisible_text(pdf, page)

    nr_visible_post = count('visible', page)
    nr_invisible_post = count('invisible', page)
    assert nr_visible_pre == nr_visible_post, (
        f'Number of visible text elements changed: '
        f'{nr_visible_pre} before, {nr_visible_post} after'
    )
    assert nr_invisible_post == 0, (
        f'{nr_invisible_post} invisible text elements left after stripping'
    )
|
||||
|
||||
Reference in New Issue
Block a user