diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index 23b22761..0e0eb462 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -798,7 +798,7 @@ def ocr_tesseract_textonly_pdf( def get_docinfo(base_pdf, options): def from_document_info(key): try: - s = base_pdf.metadata[key] + s = base_pdf.docinfo[key] return str(s) except (KeyError, TypeError): return '' diff --git a/src/ocrmypdf/optimize.py b/src/ocrmypdf/optimize.py index 323a8656..6cc24954 100644 --- a/src/ocrmypdf/optimize.py +++ b/src/ocrmypdf/optimize.py @@ -299,8 +299,9 @@ def convert_to_jbig2(pike, jbig2_groups, root, log, options): jbig2_im_data = jbig2_im_file.read_bytes() im_obj = pike.get_object(xref, 0) im_obj.write( - jbig2_im_data, pikepdf.Name('/JBIG2Decode'), - jbig2_globals_dict + jbig2_im_data, + filter=pikepdf.Name('/JBIG2Decode'), + decode_parms=jbig2_globals_dict ) diff --git a/src/ocrmypdf/pdfinfo/__init__.py b/src/ocrmypdf/pdfinfo/__init__.py index 29f7b70f..9832c0f5 100644 --- a/src/ocrmypdf/pdfinfo/__init__.py +++ b/src/ocrmypdf/pdfinfo/__init__.py @@ -545,7 +545,7 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext): fspath(infile), pageno, xmltext=xmltext, height=height_pt) pageinfo['bboxes'] = bboxes else: - pscript5_mode = str(pdf.metadata.get('/Creator')).startswith('PScript5') + pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5') miner = get_page_analysis(infile, pageno, pscript5_mode) pageinfo['textboxes'] = list(simplify_textboxes(miner)) bboxes = (box.bbox for box in pageinfo['textboxes']) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index dbfed2bc..8e027fa5 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -25,7 +25,7 @@ from unittest.mock import patch import datetime import pikepdf -from pikepdf.models.metadata import encode_pdf_date, decode_pdf_date +from pikepdf.models.metadata import decode_pdf_date from ocrmypdf.exceptions import ExitCode from ocrmypdf.helpers import fspath @@ -67,7 +67,7 @@ def test_preserve_metadata(spoof_tesseract_noop, output_type, pdf_after = pikepdf.open(output) for key in ('/Title', '/Author'): - assert pdf_before.metadata[key] == pdf_after.metadata[key] + assert pdf_before.docinfo[key] == pdf_after.docinfo[key] pdfa_info = file_claims_pdfa(str(output)) assert pdfa_info['output'] == output_type @@ -94,12 +94,12 @@ def test_override_metadata(spoof_tesseract_noop, output_type, resources, before = pikepdf.open(input_file) after = pikepdf.open(outpdf) - assert after.metadata.Title == german, after.metadata - assert after.metadata.Author == chinese, after.metadata - assert after.metadata.get('/Keywords', '') == '' + assert after.docinfo.Title == german, after.docinfo + assert after.docinfo.Author == chinese, after.docinfo + assert after.docinfo.get('/Keywords', '') == '' - before_date = decode_pdf_date(str(before.metadata.CreationDate)) - after_date = decode_pdf_date(str(after.metadata.CreationDate)) + before_date = decode_pdf_date(str(before.docinfo.CreationDate)) + after_date = decode_pdf_date(str(after.docinfo.CreationDate)) assert before_date == after_date pdfa_info = file_claims_pdfa(outpdf)