mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-02-08 13:22:34 -05:00
@@ -3,4 +3,4 @@ repos:
|
||||
rev: stable
|
||||
hooks:
|
||||
- id: black
|
||||
language_version: python3.6
|
||||
language_version: python3.7
|
||||
|
||||
@@ -490,3 +490,17 @@ To install all of the development and test requirements:
|
||||
pip install -r requirements/dev.txt -r requirements/test.txt
|
||||
|
||||
To add JBIG2 encoding, see :ref:`jbig2`.
|
||||
|
||||
Shell completions
|
||||
-----------------
|
||||
|
||||
Completions for ``bash`` and ``fish`` are available in the project's
|
||||
``misc/completion`` folder. The ``bash`` completions are likely ``zsh``
|
||||
compatible but this has not been confirmed. Package maintainers, please install
|
||||
these at the appropriate locations for your system.
|
||||
|
||||
To manually install the ``bash`` completion, copy ``misc/completion/ocrmypdf.bash`` to
|
||||
``/etc/bash_completion.d/ocrmypdf`` (rename the file).
|
||||
|
||||
To manually install the ``fish`` completion, copy ``misc/completion/ocrmypdf.fish`` to
|
||||
``~/.config/fish/completions/ocrmypdf.fish``.
|
||||
|
||||
@@ -13,6 +13,21 @@ Note that it is licensed under GPLv3, so scripts that ``import ocrmypdf`` and ar
|
||||
find: [^`]\#([0-9]{1,3})[^0-9]
|
||||
replace: `#$1 <https://github.com/jbarlow83/OCRmyPDF/issues/$1>`_
|
||||
|
||||
v8.3.0
|
||||
------
|
||||
|
||||
- Improved the strategy for updating pages when a new image of the page was produced. We now attempt to preserve more content from the original file, for annotations in particular.
|
||||
|
||||
- For PDFs with more than 100 pages and a sequence where one PDF page was replaced and one or more subsequent ones were skipped, an intermediate file would be corrupted while grafting OCR text, causing processing to fail.
|
||||
|
||||
- Previously, we resized the images produced by Ghostscript by a small number of pixels to ensure the output image size was exactly what we wanted. Having discovered a way to get Ghostscript to produce the exact image sizes we require, we eliminated the resizing step.
|
||||
|
||||
- Command line completions for ``bash`` are now available, in addition to ``fish``, both in ``misc/completion``. Package maintainers, please install these so users can take advantage.
|
||||
|
||||
- Updated requirements.
|
||||
|
||||
- pikepdf 1.3.0 is now required.
|
||||
|
||||
v8.2.4
|
||||
------
|
||||
|
||||
@@ -22,7 +37,7 @@ v8.2.4
|
||||
|
||||
- Minor optimization: we no longer traverse the table of contents to ensure all references in it are resolved, as changes to libqpdf have made this unnecessary.
|
||||
|
||||
- pikepdf 1.2.0 is now required
|
||||
- pikepdf 1.2.0 is now required.
|
||||
|
||||
v8.2.3
|
||||
------
|
||||
|
||||
87
misc/completion/ocrmypdf.bash
Normal file
87
misc/completion/ocrmypdf.bash
Normal file
@@ -0,0 +1,87 @@
|
||||
# ocrmypdf completion -*- shell-script -*-
|
||||
|
||||
# Programmable completion function for the ``ocrmypdf`` command line.
# Relies on helpers that are not defined here (_init_completion, _filedir,
# _ncpus) -- presumably supplied by the bash-completion package.
_ocrmypdf()
{
    local cur prev cword words split
    _init_completion -s || return

    # First, complete the *argument* of the option that precedes the cursor.
    case $prev in
        --version|-h|--help)
            # Nothing useful can follow these options.
            return
            ;;
        --user-words|--user-patterns|--tesseract-config)
            # These options take a file name.
            _filedir
            return
            ;;
        --output-type)
            COMPREPLY=( $( compgen -W 'pdfa pdf pdfa-1 pdfa-2 pdfa-3' -- \
                "$cur" ) )
            return
            ;;
        --pdf-renderer)
            COMPREPLY=( $( compgen -W 'auto hocr sandwich' -- "$cur" ) )
            return
            ;;
        --pdfa-image-compression)
            COMPREPLY=( $( compgen -W 'auto jpeg lossless' -- "$cur" ) )
            return
            ;;
        -O|--optimize|--tesseract-oem)
            COMPREPLY=( $( compgen -W '{0..3}' -- "$cur" ) )
            return
            ;;
        --jpeg-quality|--png-quality)
            COMPREPLY=( $( compgen -W '{0..100}' -- "$cur" ) )
            return
            ;;
        -l|--language)
            # COMPREPLY is used as scratch space for tesseract's raw output;
            # the ##*: expansion drops everything up to the last colon
            # (presumably the "List of available languages" header line).
            COMPREPLY=$( command tesseract --list-langs 2>/dev/null )
            COMPREPLY=( $( compgen -W '${COMPREPLY[@]##*:}' -- "$cur" ) )
            return
            ;;
        --image-dpi|--oversample|--skip-big|--max-image-mpixels|\
        --tesseract-timeout|--rotate-pages-threshold)
            # Free-form numbers: offer the current word extended by one digit.
            COMPREPLY=( $( compgen -P "$cur" -W '{0..9}' ) )
            return
            ;;
        -j|--jobs)
            # Offer 1 .. value printed by _ncpus.
            COMPREPLY=( $( compgen -W '{1..'$( _ncpus )'}' -- "$cur" ) )
            return
            ;;
        -v|--verbose)
            COMPREPLY=( $( compgen -W '{1..9}' -- "$cur" ) ) # max level ?
            return
            ;;
        --tesseract-pagesegmode)
            COMPREPLY=( $( compgen -W '{1..13}' -- "$cur" ) )
            return
            ;;
        --sidecar|--title|--author|--subject|--keywords|--unpaper-args)
            # argument required but no completions available
            return
            ;;
    esac

    # $split is set by _init_completion -s; when true there is nothing
    # further to offer here.
    $split && return

    if [[ $cur == -* ]]; then
        # Completing an option name. (--output-type used to be listed twice.)
        COMPREPLY=( $( compgen -W '--language --image-dpi --output-type
            --sidecar --version --jobs --quiet --verbose --title --author
            --subject --keywords --rotate-pages --remove-background --deskew
            --clean --clean-final --unpaper-args --oversample --remove-vectors
            --mask-barcodes --threshold --force-ocr --skip-text --redo-ocr
            --skip-big --jpeg-quality --png-quality --jbig2-lossy
            --max-image-mpixels --tesseract-config --tesseract-pagesegmode
            --help --tesseract-oem --pdf-renderer --tesseract-timeout
            --rotate-pages-threshold --pdfa-image-compression --user-words
            --user-patterns --keep-temporary-files --flowchart' \
            -- "$cur" ) )
        return
    else
        # Positional arguments are the input/output files.
        _filedir
        return
    fi
} &&
complete -F _ocrmypdf ocrmypdf
|
||||
|
||||
# ex: filetype=sh
|
||||
@@ -5,7 +5,7 @@ chardet == 3.0.4
|
||||
cffi == 1.12.2
|
||||
img2pdf == 0.3.3
|
||||
pdfminer.six == 20181108
|
||||
pikepdf == 1.2.0
|
||||
pikepdf == 1.3.0
|
||||
Pillow >= 5.0.0, != 5.1.0 ; sys_platform == "darwin"
|
||||
pycparser == 2.19
|
||||
python-xmp-toolkit == 2.0.1
|
||||
|
||||
2
setup.py
2
setup.py
@@ -99,7 +99,7 @@ setup(
|
||||
'cffi >= 1.9.1', # must be a setup and install requirement
|
||||
'img2pdf >= 0.3.0, < 0.4', # pure Python, so track HEAD closely
|
||||
'pdfminer.six == 20181108 ; sys_platform != "darwin"',
|
||||
'pikepdf >= 1.2.0, < 2',
|
||||
'pikepdf >= 1.3.0, < 2',
|
||||
'Pillow >= 4.0.0, != 5.1.0 ; sys_platform == "darwin"',
|
||||
# Pillow < 4 has BytesIO/TIFF bug w/img2pdf 0.2.3
|
||||
# block 5.1.0, broken wheels
|
||||
|
||||
@@ -15,6 +15,7 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from contextlib import suppress
|
||||
from itertools import groupby
|
||||
from pathlib import Path
|
||||
import os
|
||||
@@ -188,99 +189,6 @@ def _find_font(text, pdf_base):
|
||||
return None, None
|
||||
|
||||
|
||||
def _traverse_toc(pdf_base, visitor_fn, log):
    """
    Walk the table of contents, calling visitor_fn() at each node

    The /Outlines data structure is a messy data structure, but rather than
    navigating hierarchically we just track unique nodes. Enqueue nodes when
    we find them, and never visit them again. set() is awesome. We look for
    the two types of object in the table of contents that can be page bookmarks
    and update the page entry.

    :param pdf_base: the open PDF; must provide ``root`` and ``get_object()``
    :param visitor_fn: callable invoked as ``visitor_fn(pdf_base, node, log)``
        once for every outline node reached; may be falsy to only traverse
    :param log: logger used for debug output
    :return: None; returns immediately if the PDF has no /Outlines

    """

    visited = set()
    queue = set()
    link_keys = ('/Parent', '/First', '/Last', '/Prev', '/Next')

    if '/Outlines' not in pdf_base.root:
        return

    queue.add(pdf_base.root.Outlines.objgen)
    while queue:
        objgen = queue.pop()
        visited.add(objgen)
        node = pdf_base.get_object(objgen)
        log.debug('fix toc: exploring outline entries at %r', objgen)

        # Enumerate other nodes we could visit from here
        for key in link_keys:
            if key not in node:
                continue
            item = node[key]
            if not item.is_indirect:
                # Direct references are not allowed here, but it's not clear
                # what we should do if we find any. Removing them is an option:
                # node[key] = pdf_base.make_indirect(None)
                continue
            # Use a separate name so the objgen of the node currently being
            # visited is not clobbered by its neighbours.
            linked_objgen = item.objgen
            if linked_objgen not in visited:
                queue.add(linked_objgen)

        if visitor_fn:
            visitor_fn(pdf_base, node, log)
|
||||
|
||||
|
||||
def _fix_toc(pdf_base, pageref_remap, log):
    """Repair the table of contents

    Whenever we replace a page wholesale, it gets assigned a new objgen number
    and other references to it within the PDF become invalid, most notably in
    the table of contents (/Outlines in PDF-speak). In weave_layers we collect
    pageref_remap, a mapping that describes the new objgen number given an old
    one. (objgen is a tuple, and the gen is almost always zero.)

    It may ultimately be better to find a way to rebuild a page in place.

    :param pdf_base: the open PDF whose outline entries are updated in place
    :param pageref_remap: mapping of old page objgen -> new page objgen; when
        empty or None nothing is done
    :param log: logger; a warning is emitted for malformed entries instead of
        raising

    """

    if not pageref_remap:
        return

    invalid_toc_msg = "This file may contain invalid table of contents entries"

    def remap_dest(dest_node):
        """
        Inner helper function: change the objgen for any page from the old we
        invalidated to its new one.
        """
        try:
            pageref = dest_node[0]
            if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
                new_objgen = pageref_remap[pageref.objgen]
                dest_node[0] = pdf_base.get_object(new_objgen)
        except (IndexError, KeyError, TypeError) as e:
            # KeyError covers a destination dictionary with no /Type entry;
            # treat it like any other malformed entry rather than crashing.
            log.warning(invalid_toc_msg)
            log.debug(e)

    def visit_remap_dest(pdf_base, node, log):
        """
        Visitor function to fix ToC entries

        Test for the two types of references to pages that can occur in ToCs.
        Both types have the same final format (an indirect reference to the
        target page).
        """
        if '/Dest' in node:
            # /Dest reference to another page (old method)
            remap_dest(node['/Dest'])
        elif '/A' in node:
            # /A (action) command set to "GoTo" (newer method)
            action = node['/A']
            if '/S' in action and action['/S'] == '/GoTo':
                if '/D' in action:
                    remap_dest(action['/D'])
                else:
                    # GoTo action without a destination: malformed entry
                    log.warning(invalid_toc_msg)

    _traverse_toc(pdf_base, visit_remap_dest, log)
|
||||
|
||||
|
||||
def weave_layers(infiles, output_file, log, context):
|
||||
"""Apply text layer and/or image layer changes to baseline file
|
||||
|
||||
@@ -323,13 +231,13 @@ def weave_layers(infiles, output_file, log, context):
|
||||
pdf_base = pikepdf.open(path_base)
|
||||
font, font_key, procset = None, None, None
|
||||
pdfinfo = context.get_pdfinfo()
|
||||
pagerefs = {}
|
||||
|
||||
procset = pdf_base.make_indirect(
|
||||
pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
|
||||
)
|
||||
|
||||
replacements = 0
|
||||
emplacements = 1
|
||||
interim_count = 0
|
||||
|
||||
# Iterate rest
|
||||
for page_num, layers in groups:
|
||||
@@ -343,30 +251,25 @@ def weave_layers(infiles, output_file, log, context):
|
||||
if text and not font:
|
||||
font, font_key = _find_font(text, pdf_base)
|
||||
|
||||
replacing = False
|
||||
emplaced_page = False
|
||||
content_rotation = pdfinfo[page_num - 1].rotation
|
||||
|
||||
path_image = Path(image).resolve() if image else None
|
||||
if path_image is not None and path_image != path_base:
|
||||
# We are replacing the old page with a rasterized PDF of the new
|
||||
# page
|
||||
log.debug("Replace")
|
||||
old_objgen = pdf_base.pages[page_num - 1].objgen
|
||||
|
||||
# We are updating the old page with a rasterized PDF of the new
|
||||
# page (without changing objgen, to preserve references)
|
||||
log.debug("Emplacement update")
|
||||
with pikepdf.open(image) as pdf_image:
|
||||
replacements += 1
|
||||
image_page = pdf_image.pages[0]
|
||||
pdf_base.pages[page_num - 1] = image_page
|
||||
|
||||
# We're adding a new page, which will get a new objgen number pair,
|
||||
# so we need to update any references to it. qpdf did not like
|
||||
# my attempt to update the old object in place, but that is an
|
||||
# option to consider
|
||||
pagerefs[old_objgen] = pdf_base.pages[page_num - 1].objgen
|
||||
replacing = True
|
||||
emplacements += 1
|
||||
foreign_image_page = pdf_image.pages[0]
|
||||
pdf_base.pages.append(foreign_image_page)
|
||||
local_image_page = pdf_base.pages[-1]
|
||||
pdf_base.pages[page_num - 1].emplace(local_image_page)
|
||||
del pdf_base.pages[-1]
|
||||
emplaced_page = True
|
||||
|
||||
autorotate_correction = context.get_rotation(page_num - 1)
|
||||
if replacing:
|
||||
if emplaced_page:
|
||||
content_rotation = autorotate_correction
|
||||
text_rotation = autorotate_correction
|
||||
text_misaligned = (text_rotation - content_rotation) % 360
|
||||
@@ -395,7 +298,7 @@ def weave_layers(infiles, output_file, log, context):
|
||||
content_rotation - autorotate_correction
|
||||
) % 360
|
||||
|
||||
if replacements % MAX_REPLACE_PAGES == 0:
|
||||
if emplacements % MAX_REPLACE_PAGES == 0:
|
||||
# Periodically save and reload the Pdf object. This will keep a
|
||||
# lid on our memory usage for very large files. Attach the font to
|
||||
# page 1 even if page 1 doesn't use it, so we have a way to get it
|
||||
@@ -405,14 +308,25 @@ def weave_layers(infiles, output_file, log, context):
|
||||
_update_page_resources(
|
||||
page=page0, font=font, font_key=font_key, procset=procset
|
||||
)
|
||||
interim = output_file + f'_working{page_num}.pdf'
|
||||
pdf_base.save(interim)
|
||||
|
||||
# We cannot read and write the same file, that will corrupt it
|
||||
# but we don't want to keep more copies than we need to. Delete intermediates.
|
||||
# {interim_count} is the opened file we were updating
|
||||
# {interim_count - 1} can be deleted
|
||||
# {interim_count + 1} is the new file we will produce and open
|
||||
old_file = output_file + f'_working{interim_count - 1}.pdf'
|
||||
if not context.get_options().keep_temporary_files:
|
||||
with suppress(FileNotFoundError):
|
||||
os.unlink(old_file)
|
||||
|
||||
next_file = output_file + f'_working{interim_count + 1}.pdf'
|
||||
pdf_base.save(next_file)
|
||||
pdf_base.close()
|
||||
|
||||
pdf_base = pikepdf.open(interim)
|
||||
pdf_base = pikepdf.open(next_file)
|
||||
procset = pdf_base.pages[0].Resources.ProcSet
|
||||
font, font_key = None, None # Reacquire this information
|
||||
font, font_key = None, None # Ensure we reacquire this information
|
||||
interim_count += 1
|
||||
|
||||
_fix_toc(pdf_base, pagerefs, log)
|
||||
pdf_base.save(output_file)
|
||||
pdf_base.close()
|
||||
|
||||
@@ -129,8 +129,7 @@ def rasterize_pdf(
|
||||
:param filter_vector: if True, remove vector graphics objects
|
||||
:return:
|
||||
"""
|
||||
res = xres, yres
|
||||
int_res = round(xres), round(yres)
|
||||
res = round(xres, 6), round(yres, 6)
|
||||
if not page_dpi:
|
||||
page_dpi = res
|
||||
|
||||
@@ -145,7 +144,7 @@ def rasterize_pdf(
|
||||
f'-sDEVICE={raster_device}',
|
||||
f'-dFirstPage={pageno}',
|
||||
f'-dLastPage={pageno}',
|
||||
f'-r{str(int_res[0])}x{str(int_res[1])}',
|
||||
f'-r{res[0]:f}x{res[1]:f}',
|
||||
]
|
||||
+ (['-dFILTERVECTOR'] if filter_vector else [])
|
||||
+ [
|
||||
@@ -168,23 +167,8 @@ def rasterize_pdf(
|
||||
log.error('Ghostscript rasterizing failed')
|
||||
raise SubprocessOutputError()
|
||||
|
||||
# Ghostscript only accepts integers for output resolution
|
||||
# if the resolution happens to be fractional, then the discrepancy
|
||||
# would change the size of the output page, especially if the DPI
|
||||
# is quite low. Resize the image to the expected size
|
||||
|
||||
tmp.seek(0)
|
||||
with Image.open(tmp) as im:
|
||||
expected_size = (
|
||||
round(im.size[0] / int_res[0] * res[0]),
|
||||
round(im.size[1] / int_res[1] * res[1]),
|
||||
)
|
||||
if expected_size != im.size or page_dpi != (xres, yres):
|
||||
log.debug(
|
||||
f"Ghostscript: resize output image {im.size} -> {expected_size}"
|
||||
)
|
||||
im = im.resize(expected_size)
|
||||
|
||||
if rotation is not None:
|
||||
log.debug("Rotating output by %i", rotation)
|
||||
# rotation is a clockwise angle and Image.ROTATE_* is
|
||||
@@ -269,7 +253,6 @@ def generate_pdfa(
|
||||
"-dBATCH",
|
||||
"-dNOPAUSE",
|
||||
"-dCompatibilityLevel=" + str(pdf_version),
|
||||
"-dNumRenderingThreads=" + str(threads),
|
||||
"-sDEVICE=pdfwrite",
|
||||
"-dAutoRotatePages=/None",
|
||||
"-sColorConversionStrategy=" + strategy,
|
||||
|
||||
BIN
tests/resources/link.pdf
Normal file
BIN
tests/resources/link.pdf
Normal file
Binary file not shown.
81
tests/test_ghostscript.py
Normal file
81
tests/test_ghostscript.py
Normal file
@@ -0,0 +1,81 @@
|
||||
# © 2019 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import logging
|
||||
from decimal import Decimal
|
||||
|
||||
import pikepdf
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from ocrmypdf.exec.ghostscript import rasterize_pdf
|
||||
|
||||
|
||||
@pytest.fixture
def linn(resources):
    """Yield ``(path, pdf)`` for the ``linn.pdf`` test resource.

    The Pdf is opened with a context manager so that it is closed when the
    test using the fixture finishes, instead of being left open.
    """
    path = resources / 'linn.pdf'
    with pikepdf.open(path) as pdf:
        yield path, pdf
|
||||
|
||||
|
||||
def test_rasterize_size(linn, outdir, caplog):
    """Rasterizing at a fractional resolution yields the exact pixel size.

    The resolution passed to rasterize_pdf is derived from the page size so
    that the output should be 200x150 pixels, and the DPI recorded in the
    image should be the separately supplied page_dpi.
    """
    path, pdf = linn
    first_page = pdf.pages[0]
    width_pts, height_pts = first_page.MediaBox[2], first_page.MediaBox[3]
    # The arithmetic below assumes the MediaBox origin is (0, 0)
    assert first_page.MediaBox[0] == first_page.MediaBox[1] == 0

    points_per_inch = Decimal(72)
    width_in = width_pts / points_per_inch
    height_in = height_pts / points_per_inch
    want_pixels = Decimal('200.0'), Decimal('150.0')
    want_dpi = 42.0, 4242.0

    out_png = outdir / 'out.png'
    rasterize_pdf(
        path,
        out_png,
        want_pixels[0] / width_in,
        want_pixels[1] / height_in,
        raster_device='pngmono',
        log=logging.getLogger(),
        page_dpi=want_dpi,
    )

    with Image.open(out_png) as im:
        assert im.size == want_pixels
        assert im.info['dpi'] == want_dpi
|
||||
|
||||
|
||||
def test_rasterize_rotated(linn, outdir, caplog):
    """Rasterizing with rotation=90 swaps the output's size and DPI axes.

    Same setup as the unrotated size test, but the resulting image is
    expected to have its width/height and x/y DPI exchanged.
    """
    path, pdf = linn
    first_page = pdf.pages[0]
    width_pts, height_pts = first_page.MediaBox[2], first_page.MediaBox[3]
    # The arithmetic below assumes the MediaBox origin is (0, 0)
    assert first_page.MediaBox[0] == first_page.MediaBox[1] == 0

    points_per_inch = Decimal(72)
    width_in = width_pts / points_per_inch
    height_in = height_pts / points_per_inch
    want_pixels = Decimal('200.0'), Decimal('150.0')
    want_dpi = 42.0, 4242.0

    root_logger = logging.getLogger()
    caplog.set_level(logging.DEBUG)
    out_png = outdir / 'out.png'
    rasterize_pdf(
        path,
        out_png,
        want_pixels[0] / width_in,
        want_pixels[1] / height_in,
        raster_device='pngmono',
        log=root_logger,
        page_dpi=want_dpi,
        rotation=90,
    )

    with Image.open(out_png) as im:
        assert im.size == (want_pixels[1], want_pixels[0])
        assert im.info['dpi'] == (want_dpi[1], want_dpi[0])
|
||||
@@ -285,7 +285,7 @@ def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):
|
||||
assert isinstance(p.root.Outlines.First, pikepdf.Dictionary)
|
||||
|
||||
|
||||
def test_metadata_fixup_warning(resources, outdir):
|
||||
def test_metadata_fixup_warning(resources, outdir, caplog):
|
||||
from ocrmypdf._pipeline import metadata_fixup
|
||||
|
||||
input_files = [
|
||||
@@ -296,7 +296,7 @@ def test_metadata_fixup_warning(resources, outdir):
|
||||
for f in input_files:
|
||||
copyfile(resources / 'graph.pdf', f)
|
||||
|
||||
log = MagicMock()
|
||||
log = logging.getLogger()
|
||||
context = MagicMock()
|
||||
metadata_fixup(
|
||||
input_files_groups=input_files,
|
||||
@@ -304,7 +304,8 @@ def test_metadata_fixup_warning(resources, outdir):
|
||||
log=log,
|
||||
context=context,
|
||||
)
|
||||
log.warning.assert_not_called()
|
||||
for record in caplog.records:
|
||||
assert record.levelname != 'WARNING'
|
||||
|
||||
# Now add some metadata that will not be copyable
|
||||
graph = pikepdf.open(outdir / 'graph.repaired.pdf')
|
||||
@@ -312,7 +313,7 @@ def test_metadata_fixup_warning(resources, outdir):
|
||||
meta['prism2:publicationName'] = 'OCRmyPDF Test'
|
||||
graph.save(outdir / 'graph.repaired.pdf')
|
||||
|
||||
log = MagicMock()
|
||||
log = logging.getLogger()
|
||||
context = MagicMock()
|
||||
metadata_fixup(
|
||||
input_files_groups=input_files,
|
||||
@@ -320,7 +321,7 @@ def test_metadata_fixup_warning(resources, outdir):
|
||||
log=log,
|
||||
context=context,
|
||||
)
|
||||
log.warning.assert_called_once()
|
||||
assert any(record.levelname == 'WARNING' for record in caplog.records)
|
||||
|
||||
|
||||
def test_prevent_gs_invalid_xml(resources, outdir):
|
||||
|
||||
@@ -16,9 +16,10 @@
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from os import fspath
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
@@ -57,7 +58,7 @@ def test_no_unpaper(resources, no_outpdf):
|
||||
with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
|
||||
mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
|
||||
with pytest.raises(SystemExit):
|
||||
main.check_options(options, log=MagicMock())
|
||||
main.check_options(options, log=logging.getLogger())
|
||||
|
||||
|
||||
def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):
|
||||
|
||||
@@ -15,35 +15,15 @@
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
import logging
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
import pikepdf
|
||||
from ocrmypdf._weave import _fix_toc, _update_page_resources
|
||||
|
||||
check_ocrmypdf = pytest.helpers.check_ocrmypdf
|
||||
|
||||
|
||||
def test_invalid_toc(resources, outdir, caplog):
    """A corrupt outline destination produces a warning, not an exception."""
    damaged_path = outdir / 'test.pdf'

    # Corrupt a TOC entry and write the damaged file out
    source_pdf = pikepdf.open(resources / 'toc.pdf')
    source_pdf.Root.Outlines.Last.Dest = pikepdf.Array([None, 0.0, 0.1, 0.2])
    source_pdf.save(damaged_path)

    pdf = pikepdf.open(damaged_path)
    first_page_objgen = pdf.pages[0].objgen
    # Dummy remap: a non-empty mapping is needed so _fix_toc does any work
    remap = {first_page_objgen: pdf.pages[0].objgen}

    # Confirm we complain about the TOC and don't throw an exception
    _fix_toc(pdf, remap, logging.getLogger())
    assert 'invalid table of contents entries' in caplog.text
|
||||
|
||||
|
||||
def test_no_glyphless_weave(resources, outdir):
|
||||
pdf = pikepdf.open(resources / 'francais.pdf')
|
||||
pdf_aspect = pikepdf.open(resources / 'aspect.pdf')
|
||||
@@ -62,3 +42,21 @@ def test_no_glyphless_weave(resources, outdir):
|
||||
'0',
|
||||
env=env,
|
||||
)
|
||||
|
||||
|
||||
@pytest.helpers.needs_pdfminer
def test_links(resources, outpdf):
    """Link annotations between two pages still point at each other after OCR.

    After processing link.pdf, the first annotation on each page must have a
    destination whose objgen matches the other page.
    """
    cli_args = ['--redo-ocr', '--oversample', '200', '--output-type', 'pdf']
    check_ocrmypdf(resources / 'link.pdf', outpdf, *cli_args)

    pdf = pikepdf.open(outpdf)
    first, second = pdf.pages[0], pdf.pages[1]
    assert first.Annots[0].A.D[0].objgen == second.objgen
    assert second.Annots[0].A.D[0].objgen == first.objgen
|
||||
|
||||
Reference in New Issue
Block a user