mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-06-11 15:36:11 -04:00
@@ -3,4 +3,4 @@ repos:
|
|||||||
rev: stable
|
rev: stable
|
||||||
hooks:
|
hooks:
|
||||||
- id: black
|
- id: black
|
||||||
language_version: python3.6
|
language_version: python3.7
|
||||||
|
|||||||
@@ -490,3 +490,17 @@ To install all of the development and test requirements:
|
|||||||
pip install -r requirements/dev.txt -r requirements/test.txt
|
pip install -r requirements/dev.txt -r requirements/test.txt
|
||||||
|
|
||||||
To add JBIG2 encoding, see :ref:`jbig2`.
|
To add JBIG2 encoding, see :ref:`jbig2`.
|
||||||
|
|
||||||
|
Shell completions
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
Completions for ``bash`` and ``fish`` are available in the project's
|
||||||
|
``misc/completion`` folder. The ``bash`` completions are likely ``zsh``
|
||||||
|
compatible but this has not been confirmed. Package maintainers, please install
|
||||||
|
these at the appropriate locations for your system.
|
||||||
|
|
||||||
|
To manually install the ``bash`` completion, copy ``misc/completion/ocrmypdf.bash`` to
|
||||||
|
``/etc/bash_completion.d/ocrmypdf`` (rename the file).
|
||||||
|
|
||||||
|
To manually install the ``fish`` completion, copy ``misc/completion/ocrmypdf.fish`` to
|
||||||
|
``~/.config/fish/completions/ocrmypdf.fish``.
|
||||||
|
|||||||
@@ -13,6 +13,21 @@ Note that it is licensed under GPLv3, so scripts that ``import ocrmypdf`` and ar
|
|||||||
find: [^`]\#([0-9]{1,3})[^0-9]
|
find: [^`]\#([0-9]{1,3})[^0-9]
|
||||||
replace: `#$1 <https://github.com/jbarlow83/OCRmyPDF/issues/$1>`_
|
replace: `#$1 <https://github.com/jbarlow83/OCRmyPDF/issues/$1>`_
|
||||||
|
|
||||||
|
v8.3.0
|
||||||
|
------
|
||||||
|
|
||||||
|
- Improved the strategy for updating pages when a new image of the page was produced. We now attempt to preserve more content from the original file, for annotations in particular.
|
||||||
|
|
||||||
|
- Fixed an issue where, for PDFs with more than 100 pages and a sequence where one PDF page was replaced and one or more subsequent ones were skipped, an intermediate file would be corrupted while grafting OCR text, causing processing to fail.
|
||||||
|
|
||||||
|
- Previously, we resized the images produced by Ghostscript by a small number of pixels to ensure the output image size was exactly what we wanted. Having discovered a way to get Ghostscript to produce the exact image sizes we require, we eliminated the resizing step.
|
||||||
|
|
||||||
|
- Command line completions for ``bash`` are now available, in addition to ``fish``, both in ``misc/completion``. Package maintainers, please install these so users can take advantage of them.
|
||||||
|
|
||||||
|
- Updated requirements.
|
||||||
|
|
||||||
|
- pikepdf 1.3.0 is now required.
|
||||||
|
|
||||||
v8.2.4
|
v8.2.4
|
||||||
------
|
------
|
||||||
|
|
||||||
@@ -22,7 +37,7 @@ v8.2.4
|
|||||||
|
|
||||||
- Minor optimization: we no longer traverse the table of contents to ensure all references in it are resolved, as changes to libqpdf have made this unnecessary.
|
- Minor optimization: we no longer traverse the table of contents to ensure all references in it are resolved, as changes to libqpdf have made this unnecessary.
|
||||||
|
|
||||||
- pikepdf 1.2.0 is now required
|
- pikepdf 1.2.0 is now required.
|
||||||
|
|
||||||
v8.2.3
|
v8.2.3
|
||||||
------
|
------
|
||||||
|
|||||||
87
misc/completion/ocrmypdf.bash
Normal file
87
misc/completion/ocrmypdf.bash
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
# ocrmypdf completion -*- shell-script -*-
|
||||||
|
|
||||||
|
_ocrmypdf()
|
||||||
|
{
|
||||||
|
local cur prev cword words split
|
||||||
|
_init_completion -s || return
|
||||||
|
|
||||||
|
case $prev in
|
||||||
|
--version|-h|--help)
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
--user-words|--user-patterns|--tesseract-config)
|
||||||
|
_filedir
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
--output-type)
|
||||||
|
COMPREPLY=( $( compgen -W 'pdfa pdf pdfa-1 pdfa-2 pdfa-3' -- \
|
||||||
|
"$cur" ) )
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
--pdf-renderer)
|
||||||
|
COMPREPLY=( $( compgen -W 'auto hocr sandwich' -- "$cur" ) )
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
--pdfa-image-compression)
|
||||||
|
COMPREPLY=( $( compgen -W 'auto jpeg lossless' -- "$cur" ) )
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
-O|--optimize|--tesseract-oem)
|
||||||
|
COMPREPLY=( $( compgen -W '{0..3}' -- "$cur" ) )
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
--jpeg-quality|--png-quality)
|
||||||
|
COMPREPLY=( $( compgen -W '{0..100}' -- "$cur" ) )
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
-l|--language)
|
||||||
|
COMPREPLY=$( command tesseract --list-langs 2>/dev/null )
|
||||||
|
COMPREPLY=( $( compgen -W '${COMPREPLY[@]##*:}' -- "$cur" ) )
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
--image-dpi|--oversample|--skip-big|--max-image-mpixels|\
|
||||||
|
--tesseract-timeout|--rotate-pages-threshold)
|
||||||
|
COMPREPLY=( $( compgen -P "$cur" -W '{0..9}' ) )
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
-j|--jobs)
|
||||||
|
COMPREPLY=( $( compgen -W '{1..'$( _ncpus )'}' -- "$cur" ) )
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
-v|--verbose)
|
||||||
|
COMPREPLY=( $( compgen -W '{1..9}' -- "$cur" ) ) # max level ?
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
--tesseract-pagesegmode)
|
||||||
|
COMPREPLY=( $( compgen -W '{1..13}' -- "$cur" ) )
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
--sidecar|--title|--author|--subject|--keywords|--unpaper-args)
|
||||||
|
# argument required but no completions available
|
||||||
|
return
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
$split && return
|
||||||
|
|
||||||
|
if [[ $cur == -* ]]; then
|
||||||
|
COMPREPLY=( $( compgen -W '--language --image-dpi --output-type
|
||||||
|
--sidecar --version --jobs --quiet --verbose --title --author
|
||||||
|
--subject --keywords --rotate-pages --remove-background --deskew
|
||||||
|
--clean --clean-final --unpaper-args --oversample --remove-vectors
|
||||||
|
--mask-barcodes --threshold --force-ocr --skip-text --redo-ocr
|
||||||
|
--skip-big --jpeg-quality --png-quality --jbig2-lossy
|
||||||
|
--max-image-mpixels --tesseract-config --tesseract-pagesegmode
|
||||||
|
--help --tesseract-oem --pdf-renderer --tesseract-timeout
|
||||||
|
--rotate-pages-threshold --pdfa-image-compression --user-words
|
||||||
|
--user-patterns --keep-temporary-files --flowchart --output-type' \
|
||||||
|
-- "$cur" ) )
|
||||||
|
return
|
||||||
|
else
|
||||||
|
_filedir
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
} &&
|
||||||
|
complete -F _ocrmypdf ocrmypdf
|
||||||
|
|
||||||
|
# ex: filetype=sh
|
||||||
@@ -5,7 +5,7 @@ chardet == 3.0.4
|
|||||||
cffi == 1.12.2
|
cffi == 1.12.2
|
||||||
img2pdf == 0.3.3
|
img2pdf == 0.3.3
|
||||||
pdfminer.six == 20181108
|
pdfminer.six == 20181108
|
||||||
pikepdf == 1.2.0
|
pikepdf == 1.3.0
|
||||||
Pillow >= 5.0.0, != 5.1.0 ; sys_platform == "darwin"
|
Pillow >= 5.0.0, != 5.1.0 ; sys_platform == "darwin"
|
||||||
pycparser == 2.19
|
pycparser == 2.19
|
||||||
python-xmp-toolkit == 2.0.1
|
python-xmp-toolkit == 2.0.1
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -99,7 +99,7 @@ setup(
|
|||||||
'cffi >= 1.9.1', # must be a setup and install requirement
|
'cffi >= 1.9.1', # must be a setup and install requirement
|
||||||
'img2pdf >= 0.3.0, < 0.4', # pure Python, so track HEAD closely
|
'img2pdf >= 0.3.0, < 0.4', # pure Python, so track HEAD closely
|
||||||
'pdfminer.six == 20181108 ; sys_platform != "darwin"',
|
'pdfminer.six == 20181108 ; sys_platform != "darwin"',
|
||||||
'pikepdf >= 1.2.0, < 2',
|
'pikepdf >= 1.3.0, < 2',
|
||||||
'Pillow >= 4.0.0, != 5.1.0 ; sys_platform == "darwin"',
|
'Pillow >= 4.0.0, != 5.1.0 ; sys_platform == "darwin"',
|
||||||
# Pillow < 4 has BytesIO/TIFF bug w/img2pdf 0.2.3
|
# Pillow < 4 has BytesIO/TIFF bug w/img2pdf 0.2.3
|
||||||
# block 5.1.0, broken wheels
|
# block 5.1.0, broken wheels
|
||||||
|
|||||||
@@ -15,6 +15,7 @@
|
|||||||
# You should have received a copy of the GNU General Public License
|
# You should have received a copy of the GNU General Public License
|
||||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
from contextlib import suppress
|
||||||
from itertools import groupby
|
from itertools import groupby
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import os
|
import os
|
||||||
@@ -188,99 +189,6 @@ def _find_font(text, pdf_base):
|
|||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
def _traverse_toc(pdf_base, visitor_fn, log):
|
|
||||||
"""
|
|
||||||
Walk the table of contents, calling visitor_fn() at each node
|
|
||||||
|
|
||||||
The /Outlines data structure is a messy data structure, but rather than
|
|
||||||
navigating hierarchically we just track unique nodes. Enqueue nodes when
|
|
||||||
we find them, and never visit them again. set() is awesome. We look for
|
|
||||||
the two types of object in the table of contents that can be page bookmarks
|
|
||||||
and update the page entry.
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
visited = set()
|
|
||||||
queue = set()
|
|
||||||
link_keys = ('/Parent', '/First', '/Last', '/Prev', '/Next')
|
|
||||||
|
|
||||||
if not '/Outlines' in pdf_base.root:
|
|
||||||
return
|
|
||||||
|
|
||||||
queue.add(pdf_base.root.Outlines.objgen)
|
|
||||||
while queue:
|
|
||||||
objgen = queue.pop()
|
|
||||||
visited.add(objgen)
|
|
||||||
node = pdf_base.get_object(objgen)
|
|
||||||
log.debug('fix toc: exploring outline entries at %r', objgen)
|
|
||||||
|
|
||||||
# Enumerate other nodes we could visit from here
|
|
||||||
for key in link_keys:
|
|
||||||
if key not in node:
|
|
||||||
continue
|
|
||||||
item = node[key]
|
|
||||||
if not item.is_indirect:
|
|
||||||
# Direct references are not allowed here, but it's not clear
|
|
||||||
# what we should do if we find any. Removing them is an option:
|
|
||||||
# node[key] = pdf_base.make_indirect(None)
|
|
||||||
continue
|
|
||||||
objgen = item.objgen
|
|
||||||
if objgen not in visited:
|
|
||||||
queue.add(objgen)
|
|
||||||
|
|
||||||
if visitor_fn:
|
|
||||||
visitor_fn(pdf_base, node, log)
|
|
||||||
|
|
||||||
|
|
||||||
def _fix_toc(pdf_base, pageref_remap, log):
|
|
||||||
"""Repair the table of contents
|
|
||||||
|
|
||||||
Whenever we replace a page wholesale, it gets assigned a new objgen number
|
|
||||||
and other references to it within the PDF become invalid, most notably in
|
|
||||||
the table of contents (/Outlines in PDF-speak). In weave_layers we collect
|
|
||||||
pageref_remap, a mapping that describes the new objgen number given an old
|
|
||||||
one. (objgen is a tuple, and the gen is almost always zero.)
|
|
||||||
|
|
||||||
It may ultimately be better to find a way to rebuild a page in place.
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
if not pageref_remap:
|
|
||||||
return
|
|
||||||
|
|
||||||
def remap_dest(dest_node):
|
|
||||||
"""
|
|
||||||
Inner helper function: change the objgen for any page from the old we
|
|
||||||
invalidated to its new one.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
pageref = dest_node[0]
|
|
||||||
if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
|
|
||||||
new_objgen = pageref_remap[pageref.objgen]
|
|
||||||
dest_node[0] = pdf_base.get_object(new_objgen)
|
|
||||||
except (IndexError, TypeError) as e:
|
|
||||||
log.warning("This file may contain invalid table of contents entries")
|
|
||||||
log.debug(e)
|
|
||||||
|
|
||||||
def visit_remap_dest(pdf_base, node, log):
|
|
||||||
"""
|
|
||||||
Visitor function to fix ToC entries
|
|
||||||
|
|
||||||
Test for the two types of references to pages that can occur in ToCs.
|
|
||||||
Both types have the same final format (an indirect reference to the
|
|
||||||
target page).
|
|
||||||
"""
|
|
||||||
if '/Dest' in node:
|
|
||||||
# /Dest reference to another page (old method)
|
|
||||||
remap_dest(node['/Dest'])
|
|
||||||
elif '/A' in node:
|
|
||||||
# /A (action) command set to "GoTo" (newer method)
|
|
||||||
if '/S' in node['/A'] and node['/A']['/S'] == '/GoTo':
|
|
||||||
remap_dest(node['/A']['/D'])
|
|
||||||
|
|
||||||
_traverse_toc(pdf_base, visit_remap_dest, log)
|
|
||||||
|
|
||||||
|
|
||||||
def weave_layers(infiles, output_file, log, context):
|
def weave_layers(infiles, output_file, log, context):
|
||||||
"""Apply text layer and/or image layer changes to baseline file
|
"""Apply text layer and/or image layer changes to baseline file
|
||||||
|
|
||||||
@@ -323,13 +231,13 @@ def weave_layers(infiles, output_file, log, context):
|
|||||||
pdf_base = pikepdf.open(path_base)
|
pdf_base = pikepdf.open(path_base)
|
||||||
font, font_key, procset = None, None, None
|
font, font_key, procset = None, None, None
|
||||||
pdfinfo = context.get_pdfinfo()
|
pdfinfo = context.get_pdfinfo()
|
||||||
pagerefs = {}
|
|
||||||
|
|
||||||
procset = pdf_base.make_indirect(
|
procset = pdf_base.make_indirect(
|
||||||
pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
|
pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
|
||||||
)
|
)
|
||||||
|
|
||||||
replacements = 0
|
emplacements = 1
|
||||||
|
interim_count = 0
|
||||||
|
|
||||||
# Iterate rest
|
# Iterate rest
|
||||||
for page_num, layers in groups:
|
for page_num, layers in groups:
|
||||||
@@ -343,30 +251,25 @@ def weave_layers(infiles, output_file, log, context):
|
|||||||
if text and not font:
|
if text and not font:
|
||||||
font, font_key = _find_font(text, pdf_base)
|
font, font_key = _find_font(text, pdf_base)
|
||||||
|
|
||||||
replacing = False
|
emplaced_page = False
|
||||||
content_rotation = pdfinfo[page_num - 1].rotation
|
content_rotation = pdfinfo[page_num - 1].rotation
|
||||||
|
|
||||||
path_image = Path(image).resolve() if image else None
|
path_image = Path(image).resolve() if image else None
|
||||||
if path_image is not None and path_image != path_base:
|
if path_image is not None and path_image != path_base:
|
||||||
# We are replacing the old page with a rasterized PDF of the new
|
# We are updating the old page with a rasterized PDF of the new
|
||||||
# page
|
# page (without changing objgen, to preserve references)
|
||||||
log.debug("Replace")
|
log.debug("Emplacement update")
|
||||||
old_objgen = pdf_base.pages[page_num - 1].objgen
|
|
||||||
|
|
||||||
with pikepdf.open(image) as pdf_image:
|
with pikepdf.open(image) as pdf_image:
|
||||||
replacements += 1
|
emplacements += 1
|
||||||
image_page = pdf_image.pages[0]
|
foreign_image_page = pdf_image.pages[0]
|
||||||
pdf_base.pages[page_num - 1] = image_page
|
pdf_base.pages.append(foreign_image_page)
|
||||||
|
local_image_page = pdf_base.pages[-1]
|
||||||
# We're adding a new page, which will get a new objgen number pair,
|
pdf_base.pages[page_num - 1].emplace(local_image_page)
|
||||||
# so we need to update any references to it. qpdf did not like
|
del pdf_base.pages[-1]
|
||||||
# my attempt to update the old object in place, but that is an
|
emplaced_page = True
|
||||||
# option to consider
|
|
||||||
pagerefs[old_objgen] = pdf_base.pages[page_num - 1].objgen
|
|
||||||
replacing = True
|
|
||||||
|
|
||||||
autorotate_correction = context.get_rotation(page_num - 1)
|
autorotate_correction = context.get_rotation(page_num - 1)
|
||||||
if replacing:
|
if emplaced_page:
|
||||||
content_rotation = autorotate_correction
|
content_rotation = autorotate_correction
|
||||||
text_rotation = autorotate_correction
|
text_rotation = autorotate_correction
|
||||||
text_misaligned = (text_rotation - content_rotation) % 360
|
text_misaligned = (text_rotation - content_rotation) % 360
|
||||||
@@ -395,7 +298,7 @@ def weave_layers(infiles, output_file, log, context):
|
|||||||
content_rotation - autorotate_correction
|
content_rotation - autorotate_correction
|
||||||
) % 360
|
) % 360
|
||||||
|
|
||||||
if replacements % MAX_REPLACE_PAGES == 0:
|
if emplacements % MAX_REPLACE_PAGES == 0:
|
||||||
# Periodically save and reload the Pdf object. This will keep a
|
# Periodically save and reload the Pdf object. This will keep a
|
||||||
# lid on our memory usage for very large files. Attach the font to
|
# lid on our memory usage for very large files. Attach the font to
|
||||||
# page 1 even if page 1 doesn't use it, so we have a way to get it
|
# page 1 even if page 1 doesn't use it, so we have a way to get it
|
||||||
@@ -405,14 +308,25 @@ def weave_layers(infiles, output_file, log, context):
|
|||||||
_update_page_resources(
|
_update_page_resources(
|
||||||
page=page0, font=font, font_key=font_key, procset=procset
|
page=page0, font=font, font_key=font_key, procset=procset
|
||||||
)
|
)
|
||||||
interim = output_file + f'_working{page_num}.pdf'
|
|
||||||
pdf_base.save(interim)
|
# We cannot read and write the same file, that will corrupt it
|
||||||
|
# but we don't want to keep more copies than we need to. Delete intermediates.
|
||||||
|
# {interim_count} is the opened file we were updating
|
||||||
|
# {interim_count - 1} can be deleted
|
||||||
|
# {interim_count + 1} is the new file we will produce and open
|
||||||
|
old_file = output_file + f'_working{interim_count - 1}.pdf'
|
||||||
|
if not context.get_options().keep_temporary_files:
|
||||||
|
with suppress(FileNotFoundError):
|
||||||
|
os.unlink(old_file)
|
||||||
|
|
||||||
|
next_file = output_file + f'_working{interim_count + 1}.pdf'
|
||||||
|
pdf_base.save(next_file)
|
||||||
pdf_base.close()
|
pdf_base.close()
|
||||||
|
|
||||||
pdf_base = pikepdf.open(interim)
|
pdf_base = pikepdf.open(next_file)
|
||||||
procset = pdf_base.pages[0].Resources.ProcSet
|
procset = pdf_base.pages[0].Resources.ProcSet
|
||||||
font, font_key = None, None # Reacquire this information
|
font, font_key = None, None # Ensure we reacquire this information
|
||||||
|
interim_count += 1
|
||||||
|
|
||||||
_fix_toc(pdf_base, pagerefs, log)
|
|
||||||
pdf_base.save(output_file)
|
pdf_base.save(output_file)
|
||||||
pdf_base.close()
|
pdf_base.close()
|
||||||
|
|||||||
@@ -129,8 +129,7 @@ def rasterize_pdf(
|
|||||||
:param filter_vector: if True, remove vector graphics objects
|
:param filter_vector: if True, remove vector graphics objects
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
res = xres, yres
|
res = round(xres, 6), round(yres, 6)
|
||||||
int_res = round(xres), round(yres)
|
|
||||||
if not page_dpi:
|
if not page_dpi:
|
||||||
page_dpi = res
|
page_dpi = res
|
||||||
|
|
||||||
@@ -145,7 +144,7 @@ def rasterize_pdf(
|
|||||||
f'-sDEVICE={raster_device}',
|
f'-sDEVICE={raster_device}',
|
||||||
f'-dFirstPage={pageno}',
|
f'-dFirstPage={pageno}',
|
||||||
f'-dLastPage={pageno}',
|
f'-dLastPage={pageno}',
|
||||||
f'-r{str(int_res[0])}x{str(int_res[1])}',
|
f'-r{res[0]:f}x{res[1]:f}',
|
||||||
]
|
]
|
||||||
+ (['-dFILTERVECTOR'] if filter_vector else [])
|
+ (['-dFILTERVECTOR'] if filter_vector else [])
|
||||||
+ [
|
+ [
|
||||||
@@ -168,23 +167,8 @@ def rasterize_pdf(
|
|||||||
log.error('Ghostscript rasterizing failed')
|
log.error('Ghostscript rasterizing failed')
|
||||||
raise SubprocessOutputError()
|
raise SubprocessOutputError()
|
||||||
|
|
||||||
# Ghostscript only accepts integers for output resolution
|
|
||||||
# if the resolution happens to be fractional, then the discrepancy
|
|
||||||
# would change the size of the output page, especially if the DPI
|
|
||||||
# is quite low. Resize the image to the expected size
|
|
||||||
|
|
||||||
tmp.seek(0)
|
tmp.seek(0)
|
||||||
with Image.open(tmp) as im:
|
with Image.open(tmp) as im:
|
||||||
expected_size = (
|
|
||||||
round(im.size[0] / int_res[0] * res[0]),
|
|
||||||
round(im.size[1] / int_res[1] * res[1]),
|
|
||||||
)
|
|
||||||
if expected_size != im.size or page_dpi != (xres, yres):
|
|
||||||
log.debug(
|
|
||||||
f"Ghostscript: resize output image {im.size} -> {expected_size}"
|
|
||||||
)
|
|
||||||
im = im.resize(expected_size)
|
|
||||||
|
|
||||||
if rotation is not None:
|
if rotation is not None:
|
||||||
log.debug("Rotating output by %i", rotation)
|
log.debug("Rotating output by %i", rotation)
|
||||||
# rotation is a clockwise angle and Image.ROTATE_* is
|
# rotation is a clockwise angle and Image.ROTATE_* is
|
||||||
@@ -269,7 +253,6 @@ def generate_pdfa(
|
|||||||
"-dBATCH",
|
"-dBATCH",
|
||||||
"-dNOPAUSE",
|
"-dNOPAUSE",
|
||||||
"-dCompatibilityLevel=" + str(pdf_version),
|
"-dCompatibilityLevel=" + str(pdf_version),
|
||||||
"-dNumRenderingThreads=" + str(threads),
|
|
||||||
"-sDEVICE=pdfwrite",
|
"-sDEVICE=pdfwrite",
|
||||||
"-dAutoRotatePages=/None",
|
"-dAutoRotatePages=/None",
|
||||||
"-sColorConversionStrategy=" + strategy,
|
"-sColorConversionStrategy=" + strategy,
|
||||||
|
|||||||
BIN
tests/resources/link.pdf
Normal file
BIN
tests/resources/link.pdf
Normal file
Binary file not shown.
81
tests/test_ghostscript.py
Normal file
81
tests/test_ghostscript.py
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
# © 2019 James R. Barlow: github.com/jbarlow83
|
||||||
|
#
|
||||||
|
# This file is part of OCRmyPDF.
|
||||||
|
#
|
||||||
|
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from decimal import Decimal
|
||||||
|
|
||||||
|
import pikepdf
|
||||||
|
import pytest
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from ocrmypdf.exec.ghostscript import rasterize_pdf
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def linn(resources):
|
||||||
|
path = resources / 'linn.pdf'
|
||||||
|
return path, pikepdf.open(path)
|
||||||
|
|
||||||
|
|
||||||
|
def test_rasterize_size(linn, outdir, caplog):
|
||||||
|
path, pdf = linn
|
||||||
|
page_size_pts = (pdf.pages[0].MediaBox[2], pdf.pages[0].MediaBox[3])
|
||||||
|
assert pdf.pages[0].MediaBox[0] == pdf.pages[0].MediaBox[1] == 0
|
||||||
|
page_size = (page_size_pts[0] / Decimal(72), page_size_pts[1] / Decimal(72))
|
||||||
|
target_size = Decimal('200.0'), Decimal('150.0')
|
||||||
|
target_dpi = 42.0, 4242.0
|
||||||
|
|
||||||
|
log = logging.getLogger()
|
||||||
|
rasterize_pdf(
|
||||||
|
path,
|
||||||
|
outdir / 'out.png',
|
||||||
|
target_size[0] / page_size[0],
|
||||||
|
target_size[1] / page_size[1],
|
||||||
|
raster_device='pngmono',
|
||||||
|
log=log,
|
||||||
|
page_dpi=target_dpi,
|
||||||
|
)
|
||||||
|
|
||||||
|
with Image.open(outdir / 'out.png') as im:
|
||||||
|
assert im.size == target_size
|
||||||
|
assert im.info['dpi'] == target_dpi
|
||||||
|
|
||||||
|
|
||||||
|
def test_rasterize_rotated(linn, outdir, caplog):
|
||||||
|
path, pdf = linn
|
||||||
|
page_size_pts = (pdf.pages[0].MediaBox[2], pdf.pages[0].MediaBox[3])
|
||||||
|
assert pdf.pages[0].MediaBox[0] == pdf.pages[0].MediaBox[1] == 0
|
||||||
|
page_size = (page_size_pts[0] / Decimal(72), page_size_pts[1] / Decimal(72))
|
||||||
|
target_size = Decimal('200.0'), Decimal('150.0')
|
||||||
|
target_dpi = 42.0, 4242.0
|
||||||
|
|
||||||
|
log = logging.getLogger()
|
||||||
|
caplog.set_level(logging.DEBUG)
|
||||||
|
rasterize_pdf(
|
||||||
|
path,
|
||||||
|
outdir / 'out.png',
|
||||||
|
target_size[0] / page_size[0],
|
||||||
|
target_size[1] / page_size[1],
|
||||||
|
raster_device='pngmono',
|
||||||
|
log=log,
|
||||||
|
page_dpi=target_dpi,
|
||||||
|
rotation=90,
|
||||||
|
)
|
||||||
|
|
||||||
|
with Image.open(outdir / 'out.png') as im:
|
||||||
|
assert im.size == (target_size[1], target_size[0])
|
||||||
|
assert im.info['dpi'] == (target_dpi[1], target_dpi[0])
|
||||||
@@ -285,7 +285,7 @@ def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):
|
|||||||
assert isinstance(p.root.Outlines.First, pikepdf.Dictionary)
|
assert isinstance(p.root.Outlines.First, pikepdf.Dictionary)
|
||||||
|
|
||||||
|
|
||||||
def test_metadata_fixup_warning(resources, outdir):
|
def test_metadata_fixup_warning(resources, outdir, caplog):
|
||||||
from ocrmypdf._pipeline import metadata_fixup
|
from ocrmypdf._pipeline import metadata_fixup
|
||||||
|
|
||||||
input_files = [
|
input_files = [
|
||||||
@@ -296,7 +296,7 @@ def test_metadata_fixup_warning(resources, outdir):
|
|||||||
for f in input_files:
|
for f in input_files:
|
||||||
copyfile(resources / 'graph.pdf', f)
|
copyfile(resources / 'graph.pdf', f)
|
||||||
|
|
||||||
log = MagicMock()
|
log = logging.getLogger()
|
||||||
context = MagicMock()
|
context = MagicMock()
|
||||||
metadata_fixup(
|
metadata_fixup(
|
||||||
input_files_groups=input_files,
|
input_files_groups=input_files,
|
||||||
@@ -304,7 +304,8 @@ def test_metadata_fixup_warning(resources, outdir):
|
|||||||
log=log,
|
log=log,
|
||||||
context=context,
|
context=context,
|
||||||
)
|
)
|
||||||
log.warning.assert_not_called()
|
for record in caplog.records:
|
||||||
|
assert record.levelname != 'WARNING'
|
||||||
|
|
||||||
# Now add some metadata that will not be copyable
|
# Now add some metadata that will not be copyable
|
||||||
graph = pikepdf.open(outdir / 'graph.repaired.pdf')
|
graph = pikepdf.open(outdir / 'graph.repaired.pdf')
|
||||||
@@ -312,7 +313,7 @@ def test_metadata_fixup_warning(resources, outdir):
|
|||||||
meta['prism2:publicationName'] = 'OCRmyPDF Test'
|
meta['prism2:publicationName'] = 'OCRmyPDF Test'
|
||||||
graph.save(outdir / 'graph.repaired.pdf')
|
graph.save(outdir / 'graph.repaired.pdf')
|
||||||
|
|
||||||
log = MagicMock()
|
log = logging.getLogger()
|
||||||
context = MagicMock()
|
context = MagicMock()
|
||||||
metadata_fixup(
|
metadata_fixup(
|
||||||
input_files_groups=input_files,
|
input_files_groups=input_files,
|
||||||
@@ -320,7 +321,7 @@ def test_metadata_fixup_warning(resources, outdir):
|
|||||||
log=log,
|
log=log,
|
||||||
context=context,
|
context=context,
|
||||||
)
|
)
|
||||||
log.warning.assert_called_once()
|
assert any(record.levelname == 'WARNING' for record in caplog.records)
|
||||||
|
|
||||||
|
|
||||||
def test_prevent_gs_invalid_xml(resources, outdir):
|
def test_prevent_gs_invalid_xml(resources, outdir):
|
||||||
|
|||||||
@@ -16,9 +16,10 @@
|
|||||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
from os import fspath
|
from os import fspath
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@@ -57,7 +58,7 @@ def test_no_unpaper(resources, no_outpdf):
|
|||||||
with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
|
with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
|
||||||
mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
|
mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
|
||||||
with pytest.raises(SystemExit):
|
with pytest.raises(SystemExit):
|
||||||
main.check_options(options, log=MagicMock())
|
main.check_options(options, log=logging.getLogger())
|
||||||
|
|
||||||
|
|
||||||
def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):
|
def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):
|
||||||
|
|||||||
@@ -15,35 +15,15 @@
|
|||||||
# You should have received a copy of the GNU General Public License
|
# You should have received a copy of the GNU General Public License
|
||||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
from unittest.mock import MagicMock
|
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import pikepdf
|
import pikepdf
|
||||||
from ocrmypdf._weave import _fix_toc, _update_page_resources
|
|
||||||
|
|
||||||
check_ocrmypdf = pytest.helpers.check_ocrmypdf
|
check_ocrmypdf = pytest.helpers.check_ocrmypdf
|
||||||
|
|
||||||
|
|
||||||
def test_invalid_toc(resources, outdir, caplog):
|
|
||||||
pdf = pikepdf.open(resources / 'toc.pdf')
|
|
||||||
|
|
||||||
# Corrupt a TOC entry
|
|
||||||
pdf.Root.Outlines.Last.Dest = pikepdf.Array([None, 0.0, 0.1, 0.2])
|
|
||||||
pdf.save(outdir / 'test.pdf')
|
|
||||||
|
|
||||||
pdf = pikepdf.open(outdir / 'test.pdf')
|
|
||||||
remap = {}
|
|
||||||
remap[pdf.pages[0].objgen] = pdf.pages[0].objgen # Dummy remap
|
|
||||||
|
|
||||||
# Confirm we complain about the TOC and don't throw an exception
|
|
||||||
log = logging.getLogger()
|
|
||||||
_fix_toc(pdf, remap, log)
|
|
||||||
assert 'invalid table of contents entries' in caplog.text
|
|
||||||
|
|
||||||
|
|
||||||
def test_no_glyphless_weave(resources, outdir):
|
def test_no_glyphless_weave(resources, outdir):
|
||||||
pdf = pikepdf.open(resources / 'francais.pdf')
|
pdf = pikepdf.open(resources / 'francais.pdf')
|
||||||
pdf_aspect = pikepdf.open(resources / 'aspect.pdf')
|
pdf_aspect = pikepdf.open(resources / 'aspect.pdf')
|
||||||
@@ -62,3 +42,21 @@ def test_no_glyphless_weave(resources, outdir):
|
|||||||
'0',
|
'0',
|
||||||
env=env,
|
env=env,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.helpers.needs_pdfminer
|
||||||
|
def test_links(resources, outpdf):
|
||||||
|
check_ocrmypdf(
|
||||||
|
resources / 'link.pdf',
|
||||||
|
outpdf,
|
||||||
|
'--redo-ocr',
|
||||||
|
'--oversample',
|
||||||
|
'200',
|
||||||
|
'--output-type',
|
||||||
|
'pdf',
|
||||||
|
)
|
||||||
|
pdf = pikepdf.open(outpdf)
|
||||||
|
p1 = pdf.pages[0]
|
||||||
|
p2 = pdf.pages[1]
|
||||||
|
assert p1.Annots[0].A.D[0].objgen == p2.objgen
|
||||||
|
assert p2.Annots[0].A.D[0].objgen == p1.objgen
|
||||||
|
|||||||
Reference in New Issue
Block a user