Merge branch 'feature/misc-breaking'

This commit is contained in:
James R. Barlow
2021-04-01 16:51:04 -07:00
58 changed files with 109 additions and 172 deletions

View File

@@ -166,6 +166,7 @@ class GhostscriptFollower:
def generate_pdfa(
pdf_pages,
output_file: os.PathLike,
*,
compression: str,
pdf_version: str = '1.5',
pdfa_part: str = '2',
@@ -195,10 +196,11 @@ def generate_pdfa(
"-dAutoFilterGrayImages=true",
]
strategy = 'LeaveColorUnchanged'
# Older versions of Ghostscript expect a leading slash in
# sColorConversionStrategy, newer ones should not have it. See Ghostscript
# git commit fe1c025d.
strategy = 'RGB' if version() >= '9.19' else '/RGB'
strategy = ('/' + strategy) if version() < '9.19' else strategy
if version() == '9.23':
# 9.23: added JPEG passthrough as a new feature, but with a bug that

View File

@@ -221,6 +221,7 @@ def _generate_null_hocr(output_hocr, output_text, image):
def generate_hocr(
*,
input_file: Path,
output_hocr: Path,
output_text: Path,

View File

@@ -69,7 +69,7 @@ def _setup_unpaper_io(tmpdir: Path, input_file: Path) -> Tuple[Path, Path]:
def run(
input_file: Path, output_file: Path, dpi: DecFloat, mode_args: List[str]
input_file: Path, output_file: Path, *, dpi: DecFloat, mode_args: List[str]
) -> None:
args_unpaper = ['unpaper', '-v', '--dpi', str(round(dpi, 6))] + mode_args
@@ -114,6 +114,7 @@ def validate_custom_args(args: str) -> List[str]:
def clean(
input_file: Path,
output_file: Path,
*,
dpi: DecFloat,
unpaper_args: Optional[List[str]] = None,
):
@@ -130,4 +131,4 @@ def clean(
]
if not unpaper_args:
unpaper_args = default_args
run(input_file, output_file, dpi, unpaper_args)
run(input_file, output_file, dpi=dpi, mode_args=unpaper_args)

View File

@@ -6,22 +6,31 @@
import logging
import uuid
from contextlib import suppress
from pathlib import Path
from typing import Optional
import pikepdf
from pikepdf.objects import Dictionary, Name
log = logging.getLogger(__name__)
MAX_REPLACE_PAGES = 100
def _update_page_resources(*, page, font, font_key, procset):
"""Update this page's fonts with a reference to the Glyphless font"""
def _ensure_dictionary(obj, name):
if name not in obj:
obj[name] = pikepdf.Dictionary({})
return obj[name]
if '/Resources' not in page:
page['/Resources'] = pikepdf.Dictionary({})
resources = page['/Resources']
def _update_resources(*, obj, font, font_key, procset):
"""Update this obj's fonts with a reference to the Glyphless font.
obj can be a page or Form XObject.
"""
resources = _ensure_dictionary(obj, '/Resources')
try:
fonts = resources['/Font']
except KeyError:
@@ -32,7 +41,8 @@ def _update_page_resources(*, page, font, font_key, procset):
# Reassign /ProcSet to one that just lists everything - ProcSet is
# obsolete and doesn't matter but recommended for old viewer support
resources['/ProcSet'] = procset
if procset:
resources['/ProcSet'] = procset
def strip_invisible_text(pdf, page):
@@ -169,13 +179,13 @@ class OcrGrafter:
"""
page0 = self.pdf_base.pages[0]
_update_page_resources(
page=page0, font=self.font, font_key=self.font_key, procset=self.procset
_update_resources(
obj=page0, font=self.font, font_key=self.font_key, procset=self.procset
)
# We cannot read and write the same file, that will corrupt it
# but we don't want to keep more copies than we need to. Delete intermediates.
# {interim_count} is the opened file we were updateing
# {interim_count} is the opened file we were updating
# {interim_count - 1} can be deleted
# {interim_count + 1} is the new file we will produce and open
old_file = self.output_file.with_suffix(f'.working{self.interim_count - 1}.pdf')
@@ -210,6 +220,7 @@ class OcrGrafter:
pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {})
except (AttributeError, IndexError, KeyError):
return None, None
pdf_text_font = None
for f in possible_font_names:
pdf_text_font = pdf_text_fonts.get(f, None)
if pdf_text_font is not None:
@@ -279,17 +290,29 @@ class OcrGrafter:
# finally move the lower left corner to match the mediabox
ctm = translate @ rotate @ scale @ untranslate @ corner
pdf_text_contents = (
b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'
base_resources = _ensure_dictionary(base_page, '/Resources')
base_xobjs = _ensure_dictionary(base_resources, '/XObject')
text_xobj_name = Name('/' + str(uuid.uuid4()))
xobj = self.pdf_base.make_stream(pdf_text_contents)
base_xobjs[text_xobj_name] = xobj
xobj.Type = Name.XObject
xobj.Subtype = Name.Form
xobj.FormType = 1
xobj.BBox = mediabox
_update_resources(
obj=xobj, font=font, font_key=font_key, procset=[Name.PDF]
)
new_text_layer = pikepdf.Stream(self.pdf_base, pdf_text_contents)
pdf_draw_xobj = (
(b'q %s cm\n' % ctm.encode()) + (b'%s Do\n' % text_xobj_name) + b'\nQ\n'
)
new_text_layer = pikepdf.Stream(self.pdf_base, pdf_draw_xobj)
if strip_old_text:
strip_invisible_text(self.pdf_base, base_page)
base_page.page_contents_add(new_text_layer, prepend=True)
_update_page_resources(
page=base_page, font=font, font_key=font_key, procset=procset
_update_resources(
obj=base_page, font=font, font_key=font_key, procset=procset
)

View File

@@ -483,7 +483,12 @@ def preprocess_deskew(input_file: Path, page_context: PageContext):
def preprocess_clean(input_file: Path, page_context: PageContext):
output_file = page_context.get_path('pp_clean.png')
dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
unpaper.clean(input_file, output_file, dpi.x, page_context.options.unpaper_args)
unpaper.clean(
input_file,
output_file,
dpi=dpi.x,
unpaper_args=page_context.options.unpaper_args,
)
return output_file
@@ -627,9 +632,9 @@ def render_hocr_page(hocr: Path, page_context: PageContext):
dpi = get_page_square_dpi(page_context.pageinfo, options)
debug_mode = options.pdf_renderer == 'hocrdebug'
hocrtransform = HocrTransform(hocr, dpi.x) # square
hocrtransform = HocrTransform(hocr_filename=hocr, dpi=dpi.x) # square
hocrtransform.to_pdf(
output_file,
out_filename=output_file,
image_filename=None,
show_bounding_boxes=False if not debug_mode else True,
invisible_text=True if not debug_mode else False,

View File

@@ -48,6 +48,7 @@ class Verbosity(IntEnum):
def configure_logging(
verbosity: Verbosity,
*,
progress_bar_friendly: bool = True,
manage_root_logger: bool = False,
plugin_manager=None,

View File

@@ -77,7 +77,7 @@ class HocrTransform:
{'ﬀ': 'ff', 'ﬃ': 'ffi', 'ﬄ': 'ffl', 'ﬁ': 'fi', 'ﬂ': 'fl'}
)
def __init__(self, hocr_filename: Union[str, Path], dpi: float):
def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
self.dpi = dpi
self.hocr = ElementTree.parse(os.fspath(hocr_filename))
@@ -182,6 +182,7 @@ class HocrTransform:
def to_pdf(
self,
*,
out_filename: Path,
image_filename: Optional[Path] = None,
show_bounding_boxes: bool = False,
@@ -433,10 +434,10 @@ if __name__ == "__main__":
parser.add_argument('outputfile', help='Path to the PDF file to be generated')
args = parser.parse_args()
hocr = HocrTransform(args.hocrfile, args.resolution)
hocr = HocrTransform(hocr_filename=args.hocrfile, dpi=args.resolution)
hocr.to_pdf(
args.outputfile,
args.image,
args.boundingboxes,
out_filename=args.outputfile,
image_filename=args.image,
show_bounding_boxes=args.boundingboxes,
interword_spaces=args.interword_spaces,
)

View File

@@ -34,7 +34,7 @@ from ocrmypdf._concurrent import Executor, SerialExecutor
from ocrmypdf._exec import jbig2enc, pngquant
from ocrmypdf._jobcontext import PdfContext
from ocrmypdf.exceptions import OutputFileAccessError
from ocrmypdf.helpers import deprecated, safe_symlink
from ocrmypdf.helpers import safe_symlink
log = logging.getLogger(__name__)
@@ -62,10 +62,6 @@ def jpg_name(root: Path, xref: Xref) -> Path:
return img_name(root, xref, '.jpg')
def tif_name(root: Path, xref: Xref) -> Path:
    """Return the result of ``img_name`` for ``root``/``xref`` with the '.tif' extension."""
    extension = '.tif'
    return img_name(root, xref, extension)
def extract_image_filter(
pike: Pdf, root: Path, image: Object, xref: Xref
) -> Optional[Tuple[PdfImage, Tuple[Name, Object]]]:
@@ -523,81 +519,6 @@ def transcode_pngs(
_transcode_png(pike, filename, xref)
@deprecated
def rewrite_png_as_g4(pike: Pdf, im_obj: Object, compdata) -> None:  # pragma: no cover
    """Overwrite ``im_obj`` in place as a 1-bit ``CCITTFaxDecode`` image.

    Args:
        pike: Accepted for signature parity with the other rewriters; this
            function does not use it.
        im_obj: Image object whose dimensions, data and decode settings are
            replaced.
        compdata: Source of the new image; its ``w``, ``h`` and
            ``minisblack`` attributes are read and ``read()`` supplies the
            payload.
    """
    im_obj.BitsPerComponent = 1
    im_obj.Width = compdata.w
    im_obj.Height = compdata.h
    im_obj.write(compdata.read())

    log.debug(f"PNG to G4 {im_obj.objgen}")

    # Drop any decode settings left over from the previous encoding before
    # installing the new ones.
    if Name.Predictor in im_obj:
        del im_obj.Predictor
    if Name.DecodeParms in im_obj:
        del im_obj.DecodeParms

    fax_parms = Dictionary(
        K=-1, BlackIs1=bool(compdata.minisblack), Columns=compdata.w
    )
    im_obj.DecodeParms = fax_parms
    im_obj.Filter = Name.CCITTFaxDecode
@deprecated
def rewrite_png(pike: Pdf, im_obj: Object, compdata) -> None:  # pragma: no cover
    """Rewrite ``im_obj`` in place as a Flate-compressed image built from PNG data.

    Bit depth, dimensions, color space and predictor settings are read from
    ``compdata`` and stored on ``im_obj``; the payload from
    ``compdata.read()`` is then written with the ``FlateDecode`` filter.

    Args:
        pike: PDF used to create the palette stream when the image is
            palettized.
        im_obj: Image object to overwrite.
        compdata: Source of the compressed image. This function reads its
            ``predictor``, ``bps``, ``spp``, ``w``, ``h`` and ``ncolors``
            attributes and calls ``get_palette_pdf_string()`` and ``read()``.
            Presumably Leptonica compressed-data output -- confirm with caller.
    """
    # Inserting a PNG into a PDF amounts to copying the PNG's IDAT data and
    # moving the remaining PNG header information into PDF image metadata.
    # The PDF reader must also be told whether a predictor was applied before
    # Flate encoding (typically one was).
    #
    # According to the Leptonica source, readers do not need the exact
    # predictor, only one of:
    #   1     - no predictor
    #   10-14 - a predictor is present
    # Leptonica's compdata->predictor is just TRUE or FALSE. Any value >= 10
    # means the real predictor is given in the data, so the reader uses
    # whatever the PNG data specifies. Leptonica would in practice use Paeth
    # (14), but 15 appears to be the designated "optimal" value, so 15 is used.
    # See:
    #   - PDF RM 7.4.4.4 Table 10
    #   - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
    uses_predictor = compdata.predictor > 0
    decode_parms = Dictionary(Predictor=15 if uses_predictor else 1)
    if uses_predictor:
        decode_parms.BitsPerComponent = compdata.bps  # redundant, but intentional
        decode_parms.Colors = compdata.spp
        decode_parms.Columns = compdata.w

    im_obj.BitsPerComponent = compdata.bps
    im_obj.Width = compdata.w
    im_obj.Height = compdata.h

    log.debug(
        f"PNG {im_obj.objgen}: palette={compdata.ncolors} spp={compdata.spp} bps={compdata.bps}"
    )

    if compdata.ncolors > 0:
        # .ncolors counts palette entries, not the colors used by a true
        # color image. The palette string is always RGB tuples, even for a
        # grayscale image; see
        # https://github.com/DanBloomberg/leptonica/blob/master/src/colormap.c#L2067
        parsed_palette = pikepdf.Object.parse(compdata.get_palette_pdf_string())
        palette_stream = pikepdf.Stream(pike, bytes(parsed_palette))
        colorspace = [
            Name.Indexed,
            Name.DeviceRGB,
            compdata.ncolors - 1,
            palette_stream,
        ]
    else:
        # ncolors == 0: a plain colorspace with no palette, chosen by samples
        # per pixel; anything other than 1 or 4 is treated as RGB (spp == 3).
        colorspace = {1: Name.DeviceGray, 4: Name.DeviceCMYK}.get(
            compdata.spp, Name.DeviceRGB
        )

    im_obj.ColorSpace = colorspace
    im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=decode_parms)
def optimize(
input_file: Path,
output_file: Path,

View File

@@ -1,64 +1,44 @@
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__deu__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/francais.pdf", "args": ["-l", "deu", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/2400dpi.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/graph_ocred.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000003_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000003_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000004_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000004_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000002_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000002_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000004_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000004_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000003_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000003_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000002_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000002_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/poster.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/poster.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__deu__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/francais.pdf", "args": ["-l", "deu", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/2400dpi.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/graph_ocred.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/poster.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_hocr", "hocr", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_tess", "pdf", "txt"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/poster.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__osd__--psm__0__000002_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000002_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__osd__--psm__0__000003_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000003_rasterize_preview.jpg", "stdout"]}
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__osd__--psm__0__000004_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000004_rasterize_preview.jpg", "stdout"]}

View File

@@ -53,8 +53,10 @@ def test_mono_image(blank_hocr, outdir):
im.putpixel((n, n), 1)
im.save(outdir / 'mono.tif', format='TIFF')
hocr = hocrtransform.HocrTransform(str(blank_hocr), 300)
hocr.to_pdf(str(outdir / 'mono.pdf'), image_filename=str(outdir / 'mono.tif'))
hocr = hocrtransform.HocrTransform(hocr_filename=str(blank_hocr), dpi=300)
hocr.to_pdf(
out_filename=str(outdir / 'mono.pdf'), image_filename=str(outdir / 'mono.tif')
)
check_pdf(str(outdir / 'mono.pdf'))

View File

@@ -143,7 +143,7 @@ def test_multiple_pngs(resources, outdir):
outputstream=inpdf,
)
def mockquant(input_file, output_file, _quality_min, _quality_max):
def mockquant(input_file, output_file, *args):
with Image.open(input_file) as im:
draw = ImageDraw.Draw(im)
draw.rectangle((0, 0, im.width, im.height), fill=128)