mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-19 12:04:44 -04:00
Merge branch 'feature/misc-breaking'
This commit is contained in:
@@ -166,6 +166,7 @@ class GhostscriptFollower:
|
||||
def generate_pdfa(
|
||||
pdf_pages,
|
||||
output_file: os.PathLike,
|
||||
*,
|
||||
compression: str,
|
||||
pdf_version: str = '1.5',
|
||||
pdfa_part: str = '2',
|
||||
@@ -195,10 +196,11 @@ def generate_pdfa(
|
||||
"-dAutoFilterGrayImages=true",
|
||||
]
|
||||
|
||||
strategy = 'LeaveColorUnchanged'
|
||||
# Older versions of Ghostscript expect a leading slash in
|
||||
# sColorConversionStrategy, newer ones should not have it. See Ghostscript
|
||||
# git commit fe1c025d.
|
||||
strategy = 'RGB' if version() >= '9.19' else '/RGB'
|
||||
strategy = ('/' + strategy) if version() < '9.19' else strategy
|
||||
|
||||
if version() == '9.23':
|
||||
# 9.23: added JPEG passthrough as a new feature, but with a bug that
|
||||
|
||||
@@ -221,6 +221,7 @@ def _generate_null_hocr(output_hocr, output_text, image):
|
||||
|
||||
|
||||
def generate_hocr(
|
||||
*,
|
||||
input_file: Path,
|
||||
output_hocr: Path,
|
||||
output_text: Path,
|
||||
|
||||
@@ -69,7 +69,7 @@ def _setup_unpaper_io(tmpdir: Path, input_file: Path) -> Tuple[Path, Path]:
|
||||
|
||||
|
||||
def run(
|
||||
input_file: Path, output_file: Path, dpi: DecFloat, mode_args: List[str]
|
||||
input_file: Path, output_file: Path, *, dpi: DecFloat, mode_args: List[str]
|
||||
) -> None:
|
||||
args_unpaper = ['unpaper', '-v', '--dpi', str(round(dpi, 6))] + mode_args
|
||||
|
||||
@@ -114,6 +114,7 @@ def validate_custom_args(args: str) -> List[str]:
|
||||
def clean(
|
||||
input_file: Path,
|
||||
output_file: Path,
|
||||
*,
|
||||
dpi: DecFloat,
|
||||
unpaper_args: Optional[List[str]] = None,
|
||||
):
|
||||
@@ -130,4 +131,4 @@ def clean(
|
||||
]
|
||||
if not unpaper_args:
|
||||
unpaper_args = default_args
|
||||
run(input_file, output_file, dpi, unpaper_args)
|
||||
run(input_file, output_file, dpi=dpi, mode_args=unpaper_args)
|
||||
|
||||
@@ -6,22 +6,31 @@
|
||||
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from contextlib import suppress
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pikepdf
|
||||
from pikepdf.objects import Dictionary, Name
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
MAX_REPLACE_PAGES = 100
|
||||
|
||||
|
||||
def _update_page_resources(*, page, font, font_key, procset):
|
||||
"""Update this page's fonts with a reference to the Glyphless font"""
|
||||
def _ensure_dictionary(obj, name):
|
||||
if name not in obj:
|
||||
obj[name] = pikepdf.Dictionary({})
|
||||
return obj[name]
|
||||
|
||||
if '/Resources' not in page:
|
||||
page['/Resources'] = pikepdf.Dictionary({})
|
||||
resources = page['/Resources']
|
||||
|
||||
def _update_resources(*, obj, font, font_key, procset):
|
||||
"""Update this obj's fonts with a reference to the Glyphless font.
|
||||
|
||||
obj can be a page or Form XObject.
|
||||
"""
|
||||
|
||||
resources = _ensure_dictionary(obj, '/Resources')
|
||||
try:
|
||||
fonts = resources['/Font']
|
||||
except KeyError:
|
||||
@@ -32,7 +41,8 @@ def _update_page_resources(*, page, font, font_key, procset):
|
||||
|
||||
# Reassign /ProcSet to one that just lists everything - ProcSet is
|
||||
# obsolete and doesn't matter but recommended for old viewer support
|
||||
resources['/ProcSet'] = procset
|
||||
if procset:
|
||||
resources['/ProcSet'] = procset
|
||||
|
||||
|
||||
def strip_invisible_text(pdf, page):
|
||||
@@ -169,13 +179,13 @@ class OcrGrafter:
|
||||
"""
|
||||
|
||||
page0 = self.pdf_base.pages[0]
|
||||
_update_page_resources(
|
||||
page=page0, font=self.font, font_key=self.font_key, procset=self.procset
|
||||
_update_resources(
|
||||
obj=page0, font=self.font, font_key=self.font_key, procset=self.procset
|
||||
)
|
||||
|
||||
# We cannot read and write the same file, that will corrupt it
|
||||
# but we don't to keep more copies than we need to. Delete intermediates.
|
||||
# {interim_count} is the opened file we were updateing
|
||||
# {interim_count} is the opened file we were updating
|
||||
# {interim_count - 1} can be deleted
|
||||
# {interim_count + 1} is the new file will produce and open
|
||||
old_file = self.output_file.with_suffix(f'.working{self.interim_count - 1}.pdf')
|
||||
@@ -210,6 +220,7 @@ class OcrGrafter:
|
||||
pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {})
|
||||
except (AttributeError, IndexError, KeyError):
|
||||
return None, None
|
||||
pdf_text_font = None
|
||||
for f in possible_font_names:
|
||||
pdf_text_font = pdf_text_fonts.get(f, None)
|
||||
if pdf_text_font is not None:
|
||||
@@ -279,17 +290,29 @@ class OcrGrafter:
|
||||
# finally move the lower left corner to match the mediabox
|
||||
ctm = translate @ rotate @ scale @ untranslate @ corner
|
||||
|
||||
pdf_text_contents = (
|
||||
b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'
|
||||
base_resources = _ensure_dictionary(base_page, '/Resources')
|
||||
base_xobjs = _ensure_dictionary(base_resources, '/XObject')
|
||||
text_xobj_name = Name('/' + str(uuid.uuid4()))
|
||||
xobj = self.pdf_base.make_stream(pdf_text_contents)
|
||||
base_xobjs[text_xobj_name] = xobj
|
||||
xobj.Type = Name.XObject
|
||||
xobj.Subtype = Name.Form
|
||||
xobj.FormType = 1
|
||||
xobj.BBox = mediabox
|
||||
_update_resources(
|
||||
obj=xobj, font=font, font_key=font_key, procset=[Name.PDF]
|
||||
)
|
||||
|
||||
new_text_layer = pikepdf.Stream(self.pdf_base, pdf_text_contents)
|
||||
pdf_draw_xobj = (
|
||||
(b'q %s cm\n' % ctm.encode()) + (b'%s Do\n' % text_xobj_name) + b'\nQ\n'
|
||||
)
|
||||
new_text_layer = pikepdf.Stream(self.pdf_base, pdf_draw_xobj)
|
||||
|
||||
if strip_old_text:
|
||||
strip_invisible_text(self.pdf_base, base_page)
|
||||
|
||||
base_page.page_contents_add(new_text_layer, prepend=True)
|
||||
|
||||
_update_page_resources(
|
||||
page=base_page, font=font, font_key=font_key, procset=procset
|
||||
_update_resources(
|
||||
obj=base_page, font=font, font_key=font_key, procset=procset
|
||||
)
|
||||
|
||||
@@ -483,7 +483,12 @@ def preprocess_deskew(input_file: Path, page_context: PageContext):
|
||||
def preprocess_clean(input_file: Path, page_context: PageContext):
|
||||
output_file = page_context.get_path('pp_clean.png')
|
||||
dpi = get_page_square_dpi(page_context.pageinfo, page_context.options)
|
||||
unpaper.clean(input_file, output_file, dpi.x, page_context.options.unpaper_args)
|
||||
unpaper.clean(
|
||||
input_file,
|
||||
output_file,
|
||||
dpi=dpi.x,
|
||||
unpaper_args=page_context.options.unpaper_args,
|
||||
)
|
||||
return output_file
|
||||
|
||||
|
||||
@@ -627,9 +632,9 @@ def render_hocr_page(hocr: Path, page_context: PageContext):
|
||||
dpi = get_page_square_dpi(page_context.pageinfo, options)
|
||||
debug_mode = options.pdf_renderer == 'hocrdebug'
|
||||
|
||||
hocrtransform = HocrTransform(hocr, dpi.x) # square
|
||||
hocrtransform = HocrTransform(hocr_filename=hocr, dpi=dpi.x) # square
|
||||
hocrtransform.to_pdf(
|
||||
output_file,
|
||||
out_filename=output_file,
|
||||
image_filename=None,
|
||||
show_bounding_boxes=False if not debug_mode else True,
|
||||
invisible_text=True if not debug_mode else False,
|
||||
|
||||
@@ -48,6 +48,7 @@ class Verbosity(IntEnum):
|
||||
|
||||
def configure_logging(
|
||||
verbosity: Verbosity,
|
||||
*,
|
||||
progress_bar_friendly: bool = True,
|
||||
manage_root_logger: bool = False,
|
||||
plugin_manager=None,
|
||||
|
||||
@@ -77,7 +77,7 @@ class HocrTransform:
|
||||
{'ff': 'ff', 'ffi': 'ffi', 'ffl': 'ffl', 'fi': 'fi', 'fl': 'fl'}
|
||||
)
|
||||
|
||||
def __init__(self, hocr_filename: Union[str, Path], dpi: float):
|
||||
def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
|
||||
self.dpi = dpi
|
||||
self.hocr = ElementTree.parse(os.fspath(hocr_filename))
|
||||
|
||||
@@ -182,6 +182,7 @@ class HocrTransform:
|
||||
|
||||
def to_pdf(
|
||||
self,
|
||||
*,
|
||||
out_filename: Path,
|
||||
image_filename: Optional[Path] = None,
|
||||
show_bounding_boxes: bool = False,
|
||||
@@ -433,10 +434,10 @@ if __name__ == "__main__":
|
||||
parser.add_argument('outputfile', help='Path to the PDF file to be generated')
|
||||
args = parser.parse_args()
|
||||
|
||||
hocr = HocrTransform(args.hocrfile, args.resolution)
|
||||
hocr = HocrTransform(hocr_filename=args.hocrfile, dpi=args.resolution)
|
||||
hocr.to_pdf(
|
||||
args.outputfile,
|
||||
args.image,
|
||||
args.boundingboxes,
|
||||
out_filename=args.outputfile,
|
||||
image_filename=args.image,
|
||||
show_bounding_boxes=args.boundingboxes,
|
||||
interword_spaces=args.interword_spaces,
|
||||
)
|
||||
|
||||
@@ -34,7 +34,7 @@ from ocrmypdf._concurrent import Executor, SerialExecutor
|
||||
from ocrmypdf._exec import jbig2enc, pngquant
|
||||
from ocrmypdf._jobcontext import PdfContext
|
||||
from ocrmypdf.exceptions import OutputFileAccessError
|
||||
from ocrmypdf.helpers import deprecated, safe_symlink
|
||||
from ocrmypdf.helpers import safe_symlink
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@@ -62,10 +62,6 @@ def jpg_name(root: Path, xref: Xref) -> Path:
|
||||
return img_name(root, xref, '.jpg')
|
||||
|
||||
|
||||
def tif_name(root: Path, xref: Xref) -> Path:
|
||||
return img_name(root, xref, '.tif')
|
||||
|
||||
|
||||
def extract_image_filter(
|
||||
pike: Pdf, root: Path, image: Object, xref: Xref
|
||||
) -> Optional[Tuple[PdfImage, Tuple[Name, Object]]]:
|
||||
@@ -523,81 +519,6 @@ def transcode_pngs(
|
||||
_transcode_png(pike, filename, xref)
|
||||
|
||||
|
||||
@deprecated
|
||||
def rewrite_png_as_g4(pike: Pdf, im_obj: Object, compdata) -> None: # pragma: no cover
|
||||
im_obj.BitsPerComponent = 1
|
||||
im_obj.Width = compdata.w
|
||||
im_obj.Height = compdata.h
|
||||
|
||||
im_obj.write(compdata.read())
|
||||
|
||||
log.debug(f"PNG to G4 {im_obj.objgen}")
|
||||
if Name.Predictor in im_obj:
|
||||
del im_obj.Predictor
|
||||
if Name.DecodeParms in im_obj:
|
||||
del im_obj.DecodeParms
|
||||
im_obj.DecodeParms = Dictionary(
|
||||
K=-1, BlackIs1=bool(compdata.minisblack), Columns=compdata.w
|
||||
)
|
||||
|
||||
im_obj.Filter = Name.CCITTFaxDecode
|
||||
return
|
||||
|
||||
|
||||
@deprecated
|
||||
def rewrite_png(pike: Pdf, im_obj: Object, compdata) -> None: # pragma: no cover
|
||||
# When a PNG is inserted into a PDF, we more or less copy the IDAT section from
|
||||
# the PDF and transfer the rest of the PNG headers to PDF image metadata.
|
||||
# One thing we have to do is tell the PDF reader whether a predictor was used
|
||||
# on the image before Flate encoding. (Typically one is.)
|
||||
# According to Leptonica source, PDF readers don't actually need us
|
||||
# to specify the correct predictor, they just need a value of either:
|
||||
# 1 - no predictor
|
||||
# 10-14 - there is a predictor
|
||||
# Leptonica's compdata->predictor only tells TRUE or FALSE
|
||||
# 10-14 means the actual predictor is specified in the data, so for any
|
||||
# number >= 10 the PDF reader will use whatever the PNG data specifies.
|
||||
# In practice Leptonica should use Paeth, 14, but 15 seems to be the
|
||||
# designated value for "optimal". So we will use 15.
|
||||
# See:
|
||||
# - PDF RM 7.4.4.4 Table 10
|
||||
# - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
|
||||
predictor = 15 if compdata.predictor > 0 else 1
|
||||
dparms = Dictionary(Predictor=predictor)
|
||||
if predictor > 1:
|
||||
dparms.BitsPerComponent = compdata.bps # Yes, this is redundant
|
||||
dparms.Colors = compdata.spp
|
||||
dparms.Columns = compdata.w
|
||||
|
||||
im_obj.BitsPerComponent = compdata.bps
|
||||
im_obj.Width = compdata.w
|
||||
im_obj.Height = compdata.h
|
||||
|
||||
log.debug(
|
||||
f"PNG {im_obj.objgen}: palette={compdata.ncolors} spp={compdata.spp} bps={compdata.bps}"
|
||||
)
|
||||
if compdata.ncolors > 0:
|
||||
# .ncolors is the number of colors in the palette, not the number of
|
||||
# colors used in a true color image. The palette string is always
|
||||
# given as RGB tuples even when the image is grayscale; see
|
||||
# https://github.com/DanBloomberg/leptonica/blob/master/src/colormap.c#L2067
|
||||
palette_pdf_string = compdata.get_palette_pdf_string()
|
||||
palette_data = pikepdf.Object.parse(palette_pdf_string)
|
||||
palette_stream = pikepdf.Stream(pike, bytes(palette_data))
|
||||
palette = [Name.Indexed, Name.DeviceRGB, compdata.ncolors - 1, palette_stream]
|
||||
cs = palette
|
||||
else:
|
||||
# ncolors == 0 means we are using a colorspace without a palette
|
||||
if compdata.spp == 1:
|
||||
cs = Name.DeviceGray
|
||||
elif compdata.spp == 4:
|
||||
cs = Name.DeviceCMYK
|
||||
else: # spp == 3
|
||||
cs = Name.DeviceRGB
|
||||
im_obj.ColorSpace = cs
|
||||
im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)
|
||||
|
||||
|
||||
def optimize(
|
||||
input_file: Path,
|
||||
output_file: Path,
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
108
tests/cache/manifest.jsonl
vendored
108
tests/cache/manifest.jsonl
vendored
@@ -1,64 +1,44 @@
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__deu__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/francais.pdf", "args": ["-l", "deu", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/2400dpi.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/graph_ocred.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000003_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000003_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000004_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000004_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000002_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000002_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000004_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000004_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000003_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000003_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000002_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000002_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/poster.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "Darwin-18.7.0-x86_64-i386-64bit", "python": "3.7.7", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/poster.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__deu__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/francais.pdf", "args": ["-l", "deu", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__--psm__7__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_hocr__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/2400dpi.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/graph_ocred.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/poster.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000001_ocr.png__000001_ocr_tess__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001_ocr.png", "$TMPDIR/000001_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000002_ocr.png__000002_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002_ocr.png", "$TMPDIR/000002_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/3small.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000003_ocr.png__000003_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003_ocr.png", "$TMPDIR/000003_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000004_ocr.png__000004_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004_ocr.png", "$TMPDIR/000004_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000005_ocr.png__000005_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000005_ocr.png", "$TMPDIR/000005_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_hocr__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_hocr", "hocr", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__eng__000006_ocr.png__000006_ocr_tess__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000006_ocr.png", "$TMPDIR/000006_ocr_tess", "pdf", "txt"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__osd__--psm__0__000001_rasterize_preview.jpg__stdout", "sourcefile": "resources/poster.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__osd__--psm__0__000002_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000002_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__osd__--psm__0__000003_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000003_rasterize_preview.jpg", "stdout"]}
|
||||
{"tesseract_version": "4.1.1", "platform": "macOS-10.14.6-x86_64-i386-64bit", "python": "3.9.0", "argv_slug": "__-l__osd__--psm__0__000004_rasterize_preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000004_rasterize_preview.jpg", "stdout"]}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -53,8 +53,10 @@ def test_mono_image(blank_hocr, outdir):
|
||||
im.putpixel((n, n), 1)
|
||||
im.save(outdir / 'mono.tif', format='TIFF')
|
||||
|
||||
hocr = hocrtransform.HocrTransform(str(blank_hocr), 300)
|
||||
hocr.to_pdf(str(outdir / 'mono.pdf'), image_filename=str(outdir / 'mono.tif'))
|
||||
hocr = hocrtransform.HocrTransform(hocr_filename=str(blank_hocr), dpi=300)
|
||||
hocr.to_pdf(
|
||||
out_filename=str(outdir / 'mono.pdf'), image_filename=str(outdir / 'mono.tif')
|
||||
)
|
||||
|
||||
check_pdf(str(outdir / 'mono.pdf'))
|
||||
|
||||
|
||||
@@ -143,7 +143,7 @@ def test_multiple_pngs(resources, outdir):
|
||||
outputstream=inpdf,
|
||||
)
|
||||
|
||||
def mockquant(input_file, output_file, _quality_min, _quality_max):
|
||||
def mockquant(input_file, output_file, *args):
|
||||
with Image.open(input_file) as im:
|
||||
draw = ImageDraw.Draw(im)
|
||||
draw.rectangle((0, 0, im.width, im.height), fill=128)
|
||||
|
||||
Reference in New Issue
Block a user