mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-05 05:05:44 -04:00
Merge v6.1.5
This commit is contained in:
@@ -287,7 +287,7 @@ If you prefer to not modify your system in this matter, consider using a Docker
|
||||
sudo apt-get update
|
||||
|
||||
sudo apt-get install \
|
||||
python3.6 \
|
||||
python3.6-dev \
|
||||
ghostscript \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
@@ -298,7 +298,7 @@ Now we need to install ``pip`` and let it install ocrmypdf:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
wget -O - -o /dev/null https://bootstrap.pypa.io/get-pip.py | python3.6
|
||||
curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.6 && python3.6 -m easy_install pip
|
||||
pip3.6 install ocrmypdf[fitz]
|
||||
|
||||
The download command above will fetch a program and run it.
|
||||
|
||||
@@ -9,10 +9,18 @@ The OCRmyPDF package itself does not contain a public API, although it is fairly
|
||||
find: [^`]\#([0-9]{1,3})[^0-9]
|
||||
replace: `#$1 <https://github.com/jbarlow83/OCRmyPDF/issues/$1>`_
|
||||
|
||||
v6.1.5
|
||||
------
|
||||
|
||||
- Fix issue `#253 <https://github.com/jbarlow83/OCRmyPDF/issues/253>`_, a possible division by zero when using the ``hocr`` renderer.
|
||||
|
||||
- Fix incorrectly formatted ``<xmp:ModifyDate>`` field inside XMP metadata for PDF/As. veraPDF flags this as a PDF/A validation failure. The error caused the timezone and final digit of the seconds of the modified time to be omitted, so at worst the modification time stamp is rounded to the nearest 10 seconds.
|
||||
|
||||
|
||||
v6.1.4
|
||||
------
|
||||
|
||||
- Fix issue #248, ``--clean`` argument may remove OCR from left column of text on certain documents. We now set ``--layout none`` to suppress this.
|
||||
- Fix issue `#248 <https://github.com/jbarlow83/OCRmyPDF/issues/248>`_, ``--clean`` argument may remove OCR from left column of text on certain documents. We now set ``--layout none`` to suppress this.
|
||||
|
||||
- The test cache was updated to reflect the change above.
|
||||
|
||||
|
||||
@@ -32,6 +32,18 @@ def version():
|
||||
return get_version('gs')
|
||||
|
||||
|
||||
def jpeg_passthrough_available():
|
||||
"""
|
||||
Ghostscript 9.23 introduced JPEG passthrough but it seems to corrupt the
|
||||
last two bytes of certain images, for now we disable it for 9.23 and
|
||||
do not mention it for < 9.23.
|
||||
|
||||
https://bugs.ghostscript.com/show_bug.cgi?id=699216
|
||||
|
||||
"""
|
||||
return False
|
||||
|
||||
|
||||
def _gs_error_reported(stream):
|
||||
return re.search(r'error', stream, flags=re.IGNORECASE)
|
||||
|
||||
@@ -133,7 +145,8 @@ def generate_pdfa(pdf_pages, output_file, compression, log,
|
||||
strategy = 'RGB' if version() >= '9.19' else '/RGB'
|
||||
|
||||
if version() == '9.23':
|
||||
# 9.23: JPEG passthrough broken for image masks?
|
||||
# 9.23: new feature JPEG passthrough is broken in some cases, best to
|
||||
# disable it always
|
||||
# https://bugs.ghostscript.com/show_bug.cgi?id=699216
|
||||
compression_args.append('-dPassThroughJPEGImages=false')
|
||||
|
||||
|
||||
@@ -322,8 +322,11 @@ class HocrTransform():
|
||||
dy = baseline_y2 - cursor[1]
|
||||
text.moveCursor(dx, dy)
|
||||
|
||||
text.setHorizScale(100 * box_width / font_width)
|
||||
text.textOut(elemtxt)
|
||||
# If reportlab tells us this word is 0 units wide, our best seems
|
||||
# to be to suppress this text
|
||||
if font_width > 0:
|
||||
text.setHorizScale(100 * box_width / font_width)
|
||||
text.textOut(elemtxt)
|
||||
pdf.drawText(text)
|
||||
|
||||
|
||||
|
||||
@@ -143,6 +143,7 @@ def decode_pdf_date(s: str) -> datetime:
|
||||
|
||||
if s.startswith('D:'):
|
||||
s = s[2:]
|
||||
s = s.replace("'", "") # Remove apos from PDF time strings
|
||||
for fmt in pdfmark_date_fmts:
|
||||
try:
|
||||
return datetime.strptime(s, fmt)
|
||||
|
||||
@@ -110,6 +110,7 @@ licensed under the specified license.
|
||||
- Wikipedia authors
|
||||
- CC-BY-SA 3.0
|
||||
* - missing_docinfo.pdf
|
||||
- @jbarlow83
|
||||
- @jbarlow83
|
||||
- PDF file with no /DocumentInfo section
|
||||
- CC-BY-SA 4.0
|
||||
@@ -117,7 +118,7 @@ licensed under the specified license.
|
||||
- PDF file generated by PDFPen pro that triggered content stream parse errors
|
||||
- @maxandersen
|
||||
- @maxandersen
|
||||
- MIT
|
||||
- CC-BY-SA 4.0
|
||||
* - negzero.pdf
|
||||
- copy of formxobject.pdf with token that qpdf doesn't like
|
||||
- @jbarlow83
|
||||
@@ -137,7 +138,7 @@ licensed under the specified license.
|
||||
- a PDF with vector art and text rendered as curves with no fonts
|
||||
- @Catscratch
|
||||
- @Catscratch
|
||||
- MIT
|
||||
- CC-BY-SA 4.0
|
||||
|
||||
|
||||
Assemblies
|
||||
|
||||
@@ -610,10 +610,6 @@ def test_masks(spoof_tesseract_noop, resources, outpdf):
|
||||
p, out, err = run_ocrmypdf(
|
||||
resources / 'masks.pdf', outpdf, env=spoof_tesseract_noop)
|
||||
|
||||
if ghostscript.version() == '9.23' and \
|
||||
p.returncode == ExitCode.invalid_output_pdf:
|
||||
pytest.xfail('https://bugs.ghostscript.com/show_bug.cgi?id=699216')
|
||||
|
||||
assert p.returncode == ExitCode.ok
|
||||
|
||||
|
||||
@@ -906,7 +902,7 @@ def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec,
|
||||
if compression == "jpeg":
|
||||
assert pdfimage.enc == Encoding.jpeg
|
||||
else:
|
||||
if ghostscript.version() >= '9.23':
|
||||
if ghostscript.jpeg_passthrough_available():
|
||||
# Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be
|
||||
# copied without transcoding - so report
|
||||
if image.endswith('jpg'):
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
import pytest
|
||||
import PyPDF2 as pypdf
|
||||
import datetime
|
||||
from datetime import timezone
|
||||
|
||||
from ocrmypdf.pdfa import file_claims_pdfa, encode_pdf_date, decode_pdf_date
|
||||
from ocrmypdf.exceptions import ExitCode
|
||||
@@ -150,6 +151,6 @@ def test_creation_date_preserved(spoof_tesseract_noop, output_type, resources,
|
||||
# We expect that the modified date is quite recent
|
||||
date_after = decode_pdf_date(after['/ModDate'])
|
||||
assert seconds_between_dates(
|
||||
date_after, datetime.datetime.utcnow()) < 1000
|
||||
date_after, datetime.datetime.now(timezone.utc)) < 1000
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user