Merge branch 'jbarlow83:master' into master

This commit is contained in:
Frank
2021-05-26 20:31:00 +02:00
396 changed files with 17047 additions and 9962 deletions

View File

@@ -1,16 +0,0 @@
# Coverage isn't really compatible with subprocesses so results are unreliable
[run]
branch = True
#concurrency = multiprocessing
source = ocrmypdf/
[report]
exclude_lines =
pragma: no cover
def __repr__
raise AssertionError
raise NotImplementedError
if 0:
if False:
if __name__ == .__main__.:

View File

@@ -1,16 +1,62 @@
# OCRmyPDF
#
FROM ubuntu:18.04
FROM ubuntu:20.04 as base
FROM base as builder
ENV LANG=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential autoconf automake libtool \
libleptonica-dev \
zlib1g-dev \
libexempi3 \
ocrmypdf \
python3-dev \
python3-distutils \
libffi-dev \
libqpdf-dev \
ca-certificates \
curl \
git
# Get the latest pip (Ubuntu version doesn't support manylinux2010)
RUN \
curl https://bootstrap.pypa.io/get-pip.py | python3
# Compile and install jbig2
# Needs libleptonica-dev, zlib1g-dev
RUN \
mkdir jbig2 \
&& curl -L https://github.com/agl/jbig2enc/archive/ea6a40a.tar.gz | \
tar xz -C jbig2 --strip-components=1 \
&& cd jbig2 \
&& ./autogen.sh && ./configure && make && make install \
&& cd .. \
&& rm -rf jbig2
COPY . /app
WORKDIR /app
RUN pip3 install --no-cache-dir \
-r requirements/main.txt \
-r requirements/webservice.txt \
-r requirements/test.txt \
-r requirements/watcher.txt \
.
FROM base
ENV LANG=C.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends \
ghostscript \
img2pdf \
liblept5 \
libsm6 libxext6 libxrender-dev \
zlib1g \
pngquant \
python3-pip \
python3-venv \
python3 \
qpdf \
tesseract-ocr \
tesseract-ocr-chi-sim \
tesseract-ocr-deu \
@@ -19,54 +65,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
tesseract-ocr-por \
tesseract-ocr-spa \
unpaper \
wget
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
ENV LANG=C.UTF-8
COPY --from=builder /usr/local/lib/ /usr/local/lib/
COPY --from=builder /usr/local/bin/ /usr/local/bin/
# Compile and install jbig2
# Needs libleptonica-dev, zlib1g-dev
RUN \
mkdir jbig2 \
&& wget -q https://github.com/agl/jbig2enc/archive/0.29.tar.gz -O - | \
tar xz -C jbig2 --strip-components=1 \
&& cd jbig2 \
&& ./autogen.sh && ./configure && make && make install \
&& cd .. \
&& rm -rf jbig2
COPY --from=builder /app/misc/webservice.py /app/
COPY --from=builder /app/misc/watcher.py /app/
RUN apt-get remove -y autoconf automake libtool
# Copy minimal project files to get the test suite.
COPY --from=builder /app/setup.cfg /app/setup.py /app/README.md /app/
COPY --from=builder /app/requirements /app/requirements
COPY --from=builder /app/tests /app/tests
RUN python3 -m venv --system-site-packages /appenv
# This installs the latest binary wheel instead of the code in the current
# folder. Installing from source will fail, apparently because cffi needs
# build-essentials (gcc) to do a source installation
# (i.e. "pip install ."). It's unclear to me why this is the case.
RUN . /appenv/bin/activate; \
pip install --upgrade pip \
&& pip install --upgrade ocrmypdf
# Now copy the application in, mainly to get the test suite.
# Do this now to make the best use of Docker cache.
COPY . /application
RUN . /appenv/bin/activate; \
pip install -r /application/requirements/test.txt
# Remove the junk, including the source version of application since it was
# already installed
RUN rm -rf /tmp/* /var/tmp/* /root/* /application/ocrmypdf \
&& apt-get remove -y build-essential \
&& apt-get autoremove -y \
&& apt-get autoclean -y
RUN useradd docker \
&& mkdir /home/docker \
&& chown docker:docker /home/docker
USER docker
WORKDIR /home/docker
# Must use array form of ENTRYPOINT
# Non-array form does not append other arguments, because that is "intuitive"
ENTRYPOINT ["/application/.docker/docker-wrapper.sh"]
ENTRYPOINT ["/usr/local/bin/ocrmypdf"]

View File

@@ -1,84 +0,0 @@
FROM alpine:3.9 as base
FROM base as builder
ENV LANG=C.UTF-8
RUN \
echo '@testing http://nl.alpinelinux.org/alpine/edge/testing' >> /etc/apk/repositories \
# Add runtime dependencies
&& apk add --update \
python3-dev \
py3-setuptools \
jbig2enc@testing \
ghostscript \
qpdf \
tesseract-ocr \
unpaper \
pngquant \
libxml2-dev \
libxslt-dev \
zlib-dev \
qpdf-dev \
libffi-dev \
leptonica-dev \
binutils \
# Install pybind11 for pikepdf
&& pip3 install pybind11 \
# Install flask for the webservice
&& pip3 install flask \
# Add build dependencies
&& apk add --virtual build-dependencies \
build-base \
git
COPY . /app
WORKDIR /app
RUN pip3 install .
FROM base
ENV LANG=C.UTF-8
RUN \
echo '@testing http://nl.alpinelinux.org/alpine/edge/testing' >> /etc/apk/repositories \
# Add runtime dependencies
&& apk add --update \
python3 \
jbig2enc@testing \
ghostscript \
qpdf \
tesseract-ocr \
tesseract-ocr-data-deu \
tesseract-ocr-data-chi_sim \
unpaper \
pngquant \
libxml2 \
libxslt \
zlib \
qpdf \
libffi \
leptonica-dev \
binutils \
&& mkdir /app
WORKDIR /app
# Copy build artifacts (python site-packages)
COPY --from=builder /usr/lib/python3.6/site-packages /usr/lib/python3.6/site-packages
COPY --from=builder /usr/bin/ocrmypdf /usr/bin/dumppdf.py /usr/bin/latin2ascii.py /usr/bin/pdf2txt.py /usr/bin/img2pdf /usr/bin/chardetect /usr/bin/
# Copy the webservice script
COPY --from=builder /app/.docker/webservice.py /app/
# Copy minimal project files to get the test suite.
COPY --from=builder /app/setup.cfg /app/setup.py /app/README.md /app/
COPY --from=builder /app/requirements /app/requirements
COPY --from=builder /app/tests /app/tests
COPY --from=builder /app/src /app/src
# Copy PKG-INFO from build artifact in app dir to make setuptools-scm happy
RUN cp /usr/lib/python3.6/site-packages/ocrmypdf-*.egg-info/PKG-INFO /app
ENTRYPOINT ["/usr/bin/ocrmypdf"]

View File

@@ -1,5 +0,0 @@
#!/bin/bash
. /appenv/bin/activate
cd /home/docker
exec ocrmypdf "$@"

View File

@@ -1,17 +0,0 @@
# OCRmyPDF polyglot
#
FROM jbarlow83/ocrmypdf:latest
USER root
# Update system and install our dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
tesseract-ocr-all
RUN apt-get autoremove -y && apt-get clean -y
USER docker
# Must use array form of ENTRYPOINT
# Non-array form does not append other arguments, because that is "intuitive"
ENTRYPOINT ["/application/.docker/docker-wrapper.sh"]

View File

@@ -1,24 +0,0 @@
# OCRmyPDF webservice
#
FROM jbarlow83/ocrmypdf-polyglot:latest
USER root
# Update system and install our dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3-flask
RUN apt-get autoremove -y && apt-get clean -y
EXPOSE 5000
COPY .docker/webservice.py /application
USER docker
VOLUME ["/config"]
# This config file is optional
ENV OCRMYPDF_WEBSERVICE_SETTINGS "/config/config.py"
ENTRYPOINT ["python3", "/application/webservice.py"]

View File

@@ -1,26 +1,44 @@
# dotfiles
.*
!.coveragerc
!.dockerignore
!.git_archival.txt
!.gitattributes
!.gitignore
!.pre-commit-config.yaml
!.readthedocs.yml
# Dev scratch
*.ipynb
*.pdf
*.pyc
*.rst
*.sublime*
**/*.pyc
.*/
!.git/
!.docker/
.ruffus_history.sqlite
bin/
build/
docs/
dist/
htmlcov/
include/
lib/
MANIFEST.in
ocrmypdf.egg-info/
staging/
tests/cache/
tests/output/
/*.pdf
/*.qdf
/*.png
/scratch.py
IDEAS
log/
tests/resources/private/
tmp/
venv*/
/debug_tests.py
*.traineddata
/private
# Package building
*.egg-info/
build/
dist/
wheelhouse/
pip-wheel-metadata/
# Code coverage
htmlcov/
# Docker specific
bin/
docs/
include/
lib/
# Docker include .git/
!.git/

1
.gitattributes vendored
View File

@@ -9,5 +9,6 @@
*.png binary
*.jpg binary
*.bin binary
*.afdesign binary
.git_archival.txt export-subst

12
.github/FUNDING.yml vendored Normal file
View File

@@ -0,0 +1,12 @@
# These are supported funding model platforms
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: james-barlow
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

View File

@@ -0,0 +1,32 @@
---
name: General issues
about: Installation, packages, dependencies, "nothing works", test suite failures...
title: ''
labels: ''
assignees: ''
---
**Describe the bug**
What's the problem?
**To Reproduce**
Steps to reproduce the behavior.
**Expected behavior**
What did you expect to happen?
**Screenshots**
If applicable, add screenshots to help explain your problem.
**System (please complete the following information):**
- OS:
- Python version:
- OCRmyPDF version:
**Installation**
How did you install OCRmyPDF? Did you install it from your operating system's
package manager, or using pip?
**Additional context**
Add any other context about the problem here.

View File

@@ -0,0 +1,40 @@
---
name: Problem with a specific input file
about: Something went wrong while trying to OCR a specific file
title: ''
labels: ''
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**To Reproduce**
What command line or API call were you trying to run?
```bash
ocrmypdf ...arguments... input.pdf output.pdf
```
Run with verbosity `-v1` or higher to see more detailed logging. This information may be helpful.
**Example file**
If your issue is a problem that affects only certain files, we will require an input file (PDF or image) that demonstrates your issue.
Please provide an input file with no personal or confidential information. At your option you may [GPG-encrypt the file](https://github.com/jbarlow83/OCRmyPDF/wiki) for OCRmyPDF's author only.
Links to files hosted elsewhere are perfectly acceptable. You could also look in ``tests/resources`` and see if any of those files reproduce your issue.
*(Issues without example files usually cannot be resolved. It's like reporting an issue against a web browser without providing a URL.)*
**Expected behavior**
A clear and concise description of what you expected to happen.
**Screenshots**
If applicable, add screenshots to help explain your problem.
**System**
- OS: [e.g. Linux, Windows, macOS]
- OCRmyPDF Version: ``ocrmypdf --version``
- How did you install ocrmypdf? Did you use a system package manager, `pip`, or a Docker image?

View File

@@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context or screenshots about the feature request here.

View File

@@ -1,33 +0,0 @@
**Describe the issue**
A clear and concise description of what the issue is.
**To Reproduce**
What command line were you trying to run?
```bash
ocrmypdf ...arguments... input.pdf output.pdf
```
**Example file**
Please include an example *input* PDF (or image). The input file is more helpful.
Please check any or all that apply about the test file:
- [ ] This is the input file
- [ ] The file contains no personal or confidential information
- [ ] I am the copyright holder for this file
- [ ] I permit this file to be included in the OCRmyPDF test suite under the CC-BY-SA 4.0 license
- [ ] I am not the copyright holder, but this file is available under a free software license
Files that are not free for inclusion in this project are quite welcome, but we like to collect free files for our test suite when possible. Please do *not* submit files with confidential information. At your option you may encrypt files for OCRmyPDF's author only.
**Expected behavior**
A clear and concise description of what you expected to happen. Include screenshots if applicable.
**System:**
- OS: [e.g. Linux, macOS]
- OCRmyPDF Version: [e.g. v7.4.0]
**Additional context**
Add any other context about the problem here.

275
.github/workflows/build.yml vendored Normal file
View File

@@ -0,0 +1,275 @@
name: Test and deploy
on:
push:
branches:
- master
- ci
- release/*
tags:
- v*
paths-ignore:
- README*
pull_request:
jobs:
test_linux:
name: Test ${{ matrix.os }} with Python ${{ matrix.python }}
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-18.04] #, ubuntu-20.04]
python: ["3.6"] #, "3.7", "3.8", "3.9"]
env:
OS: ${{ matrix.os }}
PYTHON: ${{ matrix.python }}
steps:
- uses: actions/checkout@v2
with:
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
- uses: actions/setup-python@v2
name: Install Python
with:
python-version: ${{ matrix.python }}
- name: Install common packages
run: |
sudo apt-get update
sudo apt-get install -y --no-install-recommends \
curl \
ghostscript \
img2pdf \
libffi-dev \
liblept5 \
libsm6 libxext6 libxrender-dev \
pngquant \
poppler-utils \
tesseract-ocr \
tesseract-ocr-deu \
tesseract-ocr-eng \
unpaper \
zlib1g
- name: Install Ubuntu 18.04 packages
if: matrix.os == 'ubuntu-18.04'
run: |
sudo apt-get install -y --no-install-recommends \
libexempi3
- name: Install Ubuntu 20.04 packages
if: matrix.os == 'ubuntu-20.04'
run: |
sudo apt-get install -y --no-install-recommends \
libexempi8
- name: Install Python packages
run: |
python -m pip install -r requirements/main.txt -r requirements/test.txt .
- name: Report versions
run: |
tesseract --version
gs --version
pngquant --version
unpaper --version
img2pdf --version
- name: Test
run: |
python -m pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
files: ./coverage.xml
env_vars: OS,PYTHON
test_macos:
name: Test macOS
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [macos-latest]
python: ["3.9"]
env:
OS: ${{ matrix.os }}
PYTHON: ${{ matrix.python }}
steps:
- uses: actions/checkout@v2
with:
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
- uses: actions/setup-python@v2
name: Install Python
with:
python-version: ${{ matrix.python }}
- name: Install Homebrew deps
run: |
brew update
brew install \
exempi \
ghostscript \
jbig2enc \
leptonica \
openjpeg \
pngquant \
tesseract
- name: Install Python packages
run: |
python -m pip install --upgrade pip
python -m pip install -r requirements/main.txt -r requirements/test.txt .
- name: Report versions
run: |
tesseract --version
gs --version
pngquant --version
img2pdf --version
- name: Test
run: |
python -m pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
files: ./coverage.xml
env_vars: OS,PYTHON
test_windows:
name: Test Windows
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [windows-latest]
python: ["3.9"]
env:
OS: ${{ matrix.os }}
PYTHON: ${{ matrix.python }}
steps:
- uses: actions/checkout@v2
with:
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
- uses: actions/setup-python@v2
name: Install Python
with:
python-version: ${{ matrix.python }}
- name: Install system packages
run: |
choco install --yes --no-progress --pre tesseract
choco install --yes --no-progress ghostscript
choco install --yes --no-progress pngquant
- name: Install Python packages
run: |
python -m pip install --upgrade pip
python -m pip install -r requirements/main.txt -r requirements/test.txt .
- name: Test
run: |
python -m pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v1
with:
files: ./coverage.xml
env_vars: OS,PYTHON
wheel_sdist_linux:
name: Build sdist and wheels
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
- uses: actions/setup-python@v2
name: Install Python
with:
python-version: "3.6"
- name: Make wheels and sdist
run: |
python -m pip install --upgrade pip wheel
python setup.py sdist
python setup.py bdist_wheel
- uses: actions/upload-artifact@v2
with:
path: |
./dist/*.whl
./dist/*.tar.gz
upload_pypi:
name: Deploy artifacts to PyPI
needs: [wheel_sdist_linux, test_linux, test_macos, test_windows]
runs-on: ubuntu-latest
if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
steps:
- uses: actions/download-artifact@v2
with:
name: artifact
path: dist
- uses: pypa/gh-action-pypi-publish@master
with:
user: __token__
password: ${{ secrets.TOKEN_PYPI }}
# repository_url: https://test.pypi.org/legacy/
docker:
name: Build Docker images
needs: [wheel_sdist_linux, test_linux, test_macos, test_windows]
runs-on: ubuntu-latest
steps:
- name: Set image tag to release or branch
run: echo "DOCKER_IMAGE_TAG=${GITHUB_REF##*/}" >> $GITHUB_ENV
- name: If master, set to latest
run: echo 'DOCKER_IMAGE_TAG=latest' >> $GITHUB_ENV
if: env.DOCKER_IMAGE_TAG == 'master'
- name: Set Docker Hub repository to username
run: echo "DOCKER_REPOSITORY=jbarlow83" >> $GITHUB_ENV
- name: Set image name
run: echo "DOCKER_IMAGE_NAME=ocrmypdf" >> $GITHUB_ENV
- uses: actions/checkout@v2
with:
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
- name: Login to Docker Hub
uses: docker/login-action@v1
with:
username: jbarlow83
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set up QEMU
uses: docker/setup-qemu-action@v1
- name: Set up Docker Buildx
id: buildx
uses: docker/setup-buildx-action@v1
- name: Print image tag
run: echo "Building image ${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}"
- name: Build
run: |
docker buildx build \
--push \
--platform linux/arm64/v8,linux/amd64 \
--tag "${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}" \
--file .docker/Dockerfile .

60
.gitignore vendored
View File

@@ -1,44 +1,44 @@
# Development environment
.bash_history
.pylintrc
.pytest_cache/
.ruffus_history.sqlite
.venv/
*.pyc
*.sublime-*
# dotfiles
.*
!.coveragerc
!.dockerignore
!.git_archival.txt
!.gitattributes
!.gitignore
!.pre-commit-config.yaml
!.readthedocs.yml
!.github/
# Dev scratch
*.ipynb
**/*.pyc
/*.pdf
/*.qdf
/*.png
/scratch.py
IDEAS
log/
tests/resources/private/
tmp/
venv*/
/debug_tests.py
*.traineddata
/private
/coverage.xml
# Package building
.eggs/
*.egg-info/
build/
dist/
wheelhouse/
pip-wheel-metadata/
# Code coverage
htmlcov/
# Automatically generated files
docs/_build/
docs/_static/
docs/_templates/
docs/Makefile
ocrmypdf/lib/_*.py
# Code coverage
.coverage*
htmlcov/
# Testing
.ipynb_checkpoints/
.vscode/
*.ipynb
*.profile
/*.pdf
/*.qdf
/*.png
/scratch.py
IDEAS
log/
tests/output/
tests/resources/private/
tmp/
/debug_tests.py
*.traineddata

View File

@@ -1,6 +1,23 @@
repos:
- repo: https://github.com/ambv/black
rev: stable
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.4.0
hooks:
- id: black
language_version: python3.7
- id: check-case-conflict
- id: check-merge-conflict
- id: check-toml
- id: check-yaml
- id: debug-statements
- repo: https://github.com/asottile/seed-isort-config
rev: v2.2.0
hooks:
- id: seed-isort-config
- repo: https://github.com/pre-commit/mirrors-isort
rev: v5.7.0 # pick the isort version you'd like to use from https://github.com/pre-commit/mirrors-isort/releases
hooks:
- id: isort
- repo: https://github.com/psf/black
rev: 20.8b1
hooks:
- id: black
language_version: python
exclude: ^src/ocrmypdf/lib/_leptonica.py

View File

@@ -1,146 +0,0 @@
cache:
pip: true
directories:
- $HOME/Library/Caches/Homebrew
matrix:
include:
- os: linux
dist: trusty
sudo: required
language: python
python: "3.6"
env:
- DIST=trusty
addons: &trusty_apt
apt:
update: true
sources:
- sourceline: 'ppa:alex-p/tesseract-ocr'
- sourceline: 'ppa:heyarje/libav-11'
- sourceline: 'ppa:vshn/ghostscript'
packages:
- ghostscript
- libavcodec56
- libavformat56
- libavutil54
- libexempi3
- libffi-dev
- pngquant
- poppler-utils
- qpdf
- tesseract-ocr
- tesseract-ocr-deu
- tesseract-ocr-eng
- tesseract-ocr-fra
- os: linux
dist: xenial
sudo: required
language: python
python: "3.7"
env:
- DIST=xenial
addons:
apt:
update: true
sources:
- sourceline: 'ppa:alex-p/tesseract-ocr'
packages:
- ghostscript
- libexempi3
- libffi-dev
- pngquant
- poppler-utils
- qpdf
- tesseract-ocr
- tesseract-ocr-deu
- tesseract-ocr-eng
- tesseract-ocr-fra
- unpaper
- os: osx
osx_image: xcode9.2
language: generic
addons:
homebrew:
update: true
packages:
- exempi
- ghostscript
- jbig2enc
- leptonica
- openjpeg
- pngquant
- python
- qpdf
- tesseract
- unpaper
- os: osx
osx_image: xcode9.2
language: generic
env:
- ADD_PDFMINER=1
addons:
homebrew:
update: true
packages:
- exempi
- ghostscript
- jbig2enc
- leptonica
- openjpeg
- pngquant
- python
- qpdf
- tesseract
- unpaper
before_cache:
- rm -f $HOME/.cache/pip/log/debug.log
before_install: |
mkdir -p bin
if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
pip3 install --upgrade pip
pip3 install --upgrade wheel
if [[ "$DIST" == "trusty" ]]; then
mkdir -p packages
wget -q 'https://www.dropbox.com/s/vaq0kbwi6e6au80/unpaper_6.1-1.deb?raw=1' -O packages/unpaper_6.1-1.deb
sudo dpkg -i packages/unpaper_6.1-1.deb
fi
elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
pip3 install --upgrade pip
pip3 install wheel
fi
install:
- export PATH=$PWD/bin:$PATH
- pip3 install pycparser # py3.7 workaround for https://github.com/eliben/pycparser/issues/251
- pip3 install -r requirements/main.txt
- pip3 install --no-deps .
- |
if [[ "$ADD_PDFMINER" == "1" ]]; then
pip3 install --no-deps .[pdfminer]
fi
- pip3 install -r requirements/test.txt
script:
- tesseract --version
- qpdf --version
- pytest -n auto
deploy:
# release for main pypi
# 3.6 is considered the build leader and does the deploy, otherwise there is
# a race and all versions will try to deploy
# OTOH if we ever need separate binary wheels then each version needs its
# own deploy
- provider: pypi
user: ocrmypdf-travis
password:
secure: "DTFOmmNL6olA0+yXvp4u9jXZlZeqrJsJ0526jzqf4a3gZ6jnGTq5UI6WzRsslSyoMMfXKtHQebqHM6ogSgCZinyZ3ufHJo8fn9brxbEc2gsiWkbj5o3bGwdWMT1vNNE7XW0VCpw87rZ1EEwjl4FJHFudMlPR1yfU5+uq0k0PACo="
distributions: "sdist bdist_wheel"
on:
branch: master
tags: true
condition: $TRAVIS_PYTHON_VERSION == "3.6" && $TRAVIS_OS_NAME == "linux"
skip_upload_docs: true

1047
LICENSE
View File

File diff suppressed because it is too large Load Diff

View File

@@ -1,43 +0,0 @@
# requirements
recursive-include requirements *
# git
include .git_archival.txt
# docker
include .dockerignore
recursive-include .docker *
# tests
include .coveragerc
recursive-include tests *.bin
recursive-include tests *.jpg
recursive-include tests *.jsonl
recursive-include tests *.png
recursive-include tests *.pdf
recursive-include tests *.py
recursive-include tests *.rst
recursive-include tests *.txt
recursive-exclude tests/resources/private *
# documentation
include LICENSE
include *.rst
recursive-exclude .github *
recursive-include docs *.py
recursive-include docs *.rst
recursive-include docs *.svg
recursive-exclude docs/_build *
# support files
recursive-include src/ocrmypdf/data *
include *.py
exclude tasks.py
recursive-exclude .travis *
exclude .travis*
# code
exclude src/ocrmypdf/lib/_leptonica.py
exclude scratch.py

112
README.md
View File

@@ -1,15 +1,13 @@
OCRmyPDF
========
<img src="docs/images/logo.svg" width="240" alt="OCRmyPDF">
[![Travis build status][travis]](https://travis-ci.org/jbarlow83/OCRmyPDF) [![PyPI version][pypi]](https://pypi.org/project/ocrmypdf/) ![Homebrew version][homebrew] ![ReadTheDocs][docs]
[![Build Status](https://github.com/jbarlow83/OCRmyPDF/actions/workflows/build.yml/badge.svg)](https://github.com/jbarlow83/OCRmyPDF/actions/workflows/build.yml) [![PyPI version][pypi]](https://pypi.org/project/ocrmypdf/) ![Homebrew version][homebrew] ![ReadTheDocs][docs] ![Python versions][pyversions]
[azure]: https://dev.azure.com/jim0585/ocrmypdf/_apis/build/status/jbarlow83.OCRmyPDF?branchName=master
[travis]: https://travis-ci.org/jbarlow83/OCRmyPDF.svg?branch=master "Travis build status"
[pypi]: https://img.shields.io/pypi/v/ocrmypdf.svg "PyPI version"
[homebrew]: https://img.shields.io/homebrew/v/ocrmypdf.svg "Homebrew version"
[docs]: https://readthedocs.org/projects/ocrmypdf/badge/?version=latest "RTD"
[pyversions]: https://img.shields.io/pypi/pyversions/ocrmypdf "Supported Python versions"
OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched or copy-pasted.
@@ -27,15 +25,14 @@ ocrmypdf # it's a scriptable command line program
[See the release notes for details on the latest changes](https://ocrmypdf.readthedocs.io/en/latest/release_notes.html).
Main features
-------------
## Main features
- Generates a searchable [PDF/A](https://en.wikipedia.org/?title=PDF/A) file from a regular PDF
- Places OCR text accurately below the image to ease copy / paste
- Keeps the exact resolution of the original embedded images
- When possible, inserts OCR information as a "lossless" operation without disrupting any other content
- Optimizes PDF images, often producing files smaller than the input file
- If requested deskews and/or cleans the image before performing OCR
- If requested, deskews and/or cleans the image before performing OCR
- Validates input and output files
- Distributes work across all available CPU cores
- Uses [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) engine to recognize more than [100 languages](https://github.com/tesseract-ocr/tessdata)
@@ -44,10 +41,9 @@ Main features
For details: please consult the [documentation](https://ocrmypdf.readthedocs.io/en/latest/).
Motivation
----------
## Motivation
I searched the web for a free command line tool to OCR PDF files on Linux/UNIX: I found many, but none of them were really satisfying.
I searched the web for a free command line tool to OCR PDF files: I found many, but none of them were really satisfying:
- Either they produced PDF files with misplaced text under the image (making copy/paste impossible)
- Or they did not handle accents and multilingual characters
@@ -59,33 +55,23 @@ I searched the web for a free command line tool to OCR PDF files on Linux/UNIX:
...so I decided to develop my own tool.
Installation
------------
## Installation
Linux, UNIX, and macOS are supported. Windows is not directly supported but there is a Docker image available that runs on Windows.
Linux, Windows, macOS and FreeBSD are supported. Docker images are also available.
Users of Debian 9 or later or Ubuntu 16.10 or later may simply
```bash
apt-get install ocrmypdf
```
and users of Fedora 29 or later may simply
```bash
dnf install ocrmypdf
```
and macOS users with Homebrew may simply
```bash
brew install ocrmypdf
```
| Operating system | Install command |
| ----------------------------- | ------------------------------|
| Debian, Ubuntu | ``apt install ocrmypdf`` |
| Windows Subsystem for Linux | ``apt install ocrmypdf`` |
| Fedora | ``dnf install ocrmypdf`` |
| macOS | ``brew install ocrmypdf`` |
| LinuxBrew | ``brew install ocrmypdf`` |
| FreeBSD | ``pkg install py37-ocrmypdf`` |
| Conda | ``conda install ocrmypdf`` |
For everyone else, [see our documentation](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for installation steps.
Languages
---------
## Languages
OCRmyPDF uses Tesseract for OCR, and relies on its language packs. For Linux users, you can often find packages that provide language packs:
@@ -94,15 +80,20 @@ OCRmyPDF uses Tesseract for OCR, and relies on its language packs. For Linux use
apt-cache search tesseract-ocr
# Debian/Ubuntu users
apt-get install tesseract-ocr-chi-sim # Example: Install Chinese Simplified language back
apt-get install tesseract-ocr-chi-sim # Example: Install Chinese Simplified language pack
# Arch Linux users
pacman -S tesseract-data-eng tesseract-data-deu # Example: Install the English and German language packs
# brew macOS users
brew install tesseract-lang
```
You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple languages can be requested.
Documentation and support
-------------------------
## Documentation and support
Once ocrmypdf is installed, the built-in help which explains the command syntax and options can be accessed via:
Once OCRmyPDF is installed, the built-in help which explains the command syntax and options can be accessed via:
```bash
ocrmypdf --help
@@ -110,42 +101,37 @@ ocrmypdf --help
Our [documentation is served on Read the Docs](https://ocrmypdf.readthedocs.io/en/latest/index.html).
If you detect an issue, please:
Please report issues on our [GitHub issues](https://github.com/jbarlow83/OCRmyPDF/issues) page, and follow the issue template for a quick response.
- Check whether your issue is already known
- If no problem report exists on github, please create one here: <https://github.com/jbarlow83/OCRmyPDF/issues>
- Describe your problem thoroughly
- Append the console output of the script when running the debug mode (`-v 1` option)
- If possible provide your input PDF file as well as the content of the temporary folder (using a file sharing service like Dropbox)
## Requirements
Requirements
------------
In addition to the required Python version (3.6+), OCRmyPDF requires external program installations of Ghostscript, Tesseract OCR, QPDF, and Leptonica. OCRmyPDF is pure Python, but uses CFFI to portably generate library bindings. OCRmyPDF works on pretty much everything: Linux, macOS, Windows and FreeBSD.
Runs on CPython 3.5, 3.6 and 3.7. Requires external program installations of Ghostscript, Tesseract OCR, QPDF, and Leptonica. ocrmypdf is pure Python, but uses CFFI to portably generate library bindings.
Press & Media
-------------
## Press & Media
- [Going paperless with OCRmyPDF](https://medium.com/@ikirichenko/going-paperless-with-ocrmypdf-e2f36143f46a)
- [Converting a scanned document into a compressed searchable PDF with redactions](https://medium.com/@treyharris/converting-a-scanned-document-into-a-compressed-searchable-pdf-with-redactions-63f61c34fe4c)
- [c't 1-2014, page 59](http://heise.de/-2279695): Detailed presentation of OCRmyPDF v1.0 in the leading German IT magazine c't
- [heise Open Source, 09/2014: Texterkennung mit OCRmyPDF](http://heise.de/-2356670)
- [c't 1-2014, page 59](https://heise.de/-2279695): Detailed presentation of OCRmyPDF v1.0 in the leading German IT magazine c't
- [heise Open Source, 09/2014: Texterkennung mit OCRmyPDF](https://heise.de/-2356670)
- [heise Durchsuchbare PDF-Dokumente mit OCRmyPDF erstellen](https://www.heise.de/ratgeber/Durchsuchbare-PDF-Dokumente-mit-OCRmyPDF-erstellen-4607592.html)
- [Excellent Utilities: OCRmyPDF](https://www.linuxlinks.com/excellent-utilities-ocrmypdf-add-ocr-text-layer-scanned-pdfs/)
Business enquiries
------------------
## Business enquiries
OCRmyPDF would not be the software that it is today is without companies and users choosing to provide support for feature development and consulting enquiries. We are happy to discuss all enquiries, whether for extending the existing feature set, or integrating OCRmyPDF into a larger system.
OCRmyPDF would not be the software that it is today without companies and users choosing to provide support for feature development and consulting enquiries. We are happy to discuss all enquiries, whether for extending the existing feature set, or integrating OCRmyPDF into a larger system.
License
-------
## License
The OCRmyPDF software is licensed under the GNU GPLv3. Certain files are covered by other licenses, as noted in their source files.
The OCRmyPDF software is licensed under the Mozilla Public License 2.0
(MPL-2.0). This license permits integration of OCRmyPDF with other code,
including commercial and closed source, but asks you to publish source-level
modifications you make to OCRmyPDF.
The license for each test file varies, and is noted in tests/resources/README.rst. The documentation is licensed under Creative Commons Attribution-ShareAlike 4.0 (CC-BY-SA 4.0).
Some components of OCRmyPDF have other licenses, as noted in those files and the
``debian/copyright`` file. Most files in ``misc/`` use the MIT license, and the
documentation and test files are generally licensed under Creative Commons
ShareAlike 4.0 (CC-BY-SA 4.0).
OCRmyPDF versions prior to 6.0 were distributed under the MIT License.
Disclaimer
----------
## Disclaimer
The software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

743
debian/copyright vendored
View File

@@ -2,35 +2,70 @@ Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: OCRmyPDF
Upstream-Contact: James R. Barlow <barlow.jim@gmail.com>
Source: https://github.com/jbarlow83/OCRmyPDF
Files-Excluded: tests/resources/milk.pdf
Files: *
Copyright:
(C) 2013-2017 The OCRmyPDF Authors
(C) 2013-2018 James R. Barlow
License: GPL-3+
(C) 2013-2015 Julien Pfefferkorn
(C) 2015-2020 James R. Barlow
(C) 2019 Martin Wind
License: MPL-2.0
Files: misc/*
Copyright:
(C) 2020 James R. Barlow
License: Expat
Files: misc/completion/ocrmypdf.bash
Copyright:
(C) 2019 Frank Pille
(C) 2020 Alex Willner
License: Expat
Files: misc/completion/ocrmypdf.fish
Copyright:
(C) 2020 James R. Barlow
License: Expat
Files: misc/batch.py
Copyright:
(C) 2016 findingorder: https://github.com/findingorder
License: Expat
Files: misc/synology.py
Copyright:
(C) github.com/Enantiomerie
License: Expat
Files: misc/watcher.py
Copyright:
(C) 2019 Ian Alexander: https://github.com/ianalexander
(C) 2020 James R. Barlow
License: Expat
Files: misc/webservice.py
Copyright: (C) 2019 James R. Barlow
License: AGPL-3+
Files: docs tests/resources/*
Copyright: (C) 2013-2018 James R. Barlow
License: CC-BY-SA-4.0
Files: docs/images/bitmap_vs_svg.svg
Copyright: (C) 2006 Yug
License: CC-BY-SA-2.5
Files: src/ocrmypdf/hocrtransform.py
Copyright: (C) 2010 Jonathan Brinley <jonathanbrinley@gmail.com>
(C) 2013-14 Julien Pfefferkorn
(C) 2015-16 James R. Barlow
License: Expat
Files: src/ocrmypdf/pdfa.py
Copyright: (C) 2015 James R. Barlow
(C) 1986-2017 The authors of GhostScript
License: GPL-3+
Files: src/ocrmypdf/_unicodefun.py
Copyright: (C) 2014 Armin Ronacher
(C) 2017 James R. Barlow
License: BSD-3-clause
Files: tests/spoof/*
Files: tests/plugins/*
Copyright: (C) 2016-2018 James R. Barlow
License: Expat
@@ -82,12 +117,17 @@ License: CC-BY-SA-3.0
Files: tests/resources/typewriter.png tests/resources/2400dpi.pdf
Copyright: (C) 2005 Ellywa
License: GFDL-1.2+ or CC-BY-SA-1.0 or CC-BY-SA-2.0 or CC-BY-SA-2.5 or CC-BY-SA-3.0
Comment:
Obtained from: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif
Files: tests/resources/overlay.pdf
Copyright: (C) 2017 Max Anderson
License: Expat
Files: tests/resources/baiona*.png
Files:
tests/resources/baiona*.png
tests/resources/baiona*.jpg
tests/resources/link.pdf
Copyright: (C) 2014 Euskaldunaa
License: CC-BY-SA-4.0
@@ -95,11 +135,12 @@ Files: tests/resources/vector.pdf
Copyright: (C) 2018 Catscratch
License: Expat
Files: tests/resources/enron*.pdf
Copyright: EnronData.org
License: CC-BY-3.0
See: https://enrondata.readthedocs.io/en/latest/data/edo-enron-email-pst-dataset/
Comment: Unprocessed.
Files: tests/resources/3small.pdf
Copyright: (C) 2014 Euskaldunaa
(C) 2017 James R. Barlow
(C) 2005 Ellywa
License: CC-BY-SA-4.0 and (GFDL-1.2+ or CC-BY-SA-1.0 or CC-BY-SA-2.0 or CC-BY-SA-2.5 or CC-BY-SA-3.0)
Comment: concatenation of baiona_gray.png, crom.png and typewriter.png/2400dpi.pdf
Files: src/ocrmypdf/data/sRGB.icc
Copyright: Kai-Uwe Behrmann <www.behrmann.name>
@@ -113,6 +154,13 @@ Files: debian/*
Copyright: (C) 2016 Sean Whitton <spwhitton@spwhitton.name>
License: GPL-3+
License: MPL-2.0
This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0.
.
On Debian systems the full text of the MPL-2.0 can be found in
/usr/share/common-licenses/MPL-2.0.
License: GPL-3+
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -130,6 +178,669 @@ License: GPL-3+
On Debian systems, the complete text of the GNU General
Public License version 3 can be found in "/usr/share/common-licenses/GPL-3".
License: AGPL-3+
GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007
.
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
.
Preamble
.
The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.
.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.
.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
.
Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.
.
A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate. Many developers of free software are heartened and
encouraged by the resulting cooperation. However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.
.
The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community. It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server. Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.
.
An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals. This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.
.
The precise terms and conditions for copying, distribution and
modification follow.
.
TERMS AND CONDITIONS
.
0. Definitions.
.
"This License" refers to version 3 of the GNU Affero General Public License.
.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
.
A "covered work" means either the unmodified Program or a work based
on the Program.
.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
.
1. Source Code.
.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
.
The Corresponding Source for a work in source code form is that
same work.
.
2. Basic Permissions.
.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
.
4. Conveying Verbatim Copies.
.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
.
5. Conveying Modified Source Versions.
.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
.
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
.
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
.
6. Conveying Non-Source Forms.
.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
.
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
.
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
.
7. Additional Terms.
.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
.
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
.
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
.
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
.
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
.
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
.
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
.
8. Termination.
.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
.
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
.
9. Acceptance Not Required for Having Copies.
.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
.
10. Automatic Licensing of Downstream Recipients.
.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
.
11. Patents.
.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
.
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
.
12. No Surrender of Others' Freedom.
.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
.
13. Remote Network Interaction; Use with the GNU General Public License.
.
Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software. This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.
.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the work with which it is combined will remain governed by version
3 of the GNU General Public License.
.
14. Revised Versions of this License.
.
The Free Software Foundation may publish revised and/or new versions of
the GNU Affero General Public License from time to time. Such new versions
will be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU Affero General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU Affero General Public License, you may choose any version ever published
by the Free Software Foundation.
.
If the Program specifies that a proxy can decide which future
versions of the GNU Affero General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
.
15. Disclaimer of Warranty.
.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
.
16. Limitation of Liability.
.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
.
17. Interpretation of Sections 15 and 16.
.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
.
END OF TERMS AND CONDITIONS
.
How to Apply These Terms to Your New Programs
.
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
.
Also add information on how to contact you by electronic and paper mail.
.
If your software can interact with users remotely through a computer
network, you should also make sure that it provides a way for users to
get its source. For example, if your program is a web application, its
interface could display a "Source" link that leads users to an archive
of the code. There are many ways you could offer source, and different
solutions will be better for different programs; see section 13 for the
specific requirements.
.
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see
<http://www.gnu.org/licenses/>.
License: Expat
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the

View File

@@ -1,16 +1,34 @@
=================
Advanced features
=================
Control of unpaper
------------------
==================
OCRmyPDF uses ``unpaper`` to provide the implementation of the ``--clean`` and ``--clean-final`` arguments. `unpaper <https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md>`_ provides a variety of image processing filters to improve images.
OCRmyPDF uses ``unpaper`` to provide the implementation of the
``--clean`` and ``--clean-final`` arguments.
`unpaper <https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md>`__
provides a variety of image processing filters to improve images.
By default, OCRmyPDF uses only ``unpaper`` arguments that were found to be safe to use on almost all files without having to inspect every page of the file afterwards. This is particularly true when only ``--clean`` is used, since that instructs OCRmyPDF to only clean the image before OCR and not the final image.
By default, OCRmyPDF uses only ``unpaper`` arguments that were found to
be safe to use on almost all files without having to inspect every page
of the file afterwards. This is particularly true when only ``--clean``
is used, since that instructs OCRmyPDF to only clean the image before
OCR and not the final image.
However, if you wish to use the more aggressive options in ``unpaper``, you may use ``--unpaper-args '...'`` to override the OCRmyPDF's defaults and forward other arguments to unpaper. This option will forward arguments to ``unpaper`` without any knowledge of what that program considers to be valid arguments. The string of arguments must be quoted as shown in the examples below. No filename arguments may be included. OCRmyPDF will assume it can append input and output filename of intermediate images to the ``--unpaper-args`` string.
However, if you wish to use the more aggressive options in ``unpaper``,
you may use ``--unpaper-args '...'`` to override the OCRmyPDF's defaults
and forward other arguments to unpaper. This option will forward
arguments to ``unpaper`` without any knowledge of what that program
considers to be valid arguments. The string of arguments must be quoted
as shown in the examples below. No filename arguments may be included.
OCRmyPDF will assume it can append input and output filename of
intermediate images to the ``--unpaper-args`` string.
In this example, we tell ``unpaper`` to expect two pages of text on a sheet (image), such as occurs when two facing pages of a book are scanned. ``unpaper`` uses this information to deskew each independently and clean up the margins of both.
In this example, we tell ``unpaper`` to expect two pages of text on a
sheet (image), such as occurs when two facing pages of a book are
scanned. ``unpaper`` uses this information to deskew each independently
and clean up the margins of both.
.. code-block:: bash
@@ -19,40 +37,71 @@ In this example, we tell ``unpaper`` to expect two pages of text on a sheet (ima
.. warning::
Some ``unpaper`` features will reposition text within the image. ``--clean-final`` is recommended to avoid this issue.
Some ``unpaper`` features will reposition text within the image.
``--clean-final`` is recommended to avoid this issue.
.. warning::
Some ``unpaper`` features cause multiple input or output files to be consumed or produced. OCRmyPDF requires ``unpaper`` to consume one file and produce one file. An deviation from that condition will result in errors.
Some ``unpaper`` features cause multiple input or output files to be
consumed or produced. OCRmyPDF requires ``unpaper`` to consume one
file and produce one file. Any deviation from that condition will
result in errors.
.. note::
``unpaper`` uses uncompressed PBM/PGM/PPM files for its intermediate files. For large images or documents, it can take a lot of temporary disk space.
``unpaper`` uses uncompressed PBM/PGM/PPM files for its intermediate
files. For large images or documents, it can take a lot of temporary
disk space.
Control of OCR options
----------------------
======================
OCRmyPDF provides many features to control the behavior of the OCR engine, Tesseract.
OCRmyPDF provides many features to control the behavior of the OCR
engine, Tesseract.
When OCR is skipped
"""""""""""""""""""
-------------------
If a page in a PDF seems to have text, by default OCRmyPDF will exit without modifying the PDF. This is to ensure that PDFs that were previously OCRed or were "born digital" rather than scanned are not processed.
If a page in a PDF seems to have text, by default OCRmyPDF will exit
without modifying the PDF. This is to ensure that PDFs that were
previously OCRed or were "born digital" rather than scanned are not
processed.
If ``--skip-text`` is issued, then no OCR will be performed on pages that already have text. The page will be copied to the output. This may be useful for documents that contain both "born digital" and scanned content, or to use OCRmyPDF to normalize and convert to PDF/A regardless of their contents.
If ``--skip-text`` is issued, then no OCR will be performed on pages
that already have text. The page will be copied to the output. This may
be useful for documents that contain both "born digital" and scanned
content, or to use OCRmyPDF to normalize and convert to PDF/A regardless
of their contents.
If ``--redo-ocr`` is issued, then a detailed text analysis is performed. Text is categorized as either visible or invisible. Invisible text (OCR) is stripped out. Then an image of each page is created with visible text masked out. The page image is sent for OCR, and any additional text is inserted as OCR. If a file contains a mix of text and bitmap images that contain text, OCRmyPDF will locate the additional text in images without disrupting the existing text.
If ``--redo-ocr`` is issued, then a detailed text analysis is performed.
Text is categorized as either visible or invisible. Invisible text (OCR)
is stripped out. Then an image of each page is created with visible text
masked out. The page image is sent for OCR, and any additional text is
inserted as OCR. If a file contains a mix of text and bitmap images that
contain text, OCRmyPDF will locate the additional text in images without
disrupting the existing text.
If ``--force-ocr`` is issued, then all pages will be rasterized to images, discarding any hidden OCR text, and rasterizing any printable text. This is useful for redoing OCR, for fixing OCR text with a damaged character map (text is selectable but not searchable), and destroying redacted information. Any forms and vector graphics will be rasterized as well.
If ``--force-ocr`` is issued, then all pages will be rasterized to
images, discarding any hidden OCR text, and rasterizing any printable
text. This is useful for redoing OCR, for fixing OCR text with a damaged
character map (text is selectable but not searchable), and destroying
redacted information. Any forms and vector graphics will be rasterized
as well.
Time and image size limits
""""""""""""""""""""""""""
--------------------------
By default, OCRmyPDF permits tesseract to run for three minutes (180 seconds) per page. This is usually more than enough time to find all text on a reasonably sized page with modern hardware.
By default, OCRmyPDF permits tesseract to run for three minutes (180
seconds) per page. This is usually more than enough time to find all
text on a reasonably sized page with modern hardware.
If a page is skipped, it will be inserted without OCR. If preprocessing was requested, the preprocessed image layer will be inserted.
If a page is skipped, it will be inserted without OCR. If preprocessing
was requested, the preprocessed image layer will be inserted.
If you want to adjust the amount of time spent on OCR, change ``--tesseract-timeout``. You can also automatically skip images that exceed a certain number of megapixels with ``--skip-big``. (A 300 DPI, 8.5×11" page is 8.4 megapixels.)
If you want to adjust the amount of time spent on OCR, change
``--tesseract-timeout``. You can also automatically skip images that
exceed a certain number of megapixels with ``--skip-big``. (A 300 DPI,
8.5×11" page is 8.4 megapixels.)
.. code-block:: bash
@@ -60,21 +109,26 @@ If you want to adjust the amount of time spent on OCR, change ``--tesseract-time
ocrmypdf --tesseract-timeout 300 --skip-big 50 bigfile.pdf output.pdf
Overriding default tesseract
""""""""""""""""""""""""""""
----------------------------
OCRmyPDF checks the system ``PATH`` for the ``tesseract`` binary.
Some relevant environment variables that influence Tesseract's behavior include:
Some relevant environment variables that influence Tesseract's behavior
include:
.. envvar:: TESSDATA_PREFIX
Overrides the path to Tesseract's data files. This can allow simultaneous installation of the "best" and "fast" training data sets. OCRmyPDF does not manage this environment variable.
Overrides the path to Tesseract's data files. This can allow
simultaneous installation of the "best" and "fast" training data
sets. OCRmyPDF does not manage this environment variable.
.. envvar:: OMP_THREAD_LIMIT
Controls the number of threads Tesseract will use. OCRmyPDF will manage this environment if it is not already set. (Currently, it will set it to 1 because this gives the best results in testing.)
Controls the number of threads Tesseract will use. OCRmyPDF will
manage this environment variable if it is not already set.
For example, if you have a development build of Tesseract don't wish to use the system installation, you can launch OCRmyPDF as follows:
For example, if you have a development build of Tesseract and don't wish to
use the system installation, you can launch OCRmyPDF as follows:
.. code-block:: bash
@@ -83,26 +137,34 @@ For example, if you have a development build of Tesseract don't wish to use the
TESSDATA_PREFIX=/home/user/src/tesseract \
ocrmypdf input.pdf output.pdf
In this example ``TESSDATA_PREFIX`` is required to redirect Tesseract to an alternate folder for its "tessdata" files.
In this example ``TESSDATA_PREFIX`` is required to redirect Tesseract to
an alternate folder for its "tessdata" files.
Overriding other support programs
"""""""""""""""""""""""""""""""""
---------------------------------
In addition to tesseract, OCRmyPDF uses the following external binaries:
* ``gs`` (Ghostscript)
* ``unpaper``
* ``qpdf``
In each case OCRmyPDF will search the ``PATH`` environment variable to locate the binaries.
- ``gs`` (Ghostscript)
- ``unpaper``
- ``pngquant``
- ``jbig2``
In each case OCRmyPDF will search the ``PATH`` environment variable to
locate the binaries.
Changing tesseract configuration variables
""""""""""""""""""""""""""""""""""""""""""
------------------------------------------
You can override tesseract's default `control parameters <https://github.com/tesseract-ocr/tesseract/wiki/ControlParams>`_ with a configuration file.
You can override tesseract's default `control
parameters <https://github.com/tesseract-ocr/tesseract/wiki/ControlParams>`__
with a configuration file.
As an example, this configuration will disable Tesseract's dictionary for current language. Normally the dictionary is helpful for interpolating words that are unclear, but it may interfere with OCR if the document does not contain many words (for example, a list of part numbers).
As an example, this configuration will disable Tesseract's dictionary
for current language. Normally the dictionary is helpful for
interpolating words that are unclear, but it may interfere with OCR if
the document does not contain many words (for example, a list of part
numbers).
Create a file named "no-dict.cfg" with these contents:
@@ -120,11 +182,11 @@ then run ocrmypdf as follows (along with any other desired arguments):
.. warning::
Some combinations of control parameters will break Tesseract or break assumptions that OCRmyPDF makes about Tesseract's output.
Some combinations of control parameters will break Tesseract or break
assumptions that OCRmyPDF makes about Tesseract's output.
Changing the PDF renderer
-------------------------
=========================
rasterizing
Converting a PDF to an image for display.
@@ -132,42 +194,63 @@ rasterizing
rendering
Creating a new PDF from other data (such as an existing PDF).
OCRmyPDF has these PDF renderers: ``sandwich`` and ``hocr``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` always selects ``sandwich``.
OCRmyPDF has these PDF renderers: ``sandwich`` and ``hocr``. The
renderer may be selected using ``--pdf-renderer``. The default is
``auto`` which lets OCRmyPDF select the renderer to use. Currently,
``auto`` always selects ``sandwich``.
The ``sandwich`` renderer
"""""""""""""""""""""""""
-------------------------
The ``sandwich`` renderer uses Tesseract's new text-only PDF feature, which produces a PDF page that lays out the OCR in invisible text. This page is then "sandwiched" onto the original PDF page, allowing lossless application of OCR even to PDF pages that contain other vector objects.
The ``sandwich`` renderer uses Tesseract's new text-only PDF feature,
which produces a PDF page that lays out the OCR in invisible text. This
page is then "sandwiched" onto the original PDF page, allowing lossless
application of OCR even to PDF pages that contain other vector objects.
Currently this is the best renderer for most uses, however it is implemented in Tesseract so OCRmyPDF cannot influence it. Currently some problematic PDF viewers like Mozilla PDF.js and macOS Preview have problems with segmenting its text output, and mightrunseveralwordstogether.
Currently this is the best renderer for most uses, however it is
implemented in Tesseract so OCRmyPDF cannot influence it. Currently some
problematic PDF viewers like Mozilla PDF.js and macOS Preview have
problems with segmenting its text output, and
mightrunseveralwordstogether.
When image preprocessing features like ``--deskew`` are used, the original PDF will be rendered as a full page and the OCR layer will be placed on top.
When image preprocessing features like ``--deskew`` are used, the
original PDF will be rendered as a full page and the OCR layer will be
placed on top.
The ``hocr`` renderer
"""""""""""""""""""""
---------------------
The ``hocr`` renderer works with older versions of Tesseract. The image layer is copied from the original PDF page if possible, avoiding potentially lossy transcoding or loss of other PDF information. If preprocessing is specified, then the image layer is a new PDF.
The ``hocr`` renderer works with older versions of Tesseract. The image
layer is copied from the original PDF page if possible, avoiding
potentially lossy transcoding or loss of other PDF information. If
preprocessing is specified, then the image layer is a new PDF.
Unlike ``sandwich`` this renderer is implemented within OCRmyPDF; anyone looking to customize how OCR is presented should look here. A major disadvantage of this renderer is it not capable of correctly handling text outside the Latin alphabet. Pull requests to improve the situation are welcome.
Unlike ``sandwich`` this renderer is implemented within OCRmyPDF; anyone
looking to customize how OCR is presented should look here. A major
disadvantage of this renderer is that it is not capable of correctly handling
text outside the Latin alphabet. Pull requests to improve the situation
are welcome.
Currently, this renderer has the best compatibility with Mozilla's PDF.js viewer.
Currently, this renderer has the best compatibility with Mozilla's
PDF.js viewer.
This works in all versions of Tesseract.
The ``tesseract`` renderer
""""""""""""""""""""""""""
--------------------------
The ``tesseract`` renderer was removed. OCRmyPDF's new approach to text layer grafting makes it functionally equivalent to ``sandwich``.
The ``tesseract`` renderer was removed. OCRmyPDF's new approach to text
layer grafting makes it functionally equivalent to ``sandwich``.
Return code policy
------------------
==================
OCRmyPDF writes all messages to ``stderr``. ``stdout`` is reserved for piping
output files. ``stdin`` is reserved for piping input files.
OCRmyPDF writes all messages to ``stderr``. ``stdout`` is reserved for
piping output files. ``stdin`` is reserved for piping input files.
The return codes generated by the OCRmyPDF are considered part of the stable
user interface. They may be imported from ``ocrmypdf.exceptions``.
The return codes generated by OCRmyPDF are considered part of the
stable user interface. They may be imported from
``ocrmypdf.exceptions``.
.. list-table:: Return codes
:widths: 5 35 60
@@ -218,22 +301,44 @@ user interface. They may be imported from ``ocrmypdf.exceptions``.
Debugging the intermediate files
--------------------------------
================================
OCRmyPDF normally saves its intermediate results to a temporary folder and deletes this folder when it exits, whether it succeeded or failed.
OCRmyPDF normally saves its intermediate results to a temporary folder
and deletes this folder when it exits, whether it succeeded or failed.
If the ``-k`` argument is issued on the command line, OCRmyPDF will keep the temporary folder and print the location, whether it succeeded or failed (provided the Python interpreter did not crash). An example message is:
If the ``-k`` argument is issued on the command line, OCRmyPDF will keep
the temporary folder and print the location, whether it succeeded or
failed (provided the Python interpreter did not crash). An example
message is:
.. code-block:: none
Temporary working files saved at:
/tmp/com.github.ocrmypdf.u20wpz07
Temporary working files retained at:
/tmp/ocrmypdf.io.u20wpz07
The organization of this folder is an implementation detail and subject to change between releases. However the general organization is that working files on a per page basis have the page number as a prefix (starting with page 1), an infix indicates the processing stage, and a suffix indicates the file type. Some important files include:
The organization of this folder is an implementation detail and subject
to change between releases. However the general organization is that
working files on a per page basis have the page number as a prefix
(starting with page 1), an infix indicates the processing stage, and a
suffix indicates the file type. Some important files include:
* ``.page.png`` - what the input page looks like
* ``.image`` - the image we will show the user if we are in a mode that changes the final appearance; may be in one of several image formats
* ``.text.pdf`` - the OCR file; this will load as a blank page but should have visible text if checked with a tool like pdftotext or pdfminer.six
* ``.ocr.png`` - the file that is sent to Tesseract for OCR; depending on arguments this may differ from the presentation image
* ``layers.rendered.pdf`` - the composite PDF, before metadata repair and optimization
* ``images/*`` - images extracted during the optimization process; here the prefix indicates a PDF object ID not a page number
- ``_rasterize.png`` - what the input page looks like
- ``_ocr.png`` - the file that is sent to Tesseract for OCR; depending
on arguments this may differ from the presentation image
- ``_pp_deskew.png`` - the image, after deskewing
- ``_pp_clean.png`` - the image, after cleaning with unpaper
- ``_ocr_tess.pdf`` - the OCR file; appears as a blank page with invisible
text embedded
- ``_ocr_tess.txt`` - the OCR text (not necessarily all text on the page,
if the page is mixed format)
- ``fix_docinfo.pdf`` - a temporary file created to fix the PDF DocumentInfo
data structure
- ``graft_layers.pdf`` - the rendered PDF with OCR layers grafted on
- ``pdfa.pdf`` - ``graft_layers.pdf`` after conversion to PDF/A
- ``pdfa.ps`` - a PostScript file used by Ghostscript for PDF/A conversion
- ``optimize.pdf`` - the PDF generated before optimization
- ``optimize.out.pdf`` - the PDF generated by optimization
- ``origin`` - the input file
- ``origin.pdf`` - the input file or the input image converted to PDF
- ``images/*`` - images extracted during the optimization process; here
the prefix indicates a PDF object ID not a page number

117
docs/api.rst Normal file
View File

@@ -0,0 +1,117 @@
======================
Using the OCRmyPDF API
======================
OCRmyPDF originated as a command line program and continues to have this
legacy, but parts of it can be imported and used in other Python
applications.
Some applications may want to consider running ocrmypdf from a
subprocess call anyway, as this provides isolation of its activities.
Example
=======
OCRmyPDF provides one high-level function to run its main engine from an
application. The parameters are symmetric to the command line arguments
and largely have the same functions.
.. code-block:: python
import ocrmypdf
if __name__ == '__main__': # To ensure correct behavior on Windows and macOS
ocrmypdf.ocr('input.pdf', 'output.pdf', deskew=True)
With a few exceptions, all of the command line arguments are available
and may be passed as equivalent keywords.
A few differences are that ``verbose`` and ``quiet`` are not available.
Instead, output should be managed by configuring logging.
Parent process requirements
---------------------------
The :func:`ocrmypdf.ocr` function runs OCRmyPDF similar to command line
execution. To do this, it will:
- create a monitoring thread
- create worker processes (on Linux, forking itself; on Windows and macOS, by
spawning)
- manage the signal flags of its worker processes
- execute other subprocesses (forking and executing other programs)
The Python process that calls ``ocrmypdf.ocr()`` must be sufficiently
privileged to perform these actions.
There is currently no option to manage how jobs are scheduled other
than the argument ``jobs=`` which will limit the number of worker
processes.
Creating a child process to call ``ocrmypdf.ocr()`` is suggested. That
way your application will survive and remain interactive even if
OCRmyPDF fails for any reason.
Programs that call ``ocrmypdf.ocr()`` should also install a SIGBUS signal
handler (except on Windows), to raise an exception if access to a memory
mapped file fails. OCRmyPDF may use memory mapping.
``ocrmypdf.ocr()`` will take a threading lock to prevent multiple runs of itself
in the same Python interpreter process. This is not thread-safe, because of how
OCRmyPDF's plugins and Python's library import system work. If you need to parallelize
OCRmyPDF, use processes.
.. warning::
On Windows and macOS, the script that calls ``ocrmypdf.ocr()`` must be
protected by an "ifmain" guard (``if __name__ == '__main__'``). If you do
not take this step, process semantics will prevent
OCRmyPDF from working correctly.
Logging
-------
OCRmyPDF will log under loggers named ``ocrmypdf``. In addition, it
imports ``pdfminer`` and ``PIL``, both of which post log messages under
those logging namespaces.
You can configure the logging as desired for your application or call
:func:`ocrmypdf.configure_logging` to configure logging the same way
OCRmyPDF itself does. The command line parameters such as ``--quiet``
and ``--verbose`` have no equivalents in the API; you must use the
provided configuration function or do configuration in a way that suits
your use case.
Progress monitoring
-------------------
OCRmyPDF uses the ``tqdm`` package to implement its progress bars.
:func:`ocrmypdf.configure_logging` will set up logging output to
``sys.stderr`` in a way that is compatible with the display of the
progress bar. Use ``ocrmypdf.ocr(...progress_bar=False)`` to disable
the progress bar.
Exceptions
----------
OCRmyPDF may throw standard Python exceptions, ``ocrmypdf.exceptions.*``
exceptions, some exceptions related to multiprocessing, and
``KeyboardInterrupt``. The parent process should provide an exception
handler. OCRmyPDF will clean up its temporary files and worker processes
automatically when an exception occurs.
Programs that call OCRmyPDF should consider trapping KeyboardInterrupt
so that they allow OCR to terminate without the whole program terminating.
When OCRmyPDF succeeds conditionally, it returns an integer exit code.
Reference
---------
.. autofunction:: ocrmypdf.ocr
.. autoclass:: ocrmypdf.Verbosity
:members:
:undoc-members:
.. autofunction:: ocrmypdf.configure_logging

55
docs/apiref.rst Normal file
View File

@@ -0,0 +1,55 @@
=============
API Reference
=============
This page summarizes the rest of the public API. Generally speaking this
should mainly be of interest to plugin developers.
ocrmypdf
========
.. autoclass:: ocrmypdf.PageContext
:members:
.. autoclass:: ocrmypdf.PdfContext
:members:
ocrmypdf.exceptions
===================
.. automodule:: ocrmypdf.exceptions
:members:
:undoc-members:
ocrmypdf.helpers
================
.. automodule:: ocrmypdf.helpers
:members:
:noindex: deprecated
.. autodecorator:: deprecated
ocrmypdf.hocrtransform
======================
.. automodule:: ocrmypdf.hocrtransform
:members:
ocrmypdf.pdfa
=============
.. automodule:: ocrmypdf.pdfa
:members:
ocrmypdf.quality
================
.. automodule:: ocrmypdf.quality
:members:
ocrmypdf.subprocess
===================
.. automodule:: ocrmypdf.subprocess
:members:

View File

@@ -1,225 +1,217 @@
================
Batch processing
================
This article provides information about running OCRmyPDF on multiple files or configuring it as a service triggered by file system events.
This article provides information about running OCRmyPDF on multiple
files or configuring it as a service triggered by file system events.
Batch jobs
----------
==========
Consider using the excellent `GNU Parallel <https://www.gnu.org/software/parallel/>`_ to apply OCRmyPDF to multiple files at once.
Consider using the excellent `GNU
Parallel <https://www.gnu.org/software/parallel/>`__ to apply OCRmyPDF
to multiple files at once.
Both ``parallel`` and ``ocrmypdf`` will try to use all available processors. To maximize parallelism without overloading your system with processes, consider using ``parallel -j 2`` to limit parallel to running two jobs at once.
Both ``parallel`` and ``ocrmypdf`` will try to use all available
processors. To maximize parallelism without overloading your system with
processes, consider using ``parallel -j 2`` to limit parallel to running
two jobs at once.
This command will run all ocrmypdf all files named ``*.pdf`` in the current directory and write them to the previous created ``output/`` folder. It will not search subdirectories.
This command will run ocrmypdf on all files named ``*.pdf`` in the
current directory and write them to the previously created ``output/``
folder. It will not search subdirectories.
The ``--tag`` argument tells parallel to print the filename as a prefix whenever a message is printed, so that one can trace any errors to the file that produced them.
The ``--tag`` argument tells parallel to print the filename as a prefix
whenever a message is printed, so that one can trace any errors to the
file that produced them.
.. code-block:: bash
parallel --tag -j 2 ocrmypdf '{}' 'output/{}' ::: *.pdf
parallel --tag -j 2 ocrmypdf '{}' 'output/{}' ::: *.pdf
OCRmyPDF automatically repairs PDFs before parsing and gathering information from them.
OCRmyPDF automatically repairs PDFs before parsing and gathering
information from them.
Directory trees
---------------
===============
This will walk through a directory tree and run OCR on all files in place, printing the output in a way that makes
This will walk through a directory tree and run OCR on all files in
place, printing the output in a way that makes it possible to trace any
messages to the file that produced them.
.. code-block:: bash
find . -printf '%p' -name '*.pdf' -exec ocrmypdf '{}' '{}' \;
Alternatively, with a docker container (mounts a volume to the container where the PDFs are stored):
find . -printf '%p' -name '*.pdf' -exec ocrmypdf '{}' '{}' \;
Alternatively, with a docker container (mounts a volume to the container
where the PDFs are stored):
.. code-block:: bash
find . -printf '%p' -name '*.pdf' -exec docker run --rm -v <host dir>:<container dir> jbarlow83/ocrmypdf-alpine '<container dir>/{}' '<container dir>/{}' \;
find . -printf '%p' -name '*.pdf' -exec docker run --rm -v <host dir>:<container dir> jbarlow83/ocrmypdf '<container dir>/{}' '<container dir>/{}' \;
This only runs one ``ocrmypdf`` process at a time. This variation uses ``find`` to create a directory list and ``parallel`` to parallelize runs of ``ocrmypdf``, again updating files in place.
This only runs one ``ocrmypdf`` process at a time. This variation uses
``find`` to create a directory list and ``parallel`` to parallelize runs
of ``ocrmypdf``, again updating files in place.
.. code-block:: bash
find . -name '*.pdf' | parallel --tag -j 2 ocrmypdf '{}' '{}'
find . -name '*.pdf' | parallel --tag -j 2 ocrmypdf '{}' '{}'
In a Windows batch file, use
.. code-block:: bat
for /r %%f in (*.pdf) do ocrmypdf %%f %%f
Sample script
"""""""""""""
-------------
This user contributed script also provides an example of batch processing.
.. code-block:: python
#!/usr/bin/env python3
# Walk through directory tree, replacing all files with OCR'd version
# Contributed by DeliciousPickle@github
import logging
import os
import subprocess
import sys
script_dir = os.path.dirname(os.path.realpath(__file__))
print(script_dir + '/ocr-tree.py: Start')
if len(sys.argv) > 1:
start_dir = sys.argv[1]
else:
start_dir = '.'
if len(sys.argv) > 2:
log_file = sys.argv[2]
else:
log_file = script_dir + '/ocr-tree.log'
logging.basicConfig(
level=logging.INFO, format='%(asctime)s %(message)s',
filename=log_file, filemode='w')
for dir_name, subdirs, file_list in os.walk(start_dir):
logging.info('\n')
logging.info(dir_name + '\n')
os.chdir(dir_name)
for filename in file_list:
file_ext = os.path.splitext(filename)[1]
if file_ext == '.pdf':
full_path = dir_name + '/' + filename
print(full_path)
cmd = ["ocrmypdf", "--deskew", filename, filename]
logging.info(cmd)
proc = subprocess.run(
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
result = proc.stdout
if proc.returncode == 6:
print("Skipped document because it already contained text")
elif proc.returncode == 0:
print("OCR complete")
logging.info(result)
API
"""
OCRmyPDF is currently supported as a command line interface. This means that even if you are using OCRmyPDF in a Python script, you should run it in a subprocess rather importing the ocrmypdf package.
The reason for this limitation is that the `ruffus <https://github.com/bunbun/ruffus/>`_ library that OCRmyPDF depends on is unfortunately not reentrant. OCRmyPDF works by defining each operation it does as a ruffus task that takes one or more files as input and generates one or more files as output. As such ruffus is fairly fundamental.
(If you find individual functions implemented in OCRmyPDF useful (such as ``ocrmypdf.pdfinfo``), you can use these if you wish to.)
This user contributed script also provides an example of batch
processing.
.. literalinclude:: ../misc/batch.py
:caption: misc/batch.py
Synology DiskStations
"""""""""""""""""""""
Synology DiskStations (Network Attached Storage devices) can run the Docker image of OCRmyPDF if the Synology `Docker package <https://www.synology.com/en-global/dsm/packages/Docker>`_ is installed. Attached is a script to address particular quirks of using OCRmyPDF on one of these devices.
This is only possible for x86-based Synology products. Some Synology products use ARM or Power processors and do not support Docker. Further adjustments might be needed to deal with the Synology's relatively limited CPU and RAM.
.. code-block:: python
#!/bin/env python3
# Contributed by github.com/Enantiomerie
# script needs 2 arguments
# 1. source dir with *.pdf - default is location of script
# 2. move dir where *.pdf and *_OCR.pdf are moved to
import logging
import os
import subprocess
import sys
import time
import shutil
script_dir = os.path.dirname(os.path.realpath(__file__))
timestamp = time.strftime("%Y-%m-%d-%H%M_")
log_file = script_dir + '/' + timestamp + 'ocrmypdf.log'
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s', filename=log_file, filemode='w')
if len(sys.argv) > 1:
start_dir = sys.argv[1]
else:
start_dir = '.'
for dir_name, subdirs, file_list in os.walk(start_dir):
logging.info('\n')
logging.info(dir_name + '\n')
os.chdir(dir_name)
for filename in file_list:
file_ext = os.path.splitext(filename)[1]
if file_ext == '.pdf':
full_path = dir_name + '/' + filename
file_noext = os.path.splitext(filename)[0]
timestamp_OCR = time.strftime("%Y-%m-%d-%H%M_OCR_")
filename_OCR = timestamp_OCR + file_noext + '.pdf'
docker_mount = dir_name + ':/home/docker'
# create string for pdf processing
# diskstation needs a user:group docker:docker. find uid:gid of your diskstation docker:docker with id docker.
# use this uid:gid in -u flag
# rw rights for docker:docker at source dir are also necessary
# the script is processed as root user via chron
cmd = ['docker', 'run', '--rm', '-v', docker_mount, '-u=1030:65538', 'jbarlow83/ocrmypdf', , '--deskew' , filename, filename_OCR]
logging.info(cmd)
proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
result = proc.stdout.read()
logging.info(result)
full_path_OCR = dir_name + '/' + filename_OCR
os.chmod(full_path_OCR, 0o666)
os.chmod(full_path, 0o666)
full_path_OCR_archive = sys.argv[2]
full_path_archive = sys.argv[2] + '/no_ocr'
shutil.move(full_path_OCR,full_path_OCR_archive)
shutil.move(full_path, full_path_archive)
logging.info('Finished.\n')
Huge batch jobs
"""""""""""""""
If you have thousands of files to work with, contact the author. Consulting work related to OCRmyPDF helps fund this open source project and all inquiries are appreciated.
Hot (watched) folders
---------------------
To set up a "hot folder" that will trigger OCR for every file inserted, use a program like Python `watchdog <https://pypi.python.org/pypi/watchdog>`_ (supports all major OS).
Synology DiskStations (Network Attached Storage devices) can run the
Docker image of OCRmyPDF if the Synology `Docker
package <https://www.synology.com/en-global/dsm/packages/Docker>`__ is
installed. Attached is a script to address particular quirks of using
OCRmyPDF on one of these devices.
One could then configure a scanner to automatically place scanned files in a hot folder, so that they will be queued for OCR and copied to the destination.
This is only possible for x86-based Synology products. Some Synology
products use ARM or Power processors and do not support Docker. Further
adjustments might be needed to deal with the Synology's relatively
limited CPU and RAM.
.. code-block:: bash
.. literalinclude:: ../misc/synology.py
:caption: misc/synology.py - Sample script for Synology DiskStations
pip install watchdog
watchdog installs the command line program ``watchmedo``, which can be told to run ``ocrmypdf`` on any .pdf added to the current directory (``.``) and place the result in the previously created ``out/`` folder.
.. code-block:: bash
cd hot-folder
mkdir out
watchmedo shell-command \
--patterns="*.pdf" \
--ignore-directories \
--command='ocrmypdf "${watch_src_path}" "out/${watch_src_path}" ' \
. # don't forget the final dot
For more complex behavior you can write a Python script around to use the watchdog API.
On file servers, you could configure watchmedo as a system service so it will run all the time.
Caveats
"""""""
* ``watchmedo`` may not work properly on a networked file system, depending on the capabilities of the file system client and server.
* This simple recipe does not filter for the type of file system event, so file copies, deletes and moves, and directory operations, will all be sent to ocrmypdf, producing errors in several cases. Disable your watched folder if you are doing anything other than copying files to it.
* If the source and destination directory are the same, watchmedo may create an infinite loop.
* On BSD, FreeBSD and older versions of macOS, you may need to increase the number of file descriptors to monitor more files, using ``ulimit -n 1024`` to watch a folder of up to 1024 files.
Alternatives
""""""""""""
* `Watchman <https://facebook.github.io/watchman/>`_ is a more powerful alternative to ``watchmedo``.
macOS Automator
Huge batch jobs
---------------
You can use the Automator app with macOS, to create a Workflow or Quick Action. Use a *Run Shell Script* action in your workflow. In the context of Automator, the ``PATH`` may be set differently your Terminal's ``PATH``; you may need to explicitly set the PATH to include ``ocrmypdf``. The following example may serve as a starting point:
If you have thousands of files to work with, contact the author.
Consulting work related to OCRmyPDF helps fund this open source project
and all inquiries are appreciated.
.. image:: images/macos-workflow.png
:alt: Example macOS Automator script
Hot (watched) folders
=====================
Watched folders with watcher.py
-------------------------------
OCRmyPDF has a folder watcher called watcher.py, which is currently included in source
distributions but not part of the main program. It may be used natively or may run
in a Docker container. Native instances tend to give better performance. watcher.py
works on all platforms.
Users may need to customize the script to meet their requirements.
.. code-block:: bash
pip3 install -r requirements/watcher.txt
env OCR_INPUT_DIRECTORY=/mnt/input-pdfs \
OCR_OUTPUT_DIRECTORY=/mnt/output-pdfs \
OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
python3 watcher.py
.. csv-table:: watcher.py environment variables
:header: "Environment variable", "Description"
:widths: 50, 50
"OCR_INPUT_DIRECTORY", "Set input directory to monitor (recursive)"
"OCR_OUTPUT_DIRECTORY", "Set output directory (should not be under input)"
"OCR_ON_SUCCESS_DELETE", "This will delete the input file if the exit code is 0 (OK)"
"OCR_OUTPUT_DIRECTORY_YEAR_MONTH", "This will place files in the output in ``{output}/{year}/{month}/{filename}``"
"OCR_DESKEW", "Apply deskew to crooked input PDFs"
"OCR_JSON_SETTINGS", "A JSON string specifying any other arguments for ``ocrmypdf.ocr``, e.g. ``'OCR_JSON_SETTINGS={""rotate_pages"": true}'``."
"OCR_POLL_NEW_FILE_SECONDS", "Polling interval"
"OCR_LOGLEVEL", "Level of log messages to report"
One could configure a networked scanner or scanning computer to drop files in the
watched folder.
Watched folders with Docker
---------------------------
The watcher service is included in the OCRmyPDF Docker image. To run it:
.. code-block:: bash
docker run \
-v <path to files to convert>:/input \
-v <path to store results>:/output \
-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
-e OCR_ON_SUCCESS_DELETE=1 \
-e OCR_DESKEW=1 \
-e PYTHONUNBUFFERED=1 \
-it --entrypoint python3 \
jbarlow83/ocrmypdf \
watcher.py
This service will watch for a file that matches ``/input/*.pdf`` and will
convert it to an OCRed PDF in ``/output/``. The parameters to this image are:
.. csv-table:: watcher.py parameters for Docker
:header: "Parameter", "Description"
:widths: 50, 50
"``-v <path to files to convert>:/input``", "Files placed in this location will be OCRed"
"``-v <path to store results>:/output``", "This is where OCRed files will be stored"
"``-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1``", "Define environment variable OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1"
"``-e OCR_ON_SUCCESS_DELETE=1``", "Define environment variable"
"``-e OCR_DESKEW=1``", "Define environment variable"
"``-e PYTHONUNBUFFERED=1``", "This will force STDOUT to be unbuffered and allow you to see messages in docker logs"
This service relies on polling to check for changes to the filesystem. It
may not be suitable for some environments, such as filesystems shared on a
slow network.
A configuration manager such as Docker Compose could be used to ensure that the
service is always available.
.. literalinclude:: ../misc/docker-compose.example.yml
:language: yaml
:caption: misc/docker-compose.example.yml
Caveats
-------
- ``watchmedo`` may not work properly on a networked file system,
depending on the capabilities of the file system client and server.
- This simple recipe does not filter for the type of file system event,
so file copies, deletes and moves, and directory operations, will all
be sent to ocrmypdf, producing errors in several cases. Disable your
watched folder if you are doing anything other than copying files to
it.
- If the source and destination directory are the same, watchmedo may
create an infinite loop.
- On BSD, FreeBSD and older versions of macOS, you may need to increase
the number of file descriptors to monitor more files, using
``ulimit -n 1024`` to watch a folder of up to 1024 files.
Alternatives
------------
- On Linux, `systemd user services <https://wiki.archlinux.org/index.php/Systemd/User>`__
can be configured to automatically perform OCR on a collection of files.
- `Watchman <https://facebook.github.io/watchman/>`__ is a more
powerful alternative to ``watchmedo``.
macOS Automator
===============
You can use the Automator app with macOS, to create a Workflow or Quick
Action. Use a *Run Shell Script* action in your workflow. In the context
of Automator, the ``PATH`` may be set differently from your Terminal's
``PATH``; you may need to explicitly set the PATH to include
``ocrmypdf``. The following example may serve as a starting point:
.. figure:: images/macos-workflow.png
:alt: Example macOS Automator workflow
You may customize the command sent to ocrmypdf.

View File

@@ -1,5 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# ocrmypdf documentation build configuration file, created by
# sphinx-quickstart on Sun Sep 4 14:29:43 2016.
@@ -21,6 +20,8 @@
# import sys
# sys.path.insert(0, os.path.abspath('.'))
"""isort:skip_file"""
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
@@ -30,9 +31,9 @@
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
# 'sphinx.ext.mathjax',
]
extensions = ['sphinx.ext.napoleon']
napoleon_use_rtype = False
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
@@ -53,7 +54,7 @@ master_doc = 'index'
# General information about the project.
project = 'ocrmypdf'
copyright = (
'2019, James R. Barlow. Licensed under Creative Commons Attribution-ShareAlike 4.0.'
'2020, James R. Barlow. Licensed under Creative Commons Attribution-ShareAlike 4.0.'
)
author = 'James R. Barlow'
@@ -92,6 +93,7 @@ from pkg_resources import get_distribution, DistributionNotFound
release = get_distribution('ocrmypdf').version
version = '.'.join(release.split('.')[:2])
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
@@ -176,7 +178,7 @@ html_theme_options = {'display_version': False}
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#
# html_logo = None
# html_logo = "images/logo.svg" # looks bad
# The name of an image file (relative to this directory) to use as a favicon of
# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32

67
docs/contributing.rst Normal file
View File

@@ -0,0 +1,67 @@
=======================
Contributing guidelines
=======================
Contributions are welcome!
Big changes
===========
Please open a new issue to discuss or propose a major change. Not only is it fun
to discuss big ideas, but we might save each other's time too. Perhaps some of the
work you're contemplating is already half-done in a development branch.
Code style
==========
We use PEP8, ``black`` for code formatting and ``isort`` for import sorting. The
settings for these programs are in ``pyproject.toml`` and ``setup.cfg``. Pull
requests should follow the style guide. One difference we use from "black" style
is that strings shown to the user are always in double quotes (``"``) and strings
for internal uses are in single quotes (``'``).
Tests
=====
New features should come with tests that confirm their correctness.
New Python dependencies
=======================
If you are proposing a change that will require a new Python dependency, we
prefer dependencies that are already packaged by Debian or Red Hat. This makes
life much easier for our downstream package maintainers.
Python dependencies must also be license-compatible. GPLv3 or AGPLv3 are likely
incompatible with the project's license, but LGPLv3 is compatible.
New non-Python dependencies
===========================
OCRmyPDF uses several external programs (Tesseract, Ghostscript and others) for
its functionality. In general we prefer to avoid adding new external programs.
Style guide: Is it OCRmyPDF or ocrmypdf?
========================================
The program/project is OCRmyPDF and the name of the executable or library is ocrmypdf.
Known ports/packagers
=====================
OCRmyPDF has been ported to many platforms already. If you are interested in
porting to a new platform, check with
`Repology <https://repology.org/projects/?search=ocrmypdf>`__ to see the status
of that platform.
Package maintainers, please ensure that the command line completion scripts in
``misc/`` are installed.
Copyright and license
=====================
For contributions over 10 lines of code, please add your name to the list of
copyright holders for that file. The core program is licensed under MPL-2.0,
test files and documentation under CC-BY-SA 4.0, and miscellaneous files under
MIT. Please contribute only code that you wrote and that you have permission to
contribute or license to us.

View File

@@ -1,11 +1,12 @@
========
Cookbook
========
Basic examples
--------------
==============
Help!
^^^^^
-----
ocrmypdf has built-in help.
@@ -13,30 +14,29 @@ ocrmypdf has built-in help.
ocrmypdf --help
Add an OCR layer and convert to PDF/A
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-------------------------------------
.. code-block:: bash
ocrmypdf input.pdf output.pdf
Add an OCR layer and output a standard PDF
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
------------------------------------------
.. code-block:: bash
ocrmypdf --output-type pdf input.pdf output.pdf
Create a PDF/A with all color and grayscale images converted to JPEG
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
--------------------------------------------------------------------
.. code-block:: bash
ocrmypdf --output-type pdfa --pdfa-image-compression jpeg input.pdf output.pdf
Modify a file in place
^^^^^^^^^^^^^^^^^^^^^^
----------------------
The file will only be overwritten if OCRmyPDF is successful.
@@ -45,48 +45,76 @@ The file will only be overwritten if OCRmyPDF is successful.
ocrmypdf myfile.pdf myfile.pdf
Correct page rotation
^^^^^^^^^^^^^^^^^^^^^
---------------------
OCR will attempt to automatic correct the rotation of each page. This can help fix a scanning job that contains a mix of landscape and portrait pages.
OCR will attempt to automatically correct the rotation of each page. This
can help fix a scanning job that contains a mix of landscape and
portrait pages.
.. code-block:: bash
ocrmypdf --rotate-pages myfile.pdf myfile.pdf
You can increase (decrease) the parameter ``--rotate-pages-threshold`` to make page rotation more (less) aggressive.
You can increase (decrease) the parameter ``--rotate-pages-threshold``
to make page rotation less (more) aggressive. The threshold number is the ratio
of how confident the OCR engine is that the document image should be changed,
compared to being kept the same. The default value is quite conservative; on some files
it may not attempt rotations at all unless it is very confident that the current
rotation is wrong. A lower value of ``2.0`` will produce more rotations, and
more false positives. Run with ``-v1`` to see the confidence level for each
page to see if there may be a better value for your files.
If the page is "just a little off horizontal", like a crooked picture, then you want ``--deskew``. ``--rotate-pages`` is for when the cardinal angle is wrong.
If the page is "just a little off horizontal", like a crooked picture,
then you want ``--deskew``. ``--rotate-pages`` is for when the cardinal
angle is wrong.
OCR languages other than English
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
--------------------------------
OCRmyPDF assumes the document is in English unless told otherwise. OCR quality may be poor if the wrong language is used.
OCRmyPDF assumes the document is in English unless told otherwise. OCR
quality may be poor if the wrong language is used.
.. code-block:: bash
ocrmypdf -l fra LeParisien.pdf LeParisien.pdf
ocrmypdf -l eng+fra Bilingual-English-French.pdf Bilingual-English-French.pdf
Language packs must be installed for all languages specified. See :ref:`Installing additional language packs <lang-packs>`.
Language packs must be installed for all languages specified. See
:ref:`Installing additional language packs <lang-packs>`.
Unfortunately, the Tesseract OCR engine has no ability to detect the language when it is unknown.
Unfortunately, the Tesseract OCR engine has no ability to detect the
language when it is unknown.
Produce PDF and text file containing OCR text
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
---------------------------------------------
This produces a file named "output.pdf" and a companion text file named "output.txt".
This produces a file named "output.pdf" and a companion text file named
"output.txt".
.. code-block:: bash
ocrmypdf --sidecar output.txt input.pdf output.pdf
.. note::
The sidecar file contains the **OCR text** found by OCRmyPDF. If the document
contains pages that already have text, that text will not appear in the
sidecar. If the option ``--pages`` is used, only those pages on which OCR
was performed will be included in the sidecar. If certain pages were skipped
because of options like ``--skip-big`` or ``--tesseract-timeout``, those pages
will not be in the sidecar.
To extract all text from a PDF, whether generated from OCR or otherwise,
use a program like Poppler's ``pdftotext`` or ``pdfgrep``.
OCR images, not PDFs
^^^^^^^^^^^^^^^^^^^^
--------------------
Option: use Tesseract
"""""""""""""""""""""
~~~~~~~~~~~~~~~~~~~~~
If you are starting with images, you can just use Tesseract directly to convert images to PDFs:
If you are starting with images, you can just use Tesseract directly to
convert images to PDFs:
.. code-block:: bash
@@ -97,62 +125,88 @@ If you are starting with images, you can just use Tesseract directly to convert
# When there are multiple images
tesseract text-file-containing-list-of-image-filenames.txt output-prefix pdf
Tesseract's PDF output is quite good  OCRmyPDF uses it internally, in some cases. However, OCRmyPDF has many features not available in Tesseract like image processing, metadata control, and PDF/A generation.
Tesseract's PDF output is quite good -- OCRmyPDF uses it internally, in
some cases. However, OCRmyPDF has many features not available in
Tesseract like image processing, metadata control, and PDF/A generation.
Option: use img2pdf
"""""""""""""""""""
~~~~~~~~~~~~~~~~~~~
You can also use a program like `img2pdf <https://gitlab.mister-muffin.de/josch/img2pdf>`_ to convert your images to PDFs, and then pipe the results to run ocrmypdf. The ``-`` tells ocrmypdf to read standard input.
You can also use a program like
`img2pdf <https://gitlab.mister-muffin.de/josch/img2pdf>`__ to convert
your images to PDFs, and then pipe the results to run ocrmypdf. The
``-`` tells ocrmypdf to read standard input.
.. code-block:: bash
img2pdf my-images*.jpg | ocrmypdf - myfile.pdf
``img2pdf`` is recommended because it does an excellent job at generating PDFs without transcoding images.
``img2pdf`` is recommended because it does an excellent job at
generating PDFs without transcoding images.
Option: use OCRmyPDF (single images only)
"""""""""""""""""""""""""""""""""""""""""
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
For convenience, OCRmyPDF can also convert single images to PDFs on its own. If the resolution (dots per inch, DPI) of an image is not set or is incorrect, it can be overridden with ``--image-dpi``. (As 1 inch is 2.54 cm, 1 dpi = 0.39 dpcm).
For convenience, OCRmyPDF can also convert single images to PDFs on its
own. If the resolution (dots per inch, DPI) of an image is not set or is
incorrect, it can be overridden with ``--image-dpi``. (As 1 inch is 2.54
cm, 1 dpi = 0.39 dpcm).
.. code-block:: bash
ocrmypdf --image-dpi 300 image.png myfile.pdf
If you have multiple images, you must use ``img2pdf`` to convert the images to PDF.
If you have multiple images, you must use ``img2pdf`` to convert the
images to PDF.
Not recommended
"""""""""""""""
~~~~~~~~~~~~~~~
We caution against using ImageMagick or Ghostscript to convert images to PDF, since they may transcode images or produce downsampled images, sometimes without warning.
We caution against using ImageMagick or Ghostscript to convert images to
PDF, since they may transcode images or produce downsampled images,
sometimes without warning.
Image processing
----------------
================
OCRmyPDF perform some image processing on each page of a PDF, if desired. The same processing is applied to each page. It is suggested that the user review files after image processing as these commands might remove desirable content, especially from poor quality scans.
OCRmyPDF can perform some image processing on each page of a PDF, if
desired. The same processing is applied to each page. It is suggested
that the user review files after image processing as these commands
might remove desirable content, especially from poor quality scans.
* ``--rotate-pages`` attempts to determine the correct orientation for each page and rotates the page if necessary.
* ``--remove-background`` attempts to detect and remove a noisy background from grayscale or color images. Monochrome images are ignored. This should not be used on documents that contain color photos as it may remove them.
* ``--deskew`` will correct pages were scanned at a skewed angle by rotating them back into place. Skew determination and correction is performed using `Postl's variance of line sums <http://www.leptonica.com/skew-measurement.html>`_ algorithm as implemented in `Leptonica <http://www.leptonica.com/index.html>`_.
* ``--clean`` uses `unpaper <https://www.flameeyes.eu/projects/unpaper>`_ to clean up pages before OCR, but does not alter the final output. This makes it less likely that OCR will try to find text in background noise.
* ``--clean-final`` uses unpaper to clean up pages before OCR and inserts the page into the final output. You will want to review each page to ensure that unpaper did not remove something important.
* ``--mask-barcodes`` will suppress any barcodes detected in a page image. Barcodes are known to confuse Tesseract OCR and interfere with the recognition of text on the same baseline as a barcode. The output file will contain the unaltered image of the barcode.
- ``--rotate-pages`` attempts to determine the correct orientation for
each page and rotates the page if necessary.
- ``--remove-background`` attempts to detect and remove a noisy
background from grayscale or color images. Monochrome images are
ignored. This should not be used on documents that contain color
photos as it may remove them.
- ``--deskew`` will correct pages that were scanned at a skewed angle by
rotating them back into place. Skew determination and correction is
performed using `Postl's variance of line
sums <http://www.leptonica.org/skew-measurement.html>`__ algorithm as
implemented in `Leptonica <http://www.leptonica.org/index.html>`__.
- ``--clean`` uses
`unpaper <https://www.flameeyes.eu/projects/unpaper>`__ to clean up
pages before OCR, but does not alter the final output. This makes it
less likely that OCR will try to find text in background noise.
- ``--clean-final`` uses unpaper to clean up pages before OCR and
inserts the page into the final output. You will want to review each
page to ensure that unpaper did not remove something important.
.. note::
In many cases image processing will rasterize PDF pages as images, potentially losing quality.
In many cases image processing will rasterize PDF pages as images,
potentially losing quality.
.. warning::
``--clean-final`` and ``-remove-background`` may leave undesirable visual artifacts in some images where their algorithms have shortcomings. Files should be visually reviewed after using these options.
``--clean-final`` and ``--remove-background`` may leave undesirable
visual artifacts in some images where their algorithms have
shortcomings. Files should be visually reviewed after using these
options.
Example: OCR and correct document skew (crooked scan)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-----------------------------------------------------
Deskew:
@@ -160,55 +214,118 @@ Deskew:
ocrmypdf --deskew input.pdf output.pdf
Image processing commands can be combined. The order in which options are given does not matter. OCRmyPDF always applies the steps of the image processing pipeline in the same order (rotate, remove background, deskew, clean).
Image processing commands can be combined. The order in which options
are given does not matter. OCRmyPDF always applies the steps of the
image processing pipeline in the same order (rotate, remove background,
deskew, clean).
.. code-block:: bash
ocrmypdf --deskew --clean --rotate-pages input.pdf output.pdf
Don't actually OCR my PDF
-------------------------
=========================
If you set ``--tesseract-timeout 0``, OCRmyPDF will apply its image processing without performing OCR. This is useful if all you want to do is apply image processing or PDF/A conversion.
If you set ``--tesseract-timeout 0``, OCRmyPDF will apply its image
processing without performing OCR. This is useful if all you want to do
is apply image processing or PDF/A conversion.
.. code-block:: bash
ocrmypdf --tesseract-timeout=0 --remove-background input.pdf output.pdf
Optimize images without performing OCR
--------------------------------------
You can also optimize all images without performing any OCR:
.. code-block:: bash
ocrmypdf --tesseract-timeout=0 --optimize 3 --skip-text input.pdf output.pdf
Perform OCR on only certain pages
---------------------------------
You can ask OCRmyPDF to only apply OCR to certain pages.
.. code-block:: bash
ocrmypdf --pages 2,3,13-17 input.pdf output.pdf
Hyphens denote a range of pages and commas separate page numbers. If you prefer
to use spaces, quote all of the page numbers: ``--pages '2, 3, 5, 7'``.
OCRmyPDF will warn if your list of page numbers contains duplicates or
overlapping pages. OCRmyPDF does not currently account for document page numbers,
such as an introduction section of a book that uses Roman numerals. It simply
counts the number of virtual pieces of paper since the start.
Regardless of the argument to ``--pages``, OCRmyPDF will optimize all pages in
the file and convert it to PDF/A, unless you disable those options. In this
example, we want to OCR only the title and otherwise change the PDF as little
as possible:
.. code-block:: bash
ocrmypdf --pages 1 --output-type pdf --optimize 0 input.pdf output.pdf
Redo existing OCR
-----------------
=================
To redo OCR on a file OCRed with other OCR software or a previous version of OCRmyPDF and/or Tesseract, you may use the ``--redo-ocr`` argument. (Normally, OCRmyPDF will exit with an error if asked to modify a file with OCR.)
To redo OCR on a file OCRed with other OCR software or a previous
version of OCRmyPDF and/or Tesseract, you may use the ``--redo-ocr``
argument. (Normally, OCRmyPDF will exit with an error if asked to modify
a file with OCR.)
This may be helpful for users who want to take advantage of accuracy improvements in Tesseract 4.0 for files they previously OCRed with an earlier version of Tesseract and OCRmyPDF.
This may be helpful for users who want to take advantage of accuracy
improvements in Tesseract 4.0 for files they previously OCRed with an
earlier version of Tesseract and OCRmyPDF.
.. code-block:: bash
ocrmypdf --redo-ocr input.pdf output.pdf
This method will replace OCR without rasterizing, reducing quality or removing vector content. If a file contains a mix of pure digital text and OCR, digital text will be ignored and OCR will be replaced. As such this mode is incompatible with image processing options, since they alter the appearance of the file.
This method will replace OCR without rasterizing, reducing quality or
removing vector content. If a file contains a mix of pure digital text
and OCR, digital text will be ignored and OCR will be replaced. As such
this mode is incompatible with image processing options, since they
alter the appearance of the file.
In some cases, existing OCR cannot be detected or replaced. Files produced by OCRmyPDF v2.2 or earlier, for example, are internally represented as having visible text with an opaque image drawn on top. This situation cannot be detected.
In some cases, existing OCR cannot be detected or replaced. Files
produced by OCRmyPDF v2.2 or earlier, for example, are internally
represented as having visible text with an opaque image drawn on top.
This situation cannot be detected.
If ``--redo-ocr`` does not work, you can use ``--force-ocr``, which will force rasterization of all pages, potentially reducing quality or losing vector content.
If ``--redo-ocr`` does not work, you can use ``--force-ocr``, which will
force rasterization of all pages, potentially reducing quality or losing
vector content.
Improving OCR quality
---------------------
=====================
The `Image processing`_ features can improve OCR quality.
The `Image processing <#image-processing>`__ features can improve OCR
quality.
Rotating pages and deskewing helps to ensure that the page orientation is correct before OCR begins. Removing the background and/or cleaning the page can also improve results. The ``--oversample DPI`` argument can be specified to resample images to higher resolution before attempting OCR; this can improve results as well.
Rotating pages and deskewing helps to ensure that the page orientation
is correct before OCR begins. Removing the background and/or cleaning
the page can also improve results. The ``--oversample DPI`` argument can
be specified to resample images to higher resolution before attempting
OCR; this can improve results as well.
OCR quality will suffer if the resolution of input images is not correct (since the range of pixel sizes that will be checked for possible fonts will also be incorrect).
OCR quality will suffer if the resolution of input images is not correct
(since the range of pixel sizes that will be checked for possible fonts
will also be incorrect).
PDF optimization
----------------
================
By default OCRmyPDF will attempt to perform lossless optimizations on the images inside PDFs after OCR is complete. Optimization is performed even if no OCR text is found.
By default OCRmyPDF will attempt to perform lossless optimizations on
the images inside PDFs after OCR is complete. Optimization is performed
even if no OCR text is found.
The ``--optimize N`` (short form ``-O``) argument controls optimization, where ``N`` ranges from 0 to 3 inclusive, analogous to the optimization levels in the GCC compiler.
The ``--optimize N`` (short form ``-O``) argument controls optimization,
where ``N`` ranges from 0 to 3 inclusive, analogous to the optimization
levels in the GCC compiler.
.. list-table::
:widths: auto
@@ -227,9 +344,15 @@ The ``--optimize N`` (short form ``-O``) argument controls optimization, where `
* - ``--optimize 3``
- All of the above, and enables more aggressive optimizations and targets lower image quality.
Optimization is improved when a JBIG2 encoder is available and when ``pngquant`` is installed. If either of these components are missing, then some types of images cannot be optimized.
Optimization is improved when a JBIG2 encoder is available and when
``pngquant`` is installed. If either of these components are missing,
then some types of images cannot be optimized.
The types of optimization available may expand over time. By default, OCRmyPDF compresses data streams inside PDFs, and will change inefficient compression modes to more modern versions. A program like ``qpdf`` can be used to change encodings, e.g. to inspect the internals of a PDF.
The types of optimization available may expand over time. By default,
OCRmyPDF compresses data streams inside PDFs, and will change
inefficient compression modes to more modern versions. A program like
``qpdf`` can be used to change encodings, e.g. to inspect the internals
of a PDF.
.. code-block:: bash

View File

@@ -1,155 +1,196 @@
.. _docker:
=====================
OCRmyPDF Docker image
=====================
OCRmyPDF is also available in a Docker image that packages recent versions of all dependencies.
OCRmyPDF is also available in a Docker image that packages recent
versions of all dependencies.
For users who already have Docker installed this may be an easy and convenient option. However, it is less performant than a system installation and may require Docker engine configuration.
For users who already have Docker installed this may be an easy and
convenient option. However, it is less performant than a system
installation and may require Docker engine configuration.
OCRmyPDF needs a generous amount of RAM, CPU cores, and temporary storage space.
OCRmyPDF needs a generous amount of RAM, CPU cores, and temporary storage
space, whether running in a Docker container or on its own. It may be
necessary to ensure the container is provisioned with additional
resources.
.. _docker-install:
Installing the Docker image
---------------------------
===========================
If you have `Docker <https://docs.docker.com/>`_ installed on your system, you can install a Docker image of the latest release.
If you have `Docker <https://docs.docker.com/>`__ installed on your
system, you can install a Docker image of the latest release.
The recommended OCRmyPDF Docker image is currently named ``ocrmypdf-alpine``:
If you can run this command successfully, your system is ready to download and
execute the image:
.. code-block:: bash
docker pull jbarlow83/ocrmypdf-alpine
docker run hello-world
Follow the Docker installation instructions for your platform. If you can run this command successfully, your system is ready to download and execute the image:
The recommended OCRmyPDF Docker image is currently named ``ocrmypdf``:
.. code-block:: bash
docker run hello-world
docker pull jbarlow83/ocrmypdf
OCRmyPDF will use all available CPU cores. By default, the VirtualBox machine instance on Windows and macOS has only a single CPU core enabled. Use the VirtualBox Manager to determine the name of your Docker engine host, and then follow these optional steps to enable multiple CPUs:
OCRmyPDF will use all available CPU cores. By default, the VirtualBox
machine instance on Windows and macOS has only a single CPU core
enabled. Use the VirtualBox Manager to determine the name of your Docker
engine host, and then follow these optional steps to enable multiple
CPUs:
.. code-block:: bash
# Optional step for Mac OS X users
docker-machine stop "yourVM"
VBoxManage modifyvm "yourVM" --cpus 2 # or whatever number of core is desired
docker-machine start "yourVM"
eval $(docker-machine env "yourVM")
# Optional step for Mac OS X users
docker-machine stop "yourVM"
VBoxManage modifyvm "yourVM" --cpus 2 # or whatever number of core is desired
docker-machine start "yourVM"
eval $(docker-machine env "yourVM")
See the Docker documentation for
`adjusting memory and CPU on other platforms <https://docs.docker.com/config/containers/resource_constraints/>`__.
Using the Docker image on the command line
------------------------------------------
==========================================
**Unlike typical Docker containers**, in this mode we are using the OCRmyPDF Docker container is intended to be emphemeral it runs for one OCR job and then terminates, just like a command line program. We are using Docker as a way of delivering an application, not a server.
**Unlike typical Docker containers**, in this section the OCRmyPDF Docker
container is ephemeral: it runs for one OCR job and terminates, just like a
command line program. We are using Docker to deliver an application (as opposed
to the more conventional case, where a Docker container runs as a server).
To start a Docker container (instance of the image):
.. code-block:: bash
docker tag jbarlow83/ocrmypdf-alpine ocrmypdf
docker run --rm ocrmypdf (... all other arguments here...)
docker tag jbarlow83/ocrmypdf ocrmypdf
docker run --rm -i ocrmypdf (... all other arguments here...) - -
For convenience, create a shell alias to hide the Docker command:
For convenience, create a shell alias to hide the Docker command. It is
easier to send the input file as stdin and read the output from
stdout; **this avoids the messy permission issues with Docker entirely**.
.. code-block:: bash
alias ocrmypdf='docker run --rm -v "$(pwd):/home/docker" ocrmypdf'
ocrmypdf --version # runs docker version
alias docker_ocrmypdf='docker run --rm -i ocrmypdf'
docker_ocrmypdf --version # runs docker version
docker_ocrmypdf - - <input.pdf >output.pdf
Or in the wonderful `fish shell <https://fishshell.com/>`_:
Or in the wonderful `fish shell <https://fishshell.com/>`__:
.. code-block:: fish
alias ocrmypdf 'docker run --rm ocrmypdf'
funcsave ocrmypdf
alias docker_ocrmypdf 'docker run --rm -i ocrmypdf'
funcsave docker_ocrmypdf
Alternately, you could mount the local current working directory as a
Docker volume:
.. code-block:: bash
alias docker_ocrmypdf='docker run --rm -i --user "$(id -u):$(id -g)" --workdir /data -v "$PWD:/data" ocrmypdf'
docker_ocrmypdf /data/input.pdf /data/output.pdf
.. _docker-lang-packs:
Adding languages to the Docker image
------------------------------------
====================================
By default the Docker image includes English, German and Simplified Chinese, the most popular languages for OCRmyPDF users based on feedback. You may add other languages by creating a new Dockerfile based on the public one:
By default the Docker image includes English, German, Simplified Chinese,
French, Portuguese and Spanish, the most popular languages for OCRmyPDF
users based on feedback. You may add other languages by creating a new
Dockerfile based on the public one.
.. code-block:: dockerfile
FROM jbarlow83/ocrmypdf-alpine
FROM jbarlow83/ocrmypdf
# Add French
RUN apk add tesseract-ocr-data-fra
# Example: add Italian
RUN apt-get update && apt-get install -y --no-install-recommends tesseract-ocr-ita && rm -rf /var/lib/apt/lists/*
To install language packs (training data) such as the
`tessdata_best <https://github.com/tesseract-ocr/tessdata_best>`_ suite or
custom data, you first need to determine the version of Tesseract data files, which
may differ from the Tesseract program version. Use this command to determine the data
file version:
.. code-block:: bash
docker run -i --rm --entrypoint /bin/ls jbarlow83/ocrmypdf /usr/share/tesseract-ocr
As of 2021, the data file version is probably ``4.00``.
You can then add new data with a Dockerfile:
.. code-block:: dockerfile
FROM jbarlow83/ocrmypdf
# Example: add a tessdata_best file
COPY chi_tra_vert.traineddata /usr/share/tesseract-ocr/<data version>/tessdata/
Alternately, you can copy training data into a Docker container as follows:
.. code-block:: bash
docker cp mycustomtraining.traineddata name_of_container:/usr/share/tesseract-ocr/<data version>/tessdata/
Executing the test suite
------------------------
========================
The OCRmyPDF test suite is installed with the image. To run it:
The OCRmyPDF test suite is installed with the image. To run it:
.. code-block:: bash
docker run --entrypoint python3 jbarlow83/ocrmypdf-alpine setup.py test
docker run --entrypoint python3 jbarlow83/ocrmypdf -m pytest
Accessing the shell
===================
To use the bash shell in the Docker image:
.. code-block:: bash
docker run -it --entrypoint bash jbarlow83/ocrmypdf
Using the OCRmyPDF web service wrapper
--------------------------------------
======================================
The OCRmyPDF Docker image includes an example, barebones HTTP web service. The webservice may be launched as follows:
The OCRmyPDF Docker image includes an example, barebones HTTP web
service. The webservice may be launched as follows:
.. code-block:: bash
docker run --entrypoint python3 -p 5000:5000 jbarlow83/ocrmypdf-alpine webservice.py
docker run --entrypoint python3 -p 5000:5000 jbarlow83/ocrmypdf webservice.py
Unlike command line usage this program will open a socket and wait for connections.
This will configure the machine to listen on port 5000. On Linux machines
this is port 5000 of localhost. On macOS or Windows machines running
Docker, this is port 5000 of the virtual machine that runs your Docker
images. You can find its IP address using the command ``docker-machine ip``.
Unlike command line usage this program will open a socket and wait for
connections.
.. warning::
The OCRmyPDF web service wrapper is intended for demonstration or development. It provides no security, no authentication, no protection against denial of service attacks, and no load balancing. The default Flask WSGI server is used, which is intended for development only. The server is single-threaded and so can respond to only one client at a time. It cannot respond to clients while busy with OCR.
The OCRmyPDF web service wrapper is intended for demonstration or
development. It provides no security, no authentication, no
protection against denial of service attacks, and no load balancing.
The default Flask WSGI server is used, which is intended for
development only. The server is single-threaded and so can respond to
only one client at a time. While running OCR, it cannot respond to
any other clients.
Clients must keep their open connection while waiting for OCR to complete. This may entail setting a long timeout; this interface is more useful for internal HTTP API calls.
Clients must keep their connection open while waiting for OCR to
complete. This may entail setting a long timeout; this interface is more
useful for internal HTTP API calls.
Unlike the rest of OCRmyPDF, this web service is licensed under the Affero GPLv3 (AGPLv3) since Ghostscript, a dependency of OCRmyPDF, is also licensed in this way.
Unlike the rest of OCRmyPDF, this web service is licensed under the
Affero GPLv3 (AGPLv3) since Ghostscript is also licensed in this way.
In addition to the above, please read our :ref:`general remarks on using OCRmyPDF as a service <ocr-service>`.
Legacy Ubuntu Docker images
---------------------------
Previously OCRmyPDF was delivered in several Docker images for different purposes, based on Ubuntu.
The Ubuntu-based images will be maintained for some time but should not be used for new deployments. They are as follows:
.. list-table::
:widths: auto
:header-rows: 1
* - Image name
- Download command
- Notes
* - ocrmypdf
- ``docker pull jbarlow83/ocrmypdf``
- Latest ocrmypdf with Tesseract 4.0.0-beta1 on Ubuntu 18.04. Includes English, French, German, Spanish, Portuguese and Simplified Chinese.
* - ocrmypdf-polyglot
- ``docker pull jbarlow83/ocrmypdf-polyglot``
- As above, with all available language packs.
* - ocrmypdf-webservice
- ``docker pull jbarlow83/ocrmypdf-webservice``
- All language packs, and a simple HTTP wrapper allowing OCRmyPDF to be used as a web service. Note that this component is licensed under AGPLv3.
To execute the Ubuntu-based OCRmyPDF on a local file, you must `provide a writable volume to the Docker image <https://docs.docker.com/userguide/dockervolumes/>`_, and both the input and output file must be inside the writable volume. This limitation applies only to the legacy images.
This example command uses the current working directory as the writable volume:
.. code-block:: bash
docker run --rm -v "$(pwd):/home/docker" <other docker arguments> ocrmypdf <your arguments to ocrmypdf>
In this worked example, the current working directory contains an input file called ``test.pdf`` and the output will go to ``output.pdf``:
.. code-block:: bash
docker run --rm -v "$(pwd):/home/docker" ocrmypdf --skip-text test.pdf output.pdf
.. note:: The working directory should be a writable local volume or Docker may not have permission to access it.
Note that ``ocrmypdf`` has its own separate ``-v VERBOSITYLEVEL`` argument to control debug verbosity. All Docker arguments should come before the ``ocrmypdf`` image name and all arguments to ``ocrmypdf`` should be listed after.
In some environments the permissions associated with Docker can be complex to configure. The process that executes Docker may end up not having the permissions to write the specified file system. In that case one can stream the file into and out of the Docker process and avoid all permission hassles, using ``-`` as the input and output filename:
.. code-block:: bash
docker run --rm -i ocrmypdf <other arguments to ocrmypdf> - - <input.pdf >output.pdf
In addition to the above, please read our
:ref:`general remarks on using OCRmyPDF as a service <ocr-service>`.

View File

@@ -1,33 +1,53 @@
=====================
Common error messages
=====================
Page already has text
---------------------
=====================
.. code::
.. code-block::
ERROR - 1: page already has text! aborting (use --force-ocr to force OCR)
ERROR - 1: page already has text! aborting (use --force-ocr to force OCR)
You ran ocrmypdf on a file that already contains printable text or a hidden OCR text layer (it can't quite tell the difference). You probably don't want to do this, because the file is already searchable.
You ran ocrmypdf on a file that already contains printable text or a
hidden OCR text layer (it can't quite tell the difference). You probably
don't want to do this, because the file is already searchable.
As the error message suggests, your options are:
- ``ocrmypdf --force-ocr`` to :ref:`rasterize <raster-vector>` all vector content and run OCR on the images. This is useful if a previous OCR program failed, or if the document contains a text watermark.
- ``ocrmypdf --skip-text`` to skip OCR and other processing on any pages that contain text. Text pages will be copied into the output PDF without modification.
- ``ocrmypdf --force-ocr`` to :ref:`rasterize <raster-vector>` all
vector content and run OCR on the images. This is useful if a
previous OCR program failed, or if the document contains a text
watermark.
- ``ocrmypdf --skip-text`` to skip OCR and other processing on any
pages that contain text. Text pages will be copied into the output
PDF without modification.
- ``ocrmypdf --redo-ocr`` to scan the file for any existing OCR
(non-printing text), remove it, and do OCR again. This is one way
to take advantage of improvements in OCR accuracy. Printable vector
text is excluded from OCR, so this can be used on files that contain
a mix of digital and scanned files.
Input file 'filename' is not a valid PDF
----------------------------------------
========================================
OCRmyPDF passes files through qpdf, a program that fixes errors in PDFs, before it tries to work on them. In most cases this happens because the PDF is corrupt and
truncated (incomplete file copying) and not much can be done.
OCRmyPDF checks files with pikepdf, a library that in turn uses libqpdf to fix
errors in PDFs, before it tries to work on them. In most cases this happens
because the PDF is corrupt and truncated (incomplete file copying) and not much
can be done.
You can try rewriting the file with Ghostscript or pdftk:
You can try rewriting the file with Ghostscript:
- ``gs -o output.pdf -dSAFER -sDEVICE=pdfwrite input.pdf``
.. code-block:: bash
- ``pdftk input.pdf cat output output.pdf``
gs -o output.pdf -dSAFER -sDEVICE=pdfwrite input.pdf
Sometimes Acrobat can repair PDFs with its `Preflight tool <https://helpx.adobe.com/acrobat/using/correcting-problem-areas-preflight-tool.html>`_.
``pdftk`` can also rewrite PDFs:
.. code-block:: bash
pdftk input.pdf cat output output.pdf
Sometimes Acrobat can repair PDFs with its `Preflight
tool <https://helpx.adobe.com/acrobat/using/correcting-problem-areas-preflight-tool.html>`__.

BIN
docs/images/logo-social.png Normal file
View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

75
docs/images/logo.svg Normal file
View File

@@ -0,0 +1,75 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg width="100%" height="100%" viewBox="0 0 503 227" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:1.5;">
<g id="svg" transform="matrix(0.965977,0,0,0.807602,0,0)">
<rect x="0" y="0" width="520" height="280" style="fill:white;"/>
<g transform="matrix(1.03522,0,0,1.23823,-69.7528,-83.422)">
<g transform="matrix(1,0,0,1,243.977,20.0703)">
<g id="Page">
<g transform="matrix(0.961773,0,0,1.05962,6.19811,-3.01071)">
<path d="M328.5,97.682C328.5,96.465 327.983,95.296 327.056,94.418C320.026,87.758 289.442,58.78 282.228,51.944C281.251,51.019 279.901,50.496 278.49,50.496C264.493,50.496 188.083,50.496 167.339,50.496C164.468,50.496 162.141,52.609 162.141,55.214C162.141,83.051 162.141,225.565 162.141,253.4C162.141,256.005 164.468,258.117 167.338,258.117C192.242,258.117 299.159,258.117 323.538,258.117C326.278,258.117 328.5,256.101 328.5,253.613C328.5,229.268 328.5,113.896 328.5,97.682Z" style="fill:rgb(253,253,253);stroke:rgb(51,51,51);stroke-width:3.95px;"/>
</g>
<g id="Dog-ear" serif:id="Dog ear" transform="matrix(1,0,0,1,-4,2)">
<path d="M277.072,48.496L277.072,93.848C277.072,95.172 277.598,96.441 278.534,97.377C279.47,98.313 280.739,98.839 282.063,98.839C294.548,98.839 326.141,98.839 326.141,98.839" style="fill:rgb(245,245,245);stroke:rgb(51,51,51);stroke-width:4px;"/>
</g>
</g>
<g transform="matrix(1,0,0,1,-29.6816,-0.395178)">
<g transform="matrix(1.00243,0,0,1.11818,-144.72,-8.80181)">
<path d="M465.73,119.654C465.73,117.605 463.874,115.941 461.588,115.941L310.259,115.941C307.973,115.941 306.117,117.605 306.117,119.654L306.117,183.108C306.117,185.157 307.973,186.821 310.259,186.821L461.588,186.821C463.874,186.821 465.73,185.157 465.73,183.108L465.73,119.654Z" style="fill:rgb(248,0,0);stroke:white;stroke-width:3.77px;"/>
</g>
<g transform="matrix(1.24571,0,0,1.35864,116.812,84.3924)">
<g transform="matrix(64,0,0,64,42.1437,77.6203)">
<path d="M0.084,0L0.084,-0.68L0.297,-0.68C0.371,-0.68 0.434,-0.663 0.487,-0.63C0.54,-0.596 0.566,-0.54 0.566,-0.462C0.566,-0.385 0.538,-0.328 0.481,-0.292C0.424,-0.255 0.36,-0.237 0.288,-0.237L0.213,-0.237L0.213,0L0.084,0ZM0.293,-0.572L0.213,-0.572L0.213,-0.344L0.295,-0.344C0.334,-0.344 0.365,-0.353 0.389,-0.371C0.413,-0.388 0.426,-0.416 0.429,-0.454C0.429,-0.498 0.417,-0.529 0.393,-0.546C0.369,-0.563 0.336,-0.572 0.293,-0.572Z" style="fill:white;fill-rule:nonzero;"/>
</g>
<g transform="matrix(64,0,0,64,79.7117,77.6203)">
<path d="M0.332,0L0.084,0L0.084,-0.68L0.336,-0.68C0.441,-0.68 0.518,-0.648 0.569,-0.585C0.62,-0.522 0.645,-0.441 0.645,-0.344C0.645,-0.239 0.618,-0.155 0.563,-0.093C0.508,-0.031 0.431,0 0.332,0ZM0.337,-0.57L0.213,-0.57L0.213,-0.109L0.33,-0.109C0.385,-0.109 0.429,-0.127 0.462,-0.163C0.495,-0.199 0.511,-0.259 0.511,-0.344C0.511,-0.415 0.497,-0.47 0.469,-0.51C0.441,-0.55 0.397,-0.57 0.337,-0.57Z" style="fill:white;fill-rule:nonzero;"/>
</g>
<g transform="matrix(64,0,0,64,123.424,77.6203)">
<path d="M0.405,-0.288L0.213,-0.288L0.213,0L0.084,0L0.084,-0.68L0.469,-0.68L0.489,-0.578L0.213,-0.578L0.213,-0.389L0.386,-0.389L0.405,-0.288Z" style="fill:white;fill-rule:nonzero;"/>
</g>
</g>
</g>
</g>
<g transform="matrix(1,0,0,1.52217,67.3796,10.7507)">
<rect x="23.501" y="81.3" width="162.305" height="61.77" style="fill:rgb(180,213,255);"/>
</g>
<g transform="matrix(0.967536,0,0,0.961535,5.90498,47.9703)">
<g transform="matrix(90.4804,0,0,90.4804,82.6698,167.705)">
<path d="M0.057,-0.337C0.057,-0.442 0.084,-0.527 0.139,-0.594C0.194,-0.66 0.271,-0.694 0.37,-0.696C0.477,-0.696 0.556,-0.662 0.607,-0.593C0.658,-0.524 0.684,-0.441 0.684,-0.344C0.684,-0.239 0.657,-0.153 0.602,-0.086C0.547,-0.019 0.469,0.014 0.37,0.014C0.264,0.014 0.185,-0.02 0.134,-0.089C0.083,-0.157 0.057,-0.24 0.057,-0.337ZM0.192,-0.338C0.192,-0.267 0.206,-0.208 0.235,-0.163C0.264,-0.118 0.308,-0.095 0.369,-0.095C0.424,-0.095 0.467,-0.115 0.5,-0.156C0.533,-0.197 0.549,-0.259 0.549,-0.344C0.549,-0.415 0.535,-0.473 0.506,-0.518C0.477,-0.563 0.433,-0.586 0.372,-0.586C0.319,-0.586 0.275,-0.564 0.242,-0.519C0.209,-0.474 0.192,-0.414 0.192,-0.338Z" style="fill:rgb(51,51,51);fill-rule:nonzero;"/>
</g>
<g transform="matrix(90.4804,0,0,90.4804,147.906,167.705)">
<path d="M0.505,-0.557C0.473,-0.567 0.448,-0.574 0.429,-0.579C0.41,-0.583 0.388,-0.585 0.361,-0.585C0.307,-0.585 0.265,-0.563 0.236,-0.519C0.207,-0.475 0.192,-0.415 0.192,-0.338C0.192,-0.272 0.204,-0.215 0.229,-0.167C0.254,-0.119 0.295,-0.095 0.353,-0.095C0.382,-0.095 0.409,-0.098 0.434,-0.104C0.459,-0.11 0.481,-0.117 0.502,-0.126L0.551,-0.03C0.525,-0.017 0.494,-0.006 0.457,0.002C0.42,0.01 0.388,0.014 0.36,0.014C0.254,0.014 0.177,-0.02 0.129,-0.088C0.081,-0.156 0.057,-0.239 0.057,-0.337C0.057,-0.442 0.084,-0.527 0.137,-0.594C0.19,-0.661 0.266,-0.694 0.365,-0.694C0.385,-0.694 0.413,-0.691 0.448,-0.684C0.483,-0.677 0.516,-0.666 0.545,-0.65L0.505,-0.557Z" style="fill:rgb(51,51,51);fill-rule:nonzero;"/>
</g>
<g transform="matrix(90.4804,0,0,90.4804,199.751,167.705)">
<path d="M0.293,-0.572L0.213,-0.572L0.213,-0.364L0.295,-0.364C0.334,-0.364 0.366,-0.372 0.391,-0.388C0.416,-0.403 0.429,-0.429 0.429,-0.465C0.429,-0.503 0.417,-0.53 0.393,-0.547C0.369,-0.564 0.336,-0.572 0.293,-0.572ZM0.479,0L0.335,-0.26C0.328,-0.259 0.32,-0.259 0.312,-0.259C0.304,-0.258 0.296,-0.258 0.288,-0.258L0.213,-0.258L0.213,0L0.084,0L0.084,-0.68L0.297,-0.68C0.371,-0.68 0.434,-0.663 0.487,-0.629C0.54,-0.595 0.566,-0.542 0.566,-0.471C0.566,-0.429 0.555,-0.393 0.534,-0.363C0.512,-0.332 0.484,-0.309 0.45,-0.292L0.617,0L0.479,0Z" style="fill:rgb(51,51,51);fill-rule:nonzero;"/>
</g>
</g>
<g transform="matrix(0.916882,0,0,1,121.475,-32.6535)">
<g transform="matrix(86.953,0,0,86.953,152.996,241.878)">
<path d="M0.479,-0.428C0.5,-0.451 0.527,-0.47 0.562,-0.484C0.596,-0.497 0.627,-0.504 0.654,-0.504C0.72,-0.504 0.767,-0.485 0.795,-0.446C0.822,-0.407 0.836,-0.36 0.836,-0.304L0.836,0L0.705,0L0.705,-0.298C0.705,-0.329 0.698,-0.352 0.683,-0.369C0.668,-0.385 0.647,-0.393 0.619,-0.393C0.6,-0.393 0.581,-0.388 0.56,-0.378C0.539,-0.368 0.521,-0.357 0.504,-0.344C0.505,-0.337 0.505,-0.331 0.506,-0.324C0.507,-0.317 0.507,-0.311 0.507,-0.304L0.507,0L0.376,0L0.376,-0.298C0.376,-0.329 0.369,-0.352 0.354,-0.369C0.339,-0.385 0.318,-0.393 0.291,-0.393C0.274,-0.393 0.258,-0.39 0.241,-0.383C0.224,-0.376 0.207,-0.367 0.192,-0.356L0.192,0L0.062,0L0.062,-0.485L0.13,-0.485L0.162,-0.441C0.184,-0.461 0.211,-0.476 0.242,-0.488C0.273,-0.499 0.3,-0.504 0.325,-0.504C0.363,-0.504 0.395,-0.497 0.42,-0.484C0.445,-0.47 0.465,-0.451 0.479,-0.428Z" style="fill:rgb(51,51,51);fill-rule:nonzero;"/>
</g>
<g transform="matrix(86.953,0,0,86.953,228.906,241.878)">
<path d="M0.156,0.023L0.179,-0.034L0.006,-0.467L0.14,-0.485L0.252,-0.191L0.358,-0.485L0.495,-0.485L0.278,0.064C0.263,0.103 0.236,0.137 0.197,0.165C0.158,0.193 0.118,0.212 0.075,0.222L0.029,0.115C0.052,0.105 0.077,0.093 0.104,0.079C0.13,0.064 0.147,0.046 0.156,0.023Z" style="fill:rgb(51,51,51);fill-rule:nonzero;"/>
</g>
</g>
<g id="Selectors" transform="matrix(0.965977,0,0,0.807602,67.3796,67.3718)">
<g id="Right-selector" serif:id="Right selector">
<g transform="matrix(1.03522,0,0,1.23823,2.07044,0)">
<path d="M185.806,161.156L185.806,67.132" style="fill:none;stroke:rgb(76,159,255);stroke-width:4px;stroke-linecap:butt;"/>
</g>
<g transform="matrix(1.03522,0,0,1.23823,161.788,169.469)">
<circle cx="31.523" cy="34.314" r="10.021" style="fill:rgb(76,159,255);stroke:rgb(76,159,255);stroke-width:4px;stroke-linecap:butt;"/>
</g>
</g>
<g id="Left-selector" serif:id="Left selector">
<g transform="matrix(1.03522,0,0,1.23823,-170.092,0)">
<path d="M185.806,161.156L185.806,67.132" style="fill:none;stroke:rgb(76,159,255);stroke-width:4px;stroke-linecap:butt;"/>
</g>
<g transform="matrix(1.03522,0,0,1.23823,-10.3742,28.2274)">
<circle cx="31.523" cy="34.314" r="10.021" style="fill:rgb(76,159,255);stroke:rgb(76,159,255);stroke-width:4px;stroke-linecap:butt;"/>
</g>
</g>
</g>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 9.3 KiB

View File

@@ -1,15 +1,12 @@
.. ocrmypdf documentation master file, created by
sphinx-quickstart on Sun Sep 4 14:29:43 2016.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
OCRmyPDF documentation
======================
OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to
be searched.
OCRmyPDF adds an optical character recognition (OCR) text layer to scanned PDF
files, allowing them to be searched.
PDF is the best format for storing and exchanging scanned documents. Unfortunately, PDFs can be difficult to modify. OCRmyPDF makes it easy to apply image processing and OCR to existing PDFs.
PDF is the best format for storing and exchanging scanned documents.
Unfortunately, PDFs can be difficult to modify. OCRmyPDF makes it easy to apply
image processing and OCR to existing PDFs.
.. toctree::
:maxdepth: 1
@@ -17,6 +14,7 @@ PDF is the best format for storing and exchanging scanned documents. Unfortunat
introduction
release_notes
installation
optimizer
languages
jbig2
@@ -28,9 +26,18 @@ PDF is the best format for storing and exchanging scanned documents. Unfortunat
docker
advanced
batch
security
performance
pdfsecurity
errors
.. toctree::
:caption: Developers
:maxdepth: 2
api
plugins
apiref
contributing
Indices and tables
==================

View File

@@ -1,3 +1,4 @@
===================
Installing OCRmyPDF
===================
@@ -7,21 +8,38 @@ Installing OCRmyPDF
|latest|
The easiest way to install OCRmyPDF is to follow the steps for your operating
system/platform, although sometimes this version may be out of date.
system/platform. This version may be out of date, however.
If you want to use the latest version of OCRmyPDF, your best bet is to install
the most recent version your platform provides, and then upgrade that version by
installing the Python binary wheels.
These platforms have one-liner installs:
+-------------------------------+-------------------------------+
| Debian, Ubuntu | ``apt install ocrmypdf`` |
+-------------------------------+-------------------------------+
| Windows Subsystem for Linux | ``apt install ocrmypdf`` |
+-------------------------------+-------------------------------+
| Fedora | ``dnf install ocrmypdf`` |
+-------------------------------+-------------------------------+
| macOS | ``brew install ocrmypdf`` |
+-------------------------------+-------------------------------+
| LinuxBrew | ``brew install ocrmypdf`` |
+-------------------------------+-------------------------------+
| FreeBSD | ``pkg install py37-ocrmypdf`` |
+-------------------------------+-------------------------------+
| Conda (WSL, macOS, Linux) | ``conda install ocrmypdf`` |
+-------------------------------+-------------------------------+
More detailed procedures are outlined below. If you want to do a manual
install, or install a more recent version than your platform provides, read on.
.. contents:: Platform-specific steps
:depth: 2
:local:
Installing on Linux
-------------------
===================
Debian and Ubuntu 16.10 or newer
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Debian and Ubuntu 18.04 or newer
--------------------------------
.. |deb-stable| image:: https://repology.org/badge/version-for-repo/debian_stable/ocrmypdf.svg
:alt: Debian 9 stable ("stretch")
@@ -32,118 +50,178 @@ Debian and Ubuntu 16.10 or newer
.. |deb-unstable| image:: https://repology.org/badge/version-for-repo/debian_unstable/ocrmypdf.svg
:alt: Debian unstable
.. |ubu-1710| image:: https://repology.org/badge/version-for-repo/ubuntu_17_10/ocrmypdf.svg
:alt: Ubuntu 17.10
.. |ubu-1804| image:: https://repology.org/badge/version-for-repo/ubuntu_18_04/ocrmypdf.svg
:alt: Ubuntu 18.04 LTS
.. |ubu-1810| image:: https://repology.org/badge/version-for-repo/ubuntu_18_10/ocrmypdf.svg
:alt: Ubuntu 18.10
.. |ubu-2004| image:: https://repology.org/badge/version-for-repo/ubuntu_20_04/ocrmypdf.svg
:alt: Ubuntu 20.04 LTS
.. |ubu-2010| image:: https://repology.org/badge/version-for-repo/ubuntu_20_10/ocrmypdf.svg
:alt: Ubuntu 20.10
+-------------------------------------------+
| **OCRmyPDF versions in Debian & Ubuntu** |
+-------------------------------------------+
| |latest| |
+-------------------------------------------+
| |deb-stable| |deb-testing| |deb-unstable| |
+-------------------------------------------+
| |ubu-1710| |ubu-1804| |ubu-1810| |
+-------------------------------------------+
+-----------------------------------------------+
| **OCRmyPDF versions in Debian & Ubuntu** |
+-----------------------------------------------+
| |latest| |
+-----------------------------------------------+
| |deb-stable| |deb-testing| |deb-unstable| |
+-----------------------------------------------+
| |ubu-1804| |ubu-2004| |ubu-2010| |
+-----------------------------------------------+
Users of Debian 9 ("stretch") or later or Ubuntu 16.10 or later may simply
Users of Debian 9 ("stretch") or later, or Ubuntu 18.04 or later, including users
of Windows Subsystem for Linux, may simply
.. code-block:: bash
apt-get install ocrmypdf
As indicated in the table above, Debian and Ubuntu releases may lag behind the latest version. If the version available for your platform is out of date, you could opt to install the latest version from source. See `Installing HEAD revision from sources`_.
As indicated in the table above, Debian and Ubuntu releases may lag
behind the latest version. If the version available for your platform is
out of date, you could opt to install the latest version from source.
See `Installing HEAD revision from
sources <#installing-head-revision-from-sources>`__. Ubuntu 16.10 to 17.10
inclusive also had ocrmypdf, but these versions are end of life.
For full details on version availability for your platform, check the `Debian Package Tracker <https://tracker.debian.org/pkg/ocrmypdf>`_ or `Ubuntu launchpad.net <https://launchpad.net/ocrmypdf>`_.
For full details on version availability for your platform, check the
`Debian Package Tracker <https://tracker.debian.org/pkg/ocrmypdf>`__ or
`Ubuntu launchpad.net <https://launchpad.net/ocrmypdf>`__.
.. note::
OCRmyPDF for Debian and Ubuntu currently omit the JBIG2 encoder. OCRmyPDF works fine without it but will produce larger output files. If you build jbig2enc from source, ocrmypdf 7.0.0 and later will automatically detect it (specifically the ``jbig2`` binary) on the ``PATH``. To add JBIG2 encoding, see :ref:`jbig2`.
OCRmyPDF for Debian and Ubuntu currently omits the JBIG2 encoder.
OCRmyPDF works fine without it but will produce larger output files.
If you build jbig2enc from source, ocrmypdf 7.0.0 and later will
automatically detect it (specifically the ``jbig2`` binary) on the
``PATH``. To add JBIG2 encoding, see :ref:`jbig2`.
Fedora 29 or newer
^^^^^^^^^^^^^^^^^^
Fedora
------
.. |fedora-29| image:: https://repology.org/badge/version-for-repo/fedora29/ocrmypdf.svg
:alt: Fedora 29
.. |fedora-32| image:: https://repology.org/badge/version-for-repo/fedora_32/ocrmypdf.svg
:alt: Fedora 32
.. |fedora-33| image:: https://repology.org/badge/version-for-repo/fedora_33/ocrmypdf.svg
:alt: Fedora 33
.. |fedora-rawhide| image:: https://repology.org/badge/version-for-repo/fedora_rawhide/ocrmypdf.svg
:alt: Fedora Rawhide
+-----------------------------------------------+
| **OCRmyPDF version** |
+-----------------------------------------------+
| |latest| |
+-----------------------------------------------+
| |fedora-32| |fedora-33| |fedora-rawhide| |
+-----------------------------------------------+
+------------------------------+
| **OCRmyPDF version** |
+------------------------------+
| |latest| |
+------------------------------+
| |fedora-29| |fedora-rawhide| |
+------------------------------+
Users of Fedora 29 later may simply
Users of Fedora 29 or later may simply
.. code-block:: bash
dnf install ocrmypdf
For full details on version availability, check the `Fedora Package Tracker
<https://apps.fedoraproject.org/packages/ocrmypdf>`_.
For full details on version availability, check the `Fedora Package
Tracker <https://apps.fedoraproject.org/packages/ocrmypdf>`__.
If the version available for your platform is out of date, you could opt to
install the latest version from source. See `Installing HEAD revision from
sources`_.
If the version available for your platform is out of date, you could opt
to install the latest version from source. See `Installing HEAD revision
from sources <#installing-head-revision-from-sources>`__.
.. note::
OCRmyPDF for Fedora currently omits the JBIG2 encoder due to patent issues.
OCRmyPDF works fine without it but will produce larger output files. If you
build jbig2enc from source, ocrmypdf 7.0.0 and later will automatically
detect it on the ``PATH``. To add JBIG2 encoding, see `Installing the JBIG2
encoder <jbig2>`_.
OCRmyPDF for Fedora currently omits the JBIG2 encoder due to patent
issues. OCRmyPDF works fine without it but will produce larger output
files. If you build jbig2enc from source, ocrmypdf 7.0.0 and later
will automatically detect it on the ``PATH``. To add JBIG2 encoding,
see `Installing the JBIG2 encoder <jbig2>`__.
Installing the latest version on Ubuntu 18.04 LTS
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. _ubuntu-lts-latest:
Ubuntu 18.04 includes ocrmypdf 6.1.2. To install a more recent version, first
install the system version to get most of the dependencies:
Installing the latest version on Ubuntu 20.04 LTS
-------------------------------------------------
Ubuntu 20.04 includes ocrmypdf 9.6.0 - you can install that with ``apt``. To
install a more recent version, uninstall the system-provided version of
ocrmypdf, and install the following dependencies:
.. code-block:: bash
sudo apt-get update
sudo apt-get install \
ocrmypdf \
python3-pip
sudo apt-get -y remove ocrmypdf # remove system ocrmypdf, if installed
sudo apt-get -y update
sudo apt-get -y install \
ghostscript \
icc-profiles-free \
liblept5 \
libxml2 \
pngquant \
python3-pip \
tesseract-ocr \
zlib1g
There are a few dependency changes between ocrmypdf 6.1.2 and 7.x. Let's get
these, too.
To install ocrmypdf for the system:
.. code-block:: bash
sudo apt-get install \
libexempi3 \
pngquant
pip3 install ocrmypdf
Then install the most recent ocrmypdf for the local user and set the user's ``PATH`` to check for the user's Python packages.
To install for the current user only:
.. code-block:: bash
export PATH=$HOME/.local/bin:$PATH
pip3 install --user ocrmypdf
Ubuntu 18.04 LTS
----------------
Ubuntu 18.04 includes ocrmypdf 6.1.2 - you can install that with ``apt``, but
it is quite old now. To install a more recent version, uninstall the old version
of ocrmypdf, and install the following dependencies:
.. code-block:: bash
sudo apt-get -y remove ocrmypdf
sudo apt-get -y update
sudo apt-get -y install \
ghostscript \
icc-profiles-free \
liblept5 \
libxml2 \
pngquant \
python3-cffi \
python3-distutils \
python3-pkg-resources \
python3-reportlab \
qpdf \
tesseract-ocr \
zlib1g \
unpaper
We will need a newer version of ``pip`` than was available for Ubuntu 18.04:
.. code-block:: bash
wget https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py
Then install the most recent ocrmypdf for the local user and set the
user's ``PATH`` to check for the user's Python packages.
.. code-block:: bash
export PATH=$HOME/.local/bin:$PATH
python3 -m pip install --user ocrmypdf
To add JBIG2 encoding, see :ref:`jbig2`.
Ubuntu 16.04 LTS
^^^^^^^^^^^^^^^^
----------------
No package is available for Ubuntu 16.04. OCRmyPDF 8.0 and newer require Python
3.6. Ubuntu 16.04 ships Python 3.5, but you can install Python 3.6 on it. Or,
you can skip Python 3.6 and install OCRmyPDF 7.x or older - for that procedure,
please see the installation documentation for the version of OCRmyPDF you plan
to use.
No package is available for Ubuntu 16.04. OCRmyPDF 8.0 and newer require
Python 3.6. Ubuntu 16.04 ships Python 3.5, but you can install Python
3.6 on it. Or, you can skip Python 3.6 and install OCRmyPDF 7.x or older
- for that procedure, please see the installation documentation for the
version of OCRmyPDF you plan to use.
**Install system packages for OCRmyPDF**
@@ -165,13 +243,13 @@ to use.
tesseract-ocr \
unpaper
This will install a Python 3.6 binary at ``/usr/bin/python3.6`` alongside the
system's Python 3.5. Do not remove the system Python. This will also install
Tesseract 4.0 from a PPA, since the version available in Ubuntu 16.04 is too old
for OCRmyPDF.
This will install a Python 3.6 binary at ``/usr/bin/python3.6``
alongside the system's Python 3.5. Do not remove the system Python. This
will also install Tesseract 4.0 from a PPA, since the version available
in Ubuntu 16.04 is too old for OCRmyPDF.
Now install pip for Python 3.6. This will install the Python 3.6 version of
``pip`` at ``/usr/local/bin/pip``.
Now install pip for Python 3.6. This will install the Python 3.6 version
of ``pip`` at ``/usr/local/bin/pip``.
.. code-block:: bash
@@ -179,8 +257,9 @@ Now install pip for Python 3.6. This will install the Python 3.6 version of
**Install OCRmyPDF**
OCRmyPDF requires the locale to be set for UTF-8. **On some minimal Ubuntu
installations systems**, it may be necessary to set the locale.
OCRmyPDF requires the locale to be set for UTF-8. **On some minimal
Ubuntu installations**, such as the Ubuntu 16.04 Docker images, it may be
necessary to set the locale.
.. code-block:: bash
@@ -194,111 +273,161 @@ environment variable contains ``$HOME/.local/bin``.
.. code-block:: bash
export PATH=$HOME/.local/bin:$PATH
pip3 install --user ocrmypdf
pip3.6 install --user ocrmypdf
To add JBIG2 encoding, see :ref:`jbig2`.
Ubuntu 14.04 LTS
^^^^^^^^^^^^^^^^
Installing on Ubuntu 14.04 LTS (trusty) is more difficult than some other
options, because of its age. Several backports are required. For explanations of
some steps of this procedure, see the similar steps for Ubuntu 16.04.
Install system dependencies:
.. code-block:: bash
sudo apt-get update
sudo apt-get install \
software-properties-common python-software-properties \
zlib1g-dev \
libexempi3 \
libjpeg-dev \
libffi-dev \
pngquant \
qpdf
We will need backports of Ghostscript 9.16, libav-11 (for unpaper 6.1),
Tesseract 4.00 (alpha), and Python 3.6. This will replace Ghostscript and
Tesseract 3.x on your system. Python 3.6 will be installed alongside the system
Python 3.4.
If you prefer to not modify your system in this matter, consider using a Docker
container.
.. code-block:: bash
sudo add-apt-repository ppa:vshn/ghostscript -y
sudo add-apt-repository ppa:heyarje/libav-11 -y
sudo add-apt-repository ppa:alex-p/tesseract-ocr -y
sudo add-apt-repository ppa:jonathonf/python-3.6 -y
sudo apt-get update
sudo apt-get install \
python3.6-dev \
ghostscript \
tesseract-ocr \
tesseract-ocr-eng \
libavformat56 libavcodec56 libavutil54 \
wget
Now we need to install ``pip`` and let it install ocrmypdf:
.. code-block:: bash
curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.6 && python3.6 -m easy_install pip
pip3.6 install ocrmypdf
These installation instructions omit the optional dependency ``unpaper``, which is only available at version 0.4.2 in Ubuntu 14.04. The author could not find a backport of ``unpaper``, and created a .deb package to do the job of installing unpaper 6.1 (for x86 64-bit only):
.. code-block:: bash
wget -q 'https://www.dropbox.com/s/vaq0kbwi6e6au80/unpaper_6.1-1.deb?raw=1' -O unpaper_6.1-1.deb
sudo dpkg -i unpaper_6.1-1.deb
To add JBIG2 encoding, see :ref:`jbig2`.
ArchLinux (AUR)
^^^^^^^^^^^^^^^
Arch Linux (AUR)
----------------
.. image:: https://repology.org/badge/version-for-repo/aur/ocrmypdf.svg
:alt: ArchLinux
:target: https://repology.org/metapackage/ocrmypdf
There is an `ArchLinux User Repository package for ocrmypdf <https://aur.archlinux.org/packages/ocrmypdf/>`_. You can use the following command.
There is an `Arch User Repository (AUR) package for OCRmyPDF
<https://aur.archlinux.org/packages/ocrmypdf/>`__.
Installing AUR packages as root is not allowed, so you must first `setup a
non-root user
<https://wiki.archlinux.org/index.php/Users_and_groups#User_management>`__ and
`configure sudo <https://wiki.archlinux.org/index.php/Sudo#Configuration>`__.
The standard Docker image, ``archlinux/base:latest``, does **not** have a
non-root user configured, so users of that image must follow these guides. If
you are using a VM image, such as `the official Vagrant image
<https://app.vagrantup.com/archlinux/boxes/archlinux>`__, this work may already
be completed for you.
Next you should install the `base-devel package group
<https://www.archlinux.org/groups/x86_64/base-devel/>`__. This includes the
standard tooling needed to build packages, such as a compiler and binary tools.
.. code-block:: bash
yaourt -S ocrmypdf
sudo pacman -S base-devel
If you have any difficulties with installation, check the repository package page.
Now you are ready to install the OCRmyPDF package.
.. code-block:: bash
curl -O https://aur.archlinux.org/cgit/aur.git/snapshot/ocrmypdf.tar.gz
tar xvzf ocrmypdf.tar.gz
cd ocrmypdf
makepkg -sri
At this point you will have a working install of OCRmyPDF, but the Tesseract
install won't include any OCR language data. You can install `the
tesseract-data package group
<https://www.archlinux.org/groups/any/tesseract-data/>`__ to add all supported
languages, or use that package listing to identify the appropriate package for
your desired language.
.. code-block:: bash
sudo pacman -S tesseract-data-eng
As an alternative to this manual procedure, consider using an `AUR helper
<https://wiki.archlinux.org/index.php/AUR_helpers>`__. Such a tool will
automatically fetch, build and install the AUR package, resolve dependencies
(including dependencies on AUR packages), and ease the upgrade procedure.
If you have any difficulties with installation, check the repository package
page.
.. note::
The OCRmyPDF AUR package currently omits the JBIG2 encoder. OCRmyPDF works
fine without it but will produce larger output files. The encoder is
available from `the jbig2enc-git AUR package
<https://aur.archlinux.org/packages/jbig2enc-git/>`__ and may be installed
using the same series of steps as for the installation of the OCRmyPDF AUR
package. Alternatively, it may be built manually from source following the
instructions in `Installing the JBIG2 encoder <jbig2>`__. If JBIG2 is
installed, OCRmyPDF 7.0.0 and later will automatically detect it.
Alpine Linux
------------
.. image:: https://repology.org/badge/version-for-repo/alpine_edge/ocrmypdf.svg
:alt: Alpine Linux
:target: https://repology.org/metapackage/ocrmypdf
To install OCRmyPDF for Alpine Linux:
.. code-block:: bash
apk add ocrmypdf
Mageia 7
--------
There is no OS-level packaging available for Mageia, so you must install the
dependencies:
.. code-block:: bash
# As root user
urpmi.update -a
urpmi \
ghostscript \
icc-profiles-openicc \
jbig2dec \
lib64leptonica5 \
pngquant \
python3-pip \
python3-cffi \
python3-distutils-extra \
python3-pkg-resources \
python3-reportlab \
qpdf \
tesseract \
tesseract-osd \
tesseract-eng \
tesseract-fra
To install ocrmypdf for the system:
.. code-block:: bash
# As root user
pip3 install ocrmypdf
ldconfig
Or, to install for the current user only:
.. code-block:: bash
export PATH=$HOME/.local/bin:$PATH
pip3 install --user ocrmypdf
Other Linux packages
^^^^^^^^^^^^^^^^^^^^
--------------------
See the `Repology <https://repology.org/metapackage/ocrmypdf/versions>`_ page.
See the
`Repology <https://repology.org/metapackage/ocrmypdf/versions>`__ page.
In general, first install the OCRmyPDF package for your system, then optionally use the procedure `Installing with Python pip`_ to install a more recent version.
In general, first install the OCRmyPDF package for your system, then
optionally use the procedure `Installing with Python
pip <#installing-with-python-pip>`__ to install a more recent version.
Installing on macOS
-------------------
===================
Homebrew
^^^^^^^^
--------
.. image:: https://img.shields.io/homebrew/v/ocrmypdf.svg
:alt: homebrew
:target: http://brewformulas.org/Ocrmypdf
OCRmyPDF is now a standard `Homebrew <https://brew.sh>`_ formula. To install on macOS:
OCRmyPDF is now a standard `Homebrew <https://brew.sh>`__ formula. To
install on macOS:
.. code-block:: bash
brew install ocrmypdf
This will include only the English language pack. If you need other languages you can optionally install them all:
This will include only the English language pack. If you need other
languages you can optionally install them all:
.. code-block:: bash
@@ -306,18 +435,26 @@ This will include only the English language pack. If you need other languages yo
.. note::
Users who previously installed OCRmyPDF on macOS using ``pip install ocrmypdf`` should remove the pip version (``pip3 uninstall ocrmypdf``) before switching to the Homebrew version.
Users who previously installed OCRmyPDF on macOS using
``pip install ocrmypdf`` should remove the pip version
(``pip3 uninstall ocrmypdf``) before switching to the Homebrew
version.
.. note::
Users who previously installed OCRmyPDF from the private tap should switch to the mainline version (``brew untap jbarlow83/ocrmypdf``) and install from there.
Users who previously installed OCRmyPDF from the private tap should
switch to the mainline version (``brew untap jbarlow83/ocrmypdf``)
and install from there.
Manual installation on macOS
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
----------------------------
These instructions probably work on all macOS supported by Homebrew.
These instructions probably work on all macOS supported by Homebrew, and are
for installing a more current version of OCRmyPDF than is available from
Homebrew. Note that the Homebrew versions usually track the release versions
fairly closely.
If it's not already present, `install Homebrew <http://brew.sh/>`_.
If it's not already present, `install Homebrew <http://brew.sh/>`__.
Update Homebrew:
@@ -325,20 +462,18 @@ Update Homebrew:
brew update
Install or upgrade the required Homebrew packages, if any are missing. To do this, download the ``Brewfile`` that lists all of the dependencies to the current directory, and run ``brew bundle`` to process them (installing or upgrading as needed). ``Brewfile`` is a plain text file.
Install or upgrade the required Homebrew packages, if any are missing.
To do this, use ``brew edit ocrmypdf`` to obtain a recent list of Homebrew
dependencies. You could also check the ``.workflows/build.yml``.
.. code-block:: bash
wget https://github.com/jbarlow83/OCRmyPDF/raw/master/.travis/Brewfile
brew bundle
This will include the English, French, German and Spanish language packs. If you need other languages you can optionally install them all:
This will include the English, French, German and Spanish language
packs. If you need other languages you can optionally install them all:
.. _macos-all-languages:
.. code-block:: bash
.. code-block:: bash
brew install tesseract --with-all-languages # Option 2: for all language packs
brew install tesseract-lang # Option 2: for all language packs
Update the homebrew pip:
@@ -364,94 +499,285 @@ The command line program should now be available:
ocrmypdf --help
Installing the Docker image
Installing on Windows
=====================
Native Windows
--------------
.. note::
Administrator privileges will be required for some of these steps.
You must install the following for Windows:
* Python 3.7 (64-bit) or later
* Tesseract 4.0 or later
* Ghostscript 9.50 or later
Using the `Chocolatey <https://chocolatey.org/>`_ package manager, install the
following when running in an Administrator command prompt:
* ``choco install python3``
* ``choco install --pre tesseract``
* ``choco install ghostscript``
* ``choco install pngquant`` (optional)
The commands above will install Python 3.x (latest version), Tesseract, Ghostscript
and pngquant. Chocolatey may also need to install the Windows Visual C++ Runtime
DLLs or other Windows patches, and may require a reboot.
You may then use ``pip`` to install ocrmypdf (this can be performed by a user or
Administrator):
* ``pip install ocrmypdf``
Chocolatey automatically selects appropriate versions of these applications. If you
are installing them manually, please install 64-bit versions of all applications for
64-bit Windows, or 32-bit versions of all applications for 32-bit Windows. Mixing
the "bitness" of these programs will lead to errors.
OCRmyPDF will check the Windows Registry and standard locations in your Program Files
for third party software it needs (specifically, Tesseract and Ghostscript). To
override the versions OCRmyPDF selects, you can modify the ``PATH`` environment
variable. `Follow these directions <https://www.computerhope.com/issues/ch000549.htm#dospath>`_
to change the PATH.
.. warning::
As of early 2021, users have reported problems with the Microsoft Store version of
Python and OCRmyPDF. These issues affect many other third party Python packages.
Please download Python from Python.org or Chocolatey instead, and do not use the
Microsoft Store version.
Windows Subsystem for Linux
---------------------------
For some users, installing the Docker image will be easier than installing all of OCRmyPDF's dependencies. For Windows, it is the only option.
#. Install Ubuntu 18.04 for Windows Subsystem for Linux, if not already installed.
#. Follow the procedure to install :ref:`OCRmyPDF on Ubuntu 18.04 <ubuntu-lts-latest>`.
#. Open the Windows command prompt and create a symlink:
See `OCRmyPDF Docker Image <docker>`_ for more information.
.. code-block:: powershell
Installing on Windows
---------------------
wsl sudo ln -s /home/$USER/.local/bin/ocrmypdf /usr/local/bin/ocrmypdf
Direct installation on Windows is not possible. `Install the Docker <docker-install>`_ container as described above. Ensure that your command prompt can run the docker "hello world" container.
Then confirm that the expected version from PyPI (|latest|) is installed:
It would probably not be too difficult to port on Windows. The main reason this has been avoided is the difficulty of packaging and installing the various non-Python dependencies: Tesseract, QPDF, Ghostscript, Leptonica. Pull requests to add or improve Windows support would be quite welcome.
.. code-block:: powershell
wsl ocrmypdf --version
You can then run OCRmyPDF in the Windows command prompt or Powershell, prefixing
``wsl``, and call it from Windows programs or batch files.
Cygwin64
--------
First install the following prerequisite Cygwin packages using ``setup-x86_64.exe``::
python36 (or later)
python3?-devel
python3?-pip
python3?-lxml
python3?-imaging
(where 3? means match the version of python3 you installed)
gcc-g++
ghostscript (<=9.50 or >=9.52-2 see note below)
libexempi3
libexempi-devel
libffi6
libffi-devel
pngquant
qpdf
libqpdf-devel
tesseract-ocr
tesseract-ocr-devel
.. note::
The Cygwin package for Ghostscript in versions 9.52 and
9.52-1 contained a bug that caused an exception to occur when
ocrmypdf invoked gs. Make sure you have either 9.50 (or earlier)
or 9.52-2 (or later).
Then open a Cygwin terminal (i.e. ``mintty``) and run the following commands. Note
that if you are using the version of ``pip`` that was installed with the Cygwin
Python package, the command name will be ``pip3``. If you have since updated
``pip`` (with, for instance ``pip3 install --upgrade pip``) then the command is
likely just ``pip`` instead of ``pip3``:
.. code-block:: bash
pip3 install wheel
pip3 install ocrmypdf
The optional dependency "unpaper" is currently not available under Cygwin.
Without it, certain options such as ``--clean`` will produce an error message.
However, the OCR-to-text-layer functionality is available.
Docker
------
You can also :ref:`Install the Docker <docker>` container on Windows. Ensure that
your command prompt can run the docker "hello world" container.
Installing on FreeBSD
=====================
.. image:: https://repology.org/badge/version-for-repo/freebsd/python:ocrmypdf.svg
:alt: FreeBSD
:target: https://repology.org/project/python:ocrmypdf/versions
FreeBSD 11.3, 12.0, 12.1-RELEASE and 13.0-CURRENT are supported. Other
versions likely work but have not been tested.
.. code-block:: bash
pkg install py37-ocrmypdf
To install a more recent version, you could attempt to first install the system
version with ``pkg``, then use ``pip install --user ocrmypdf``.
Installing the Docker image
===========================
For some users, installing the Docker image will be easier than
installing all of OCRmyPDF's dependencies.
See :ref:`docker` for more information.
Installing with Python pip
--------------------------
==========================
OCRmyPDF is delivered by PyPI because it is a convenient way to install the latest version. However, PyPI and ``pip`` cannot address the fact that ``ocrmypdf`` depends on certain non-Python system libraries and programs being instsalled.
OCRmyPDF is delivered by PyPI because it is a convenient way to install
the latest version. However, PyPI and ``pip`` cannot address the fact
that ``ocrmypdf`` depends on certain non-Python system libraries and
programs being installed.
For best results, first install `your platform's version <https://repology.org/metapackage/ocrmypdf/versions>`_ of ``ocrmypdf``, using the instructions elsewhere in this document. Then you can use ``pip`` to get the latest version if your platform version is out of date. Chances are that this will satisfy most dependencies.
.. warning::
Debian and Ubuntu users: unfortunately, Debian and Ubuntu customize
Python in non-standard ways, and the nature of these customizations
varies from release to release. This can make for a frustrating
user experience. The instructions below work on almost all platforms that
have Python installed, except for Debian and Ubuntu, where you may need
to take additional steps. For best results on Debian and Ubuntu, use the
``apt`` packages; or if these are too old, run
``apt install python3-pip python3-venv``, create a virtual environment,
and install OCRmyPDF in that environment.
`See here for more information on Debian-Python issues
<https://gist.github.com/tiran/2dec9e03c6f901814f6d1e8dad09528e>`__.
For best results, first install `your platform's
version <https://repology.org/metapackage/ocrmypdf/versions>`__ of
``ocrmypdf``, using the instructions elsewhere in this document. Then
you can use ``pip`` to get the latest version if your platform version
is out of date. Chances are that this will satisfy most dependencies.
Use ``ocrmypdf --version`` to confirm what version was installed.
Then you can install the latest OCRmyPDF from the Python wheels. First try:
Then you can install the latest OCRmyPDF from the Python wheels. First
try:
.. code-block:: bash
pip3 install --user ocrmypdf
You should then be able to run ``ocrmypdf --version`` and see that the latest version was located.
You should then be able to run ``ocrmypdf --version`` and see that the
latest version was located.
Since ``pip3 install --user`` does not work correctly on some platforms, notably Ubuntu 16.04 and older, and the Homebrew version of Python, instead use this for a system wide installation:
Since ``pip3 install --user`` does not work correctly on some platforms,
notably Ubuntu 16.04 and older, and the Homebrew version of Python,
instead use this for a system wide installation:
.. code-block:: bash
pip3 install ocrmypdf
.. note::
AArch64 (ARM64) users: this process will be difficult because most
Python packages are not available as binary wheels for your platform.
You're probably better off using a platform install on Debian, Ubuntu,
or Fedora.
Requirements for pip and HEAD install
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-------------------------------------
OCRmyPDF currently requires these external programs and libraries to be installed, and must be satisfied using the operating system package manager. ``pip`` cannot provide them.
OCRmyPDF currently requires these external programs and libraries to be
installed. These requirements must be satisfied using the operating system
package manager; ``pip`` cannot provide them.
- Python 3.6 or newer
- Ghostscript 9.15 or newer
- qpdf 8.1.0 or newer
- Tesseract 4.0.0-alpha or newer
- Python 3.6 or newer
- Ghostscript 9.15 or newer
- qpdf 8.1.0 or newer
- Tesseract 4.0.0-beta or newer
As of ocrmypdf 7.2.1, the following versions are recommended:
- Python 3.7
- Ghostscript 9.23 or newer
- qpdf 8.2.1
- Tesseract 4.0.0 or newer
- jbig2enc 0.29 or newer
- pngquant 2.5 or newer
- unpaper 6.1
- Python 3.7 or 3.8
- Ghostscript 9.23 or newer
- qpdf 8.2.1
- Tesseract 4.0.0 or newer
- jbig2enc 0.29 or newer
- pngquant 2.5 or newer
- unpaper 6.1
jbig2enc, pngquant, and unpaper are optional. If missing certain features are disabled. OCRmyPDF will discover them as soon as they are available.
jbig2enc, pngquant, and unpaper are optional. If missing, certain
features are disabled. OCRmyPDF will discover them as soon as they are
available.
**jbig2enc**, if present, will be used to optimize the encoding of monochrome images. This can significantly reduce the file size of the output file. It is not required. `jbig2enc <https://github.com/agl/jbig2enc>`_ is not generally available for Ubuntu or Debian due to lingering concerns about patent issues, but can easily be built from source. To add JBIG2 encoding, see :ref:`jbig2`.
**jbig2enc**, if present, will be used to optimize the encoding of
monochrome images. This can significantly reduce the file size of the
output file. It is not required.
`jbig2enc <https://github.com/agl/jbig2enc>`__ is not generally
available for Ubuntu or Debian due to lingering concerns about patent
issues, but can easily be built from source. To add JBIG2 encoding, see
:ref:`jbig2`.
**pngquant**, if present, is optionally used to optimize the encoding of PNG-style images in PDFs (actually, any that are that losslessly encoded) by lossily quantizing to a smaller color palette. It is only activated then the ``--optimize`` argument is ``2`` or ``3``.
**pngquant**, if present, is optionally used to optimize the encoding of
PNG-style images in PDFs (actually, any that are losslessly
encoded) by lossily quantizing to a smaller color palette. It is only
activated when the ``--optimize`` argument is ``2`` or ``3``.
**unpaper**, if present, enables the ``--clean`` and ``--clean-final`` command line options.
These are in addition to the Python packaging dependencies, meaning that unfortunately, the ``pip install`` command cannot satisfy all of them.
**unpaper**, if present, enables the ``--clean`` and ``--clean-final``
command line options.
These are in addition to the Python packaging dependencies, meaning that
unfortunately, the ``pip install`` command cannot satisfy all of them.
Installing HEAD revision from sources
-------------------------------------
=====================================
If you have ``git`` and Python 3.6 or newer installed, you can install from source. When the ``pip`` installer runs, it will alert you if dependencies are missing.
If you have ``git`` and Python 3.6 or newer installed, you can install
from source. When the ``pip`` installer runs, it will alert you if
dependencies are missing.
If you prefer to build every from source, you will need to `build pikepdf from source <https://pikepdf.readthedocs.io/en/latest/installation.html#building-from-source>`_. First ensure you can build and install pikepdf.
If you prefer to build everything from source, you will need to `build
pikepdf from
source <https://pikepdf.readthedocs.io/en/latest/installation.html#building-from-source>`__.
First ensure you can build and install pikepdf.
To install the HEAD revision from sources in the current Python 3 environment:
To install the HEAD revision from sources in the current Python 3
environment:
.. code-block:: bash
pip3 install git+https://github.com/jbarlow83/OCRmyPDF.git
Or, to install in `development mode <https://pythonhosted.org/setuptools/setuptools.html#development-mode>`_, allowing customization of OCRmyPDF, use the ``-e`` flag:
Or, to install in `development
mode <https://pythonhosted.org/setuptools/setuptools.html#development-mode>`__,
allowing customization of OCRmyPDF, use the ``-e`` flag:
.. code-block:: bash
pip3 install -e git+https://github.com/jbarlow83/OCRmyPDF.git
You may find it easiest to install in a virtual environment, rather than system-wide:
You may find it easiest to install in a virtual environment, rather than
system-wide:
.. code-block:: bash
@@ -461,8 +787,8 @@ You may find it easiest to install in a virtual environment, rather than system-
cd OCRmyPDF
pip3 install .
However, ``ocrmypdf`` will only be accessible on the system PATH
when you activate the virtual environment.
However, ``ocrmypdf`` will only be accessible on the system PATH when
you activate the virtual environment.
To run the program:
@@ -476,7 +802,7 @@ dependencies. Older version than the ones mentioned in the release notes
are likely not to be compatible to OCRmyPDF.
For development
^^^^^^^^^^^^^^^
---------------
To install all of the development and test requirements:
@@ -492,15 +818,17 @@ To install all of the development and test requirements:
To add JBIG2 encoding, see :ref:`jbig2`.
Shell completions
-----------------
=================
Completions for ``bash`` and ``fish`` are available in the project's
``misc/completion`` folder. The ``bash`` completions are likely ``zsh``
compatible but this has not been confirmed. Package maintainers, please install
these at the appropriate locations for your system.
compatible but this has not been confirmed. Package maintainers, please
install these at the appropriate locations for your system.
To manually install the ``bash`` completion, copy ``misc/completion/ocrmypdf.bash`` to
``/etc/bash_completion.d/ocrmypdf`` (rename the file).
To manually install the ``bash`` completion, copy
``misc/completion/ocrmypdf.bash`` to ``/etc/bash_completion.d/ocrmypdf``
(rename the file).
To manually install the ``fish`` completion, copy ``misc/completion/ocrmypdf.fish`` to
To manually install the ``fish`` completion, copy
``misc/completion/ocrmypdf.fish`` to
``~/.config/fish/completions/ocrmypdf.fish``.

View File

@@ -1,119 +1,233 @@
============
Introduction
============
OCRmyPDF is a Python 3 package that adds OCR layers to PDFs.
OCRmyPDF is a Python 3 application and library that adds OCR layers to PDFs.
About OCR
---------
=========
`Optical character recognition <https://en.wikipedia.org/wiki/Optical_character_recognition>`_ is technology that converts images of typed or handwritten text, such as in a scanned document, to computer text that can be searched and copied.
`Optical character
recognition <https://en.wikipedia.org/wiki/Optical_character_recognition>`__
is technology that converts images of typed or handwritten text, such as
in a scanned document, to computer text that can be selected, searched and copied.
OCRmyPDF uses `Tesseract <https://github.com/tesseract-ocr/tesseract>`_, the best available open source OCR engine, to perform OCR.
OCRmyPDF uses
`Tesseract <https://github.com/tesseract-ocr/tesseract>`__, the best
available open source OCR engine, to perform OCR.
.. _raster-vector:
About PDFs
----------
==========
PDFs are page description files that attempts to preserve a layout exactly. They contain `vector graphics <http://vector-conversions.com/vectorizing/raster_vs_vector.html>`_ that can contain raster objects such as scanned images. Because PDFs can contain multiple pages (unlike many image formats) and can contain fonts and text, it is a good formats for exchanging scanned documents.
PDFs are page description files that attempt to preserve a layout
exactly. They contain `vector
graphics <http://vector-conversions.com/vectorizing/raster_vs_vector.html>`__
that can contain raster objects such as scanned images. Because PDFs can
contain multiple pages (unlike many image formats) and can contain fonts
and text, they are a good format for exchanging scanned documents.
.. image:: images/bitmap_vs_svg.svg
|image|
A PDF page might contain multiple images, even if it only appears to have one image. Some scanners or scanning software will segment pages into monochromatic text and color regions for example, to improve the compression ratio and appearance of the page.
Rasterizing a PDF is the process of generating an image suitable for display or analyzing with an OCR engine. OCR engines like Tesseract work with images, not vector objects.
A PDF page might contain multiple images, even if it only appears to
have one image. Some scanners or scanning software will segment pages
into monochromatic text and color regions for example, to improve the
compression ratio and appearance of the page.
Rasterizing a PDF is the process of generating an image suitable for
display or analyzing with an OCR engine. OCR engines like Tesseract work
with images, not vector objects.
About PDF/A
-----------
===========
`PDF/A <https://en.wikipedia.org/wiki/PDF/A>`_ is an ISO-standardized subset of the full PDF specification that is designed for archiving (the 'A' stands for Archive). PDF/A differs from PDF primarily by omitting features that would make it difficult to read the file in the future, such as embedded Javascript, video, audio and references to external fonts. All fonts and resources needed to interpret the PDF must be contained within it. Because PDF/A disables Javascript and other types of embedded content, it is probably more secure.
`PDF/A <https://en.wikipedia.org/wiki/PDF/A>`__ is an ISO-standardized
subset of the full PDF specification that is designed for archiving (the
'A' stands for Archive). PDF/A differs from PDF primarily by omitting
features that would make it difficult to read the file in the future,
such as embedded Javascript, video, audio and references to external
fonts. All fonts and resources needed to interpret the PDF must be
contained within it. Because PDF/A disables Javascript and other types
of embedded content, it is probably more secure.
There are various conformance levels and versions, such as "PDF/A-2b".
Generally speaking, the best format for scanned documents is PDF/A. Some governments and jurisdictions, US Courts in particular, `mandate the use of PDF/A <https://pdfblog.com/2012/02/13/what-is-pdfa/>`_ for scanned documents.
Generally speaking, the best format for scanned documents is PDF/A. Some
governments and jurisdictions, US Courts in particular, `mandate the use
of PDF/A <https://pdfblog.com/2012/02/13/what-is-pdfa/>`__ for scanned
documents.
Since most people who scan documents are interested in reading them indefinitely into the future, OCRmyPDF generates PDF/A-2b by default.
PDF/A has a few drawbacks. Some PDF viewers include an alert that the file is a PDF/A, which may confuse some users. It also tends to produce larger files than PDF, because it embeds certain resources even if they are commonly available. PDF/A files can be digitally signed, but may not be encrypted, to ensure they can be read in the future. Fortunately, converting from PDF/A to a regular PDF is trivial, and any PDF viewer can view PDF/A.
Since most people who scan documents are interested in reading them
indefinitely into the future, OCRmyPDF generates PDF/A-2b by default.
PDF/A has a few drawbacks. Some PDF viewers include an alert that the
file is a PDF/A, which may confuse some users. It also tends to produce
larger files than PDF, because it embeds certain resources even if they
are commonly available. PDF/A files can be digitally signed, but may not
be encrypted, to ensure they can be read in the future. Fortunately,
converting from PDF/A to a regular PDF is trivial, and any PDF viewer
can view PDF/A.
What OCRmyPDF does
------------------
==================
OCRmyPDF analyzes each page of a PDF to determine the colorspace and resolution (DPI) needed to capture all of the information on that page without losing content. It uses `Ghostscript <http://ghostscript.com/>`_ to rasterize the page, and then performs on OCR on the rasterized image to create an OCR "layer". The layer is then grafted back onto the original PDF.
OCRmyPDF analyzes each page of a PDF to determine the colorspace and
resolution (DPI) needed to capture all of the information on that page
without losing content. It uses
`Ghostscript <http://ghostscript.com/>`__ to rasterize the page, and
then performs OCR on the rasterized image to create an OCR "layer".
The layer is then grafted back onto the original PDF.
While one can use a program like Ghostscript or ImageMagick to get an image and put the image through Tesseract, that actually creates a new PDF and many details may be lost. OCRmyPDF can produce a minimally changed PDF as output.
While one can use a program like Ghostscript or ImageMagick to get an
image and put the image through Tesseract, that actually creates a new
PDF and many details may be lost. OCRmyPDF can produce a minimally
changed PDF as output.
OCRmyPDF also some image processing options like deskew which improve the appearance of files and quality of OCR. When these are used, the OCR layer is grafted onto the processed image instead.
By default, OCRmyPDF produces archival PDFs PDF/A, which are a stricter subset of PDF features designed for long term archives. If regular PDFs are desired, this can be disabled with ``--output-type pdf``.
OCRmyPDF also has some image processing options like deskew which improve
the appearance of files and quality of OCR. When these are used, the OCR
layer is grafted onto the processed image instead.
By default, OCRmyPDF produces archival PDFs (PDF/A), which are a
stricter subset of PDF features designed for long term archives. If
regular PDFs are desired, this can be disabled with
``--output-type pdf``.
Why you shouldn't do this manually
----------------------------------
==================================
A PDF is similar to an HTML file, in that it contains document structure along with images. Sometimes a PDF does nothing more than present a full page image, but often there is additional content that would be lost.
A PDF is similar to an HTML file, in that it contains document structure
along with images. Sometimes a PDF does nothing more than present a full
page image, but often there is additional content that would be lost.
A manual process could work like either of these:
1. Rasterize each page as an image, OCR the images, and combine the output into a PDF. This preserves the layout of each page, but resamples all images (possibly losing quality, increasing file size, introducing compression artifacts, etc.).
1. Rasterize each page as an image, OCR the images, and combine the
output into a PDF. This preserves the layout of each page, but
resamples all images (possibly losing quality, increasing file size,
introducing compression artifacts, etc.).
2. Extract each image, OCR, and combine the output into a PDF. This
loses the context in which images are used in the PDF, meaning that
cropping, rotation and scaling of pages may be lost. Some scanned
PDFs use multiple images segmented into black and white, grayscale
and color regions, with stencil masks to prevent overlap, as this can
enhance the appearance of a file while reducing file size. Clearly,
reassembling these images will not be easy. This also loses any text or
vector art on any pages in a PDF with both scanned and pure digital
content.
2. Extract each image, OCR, and combine the output into a PDF. This loses the context in which images are used in the PDF, meaning that cropping, rotation and scaling of pages may be lost. Some scanned PDFs use multiple images segmented into black and white, grayscale and color regions, with stencil masks to prevent overlap, as this can enhance the appearance of a file while reducing file size. Clearly, reassembling these images will be easy. This also loses and text or vector art on any pages in a PDF with both scanned and pure digital content.
In the case of a PDF that is nothing other than a container of images
(no rotation, scaling, cropping, one image per page), the second
approach can be lossless.
In the case of a PDF that is nothing other than a container of images (no rotation, scaling, cropping, one image per page), the second approach can be lossless.
OCRmyPDF uses several strategies depending on input options and the input PDF itself, but generally speaking it rasterizes a page for OCR and then grafts the OCR back onto the original. As such it can handle complex PDFs and still preserve their contents as much as possible.
OCRmyPDF also supports a many, many edge cases that have cropped over several years of development. We support PDF features like images inside of Form XObjects, and pages with UserUnit scaling. We support rare image formats like non-monochrome 1-bit images. We warn about files you may not to OCR. Thanks to pikepdf and QPDF, we auto-repair PDFs that are damaged. (Not that you need to know what any of these are! You should be able to throw any PDF at it.)
OCRmyPDF uses several strategies depending on input options and the
input PDF itself, but generally speaking it rasterizes a page for OCR
and then grafts the OCR back onto the original. As such it can handle
complex PDFs and still preserve their contents as much as possible.
OCRmyPDF also supports many, many edge cases that have cropped up over
several years of development. We support PDF features like images inside
of Form XObjects, and pages with UserUnit scaling. We support rare image
formats like non-monochrome 1-bit images. We warn about files you may
not want to OCR. Thanks to pikepdf and QPDF, we auto-repair PDFs that are
damaged. (Not that you need to know what any of these are! You should be
able to throw any PDF at it.)
Limitations
-----------
===========
OCRmyPDF is limited by the Tesseract OCR engine. As such it experiences these limitations, as do any other programs that rely on Tesseract:
OCRmyPDF is limited by the Tesseract OCR engine. As such it experiences
these limitations, as do any other programs that rely on Tesseract:
* The OCR is not as accurate as commercial solutions such as Abbyy.
* It is not capable of recognizing handwriting.
* It may find gibberish and report this as OCR output.
* If a document contains languages outside of those given in the ``-l LANG`` arguments, results may be poor.
* It is not always good at analyzing the natural reading order of documents. For example, it may fail to recognize that a document contains two columns, and may try to join text across columns.
* Poor quality scans may produce poor quality OCR. Garbage in, garbage out.
* It does not expose information about what font family text belongs to.
- The OCR is not as accurate as commercial solutions such as Abbyy.
- It is not capable of recognizing handwriting.
- It may find gibberish and report this as OCR output.
- If a document contains languages outside of those given in the
``-l LANG`` arguments, results may be poor.
- It is not always good at analyzing the natural reading order of
documents. For example, it may fail to recognize that a document
contains two columns, and may try to join text across columns.
- Poor quality scans may produce poor quality OCR. Garbage in, garbage
out.
- It does not expose information about what font family text belongs
to.
OCRmyPDF is also limited by the PDF specification:
* PDF encodes the position of text glyphs but does not encode document structure. There is no markup that divides a document in sections, paragraphs, sentences, or even words (since blank spaces are not represented). As such all elements of document structure including the spaces between words must be derived heuristically. Some PDF viewers do a better job of this than others.
* Because some popular open source PDF viewers have a particularly hard time with spaces betweem words, OCRmyPDF appends a space to each text element as a workaround (when using ``--pdf-renderer hocr``). While this mixes document structure with graphical information that ideally should be left to the PDF viewer to interpret, it improves compatibility with some viewers and does not cause problems for better ones.
- PDF encodes the position of text glyphs but does not encode document
structure. There is no markup that divides a document in sections,
paragraphs, sentences, or even words (since blank spaces are not
represented). As such all elements of document structure including
the spaces between words must be derived heuristically. Some PDF
viewers do a better job of this than others.
- Because some popular open source PDF viewers have a particularly hard
time with spaces between words, OCRmyPDF appends a space to each text
element as a workaround (when using ``--pdf-renderer hocr``). While
this mixes document structure with graphical information that ideally
should be left to the PDF viewer to interpret, it improves
compatibility with some viewers and does not cause problems for
better ones.
Ghostscript also imposes some limitations:
* PDFs containing JBIG2-encoded content will be converted to CCITT Group4 encoding, which has lower compression ratios, if Ghostscript PDF/A is enabled.
* PDFs containing JPEG 2000-encoded content will be converted to JPEG encoding, which may introduce compression artifacts, if Ghostscript PDF/A is enabled.
* Ghostscript may transcode grayscale and color images, either lossy to lossless or lossless to lossy, based on an internal algorithm. This behavior can be suppressed by setting ``--pdfa-image-compression`` to ``jpeg`` or ``lossless`` to set all images to one type or the other. Ghostscript has no option to maintain the input image's format. (Ghostscript 9.25+ can copy JPEG images without transcoding them; earlier versions will transcode.)
* Ghostscript's PDF/A conversion removes any XMP metadata that is not one of the standard XMP metadata namespaces for PDFs. In particular, PRISM Metdata is removed.
- PDFs containing JBIG2-encoded content will be converted to CCITT
Group4 encoding, which has lower compression ratios, if Ghostscript
PDF/A is enabled.
- PDFs containing JPEG 2000-encoded content will be converted to JPEG
encoding, which may introduce compression artifacts, if Ghostscript
PDF/A is enabled.
- Ghostscript may transcode grayscale and color images, either lossy to
lossless or lossless to lossy, based on an internal algorithm. This
behavior can be suppressed by setting ``--pdfa-image-compression`` to
``jpeg`` or ``lossless`` to set all images to one type or the other.
Ghostscript has no option to maintain the input image's format.
(Ghostscript 9.25+ can copy JPEG images without transcoding them;
earlier versions will transcode.)
- Ghostscript's PDF/A conversion removes any XMP metadata that is not
one of the standard XMP metadata namespaces for PDFs. In particular,
PRISM Metadata is removed.
- Ghostscript's PDF/A conversion seems to remove or deactivate
hyperlinks and other active content.
You can use ``--output-type pdf`` to disable PDF/A conversion and produce
a standard, non-archival PDF.
Regarding OCRmyPDF itself:
* PDFs that use transparency are not currently represented in the test suite
* The Python API exported by ``import ocrmypdf`` is design to help scripts that use OCRmyPDF but is not currently capable of running OCRmyPDF jobs due to limitations in an underlying library.
- PDFs that use transparency are not currently represented in the test
suite
Similar programs
----------------
================
To the author's knowledge, OCRmyPDF is the most feature-rich and thoroughly tested command line OCR PDF conversion tool. If it does not meet your needs, contributions and suggestions are welcome. If not, consider one of these similar open source programs:
To the author's knowledge, OCRmyPDF is the most feature-rich and
thoroughly tested command line OCR PDF conversion tool. If it does not
meet your needs, contributions and suggestions are welcome.
Alternatively, consider one of these similar open source programs:
* pdf2pdfocr
* pdfsandwich
* pypdfocr
* pdfbeads
- pdf2pdfocr
- pdfsandwich
- pypdfocr
- pdfbeads
Web front-ends
--------------
==============
The Docker image ``ocrmypdf-alpine`` provides a web service front-end that allows files to submitted over HTTP and the results "downloaded". This is an HTTP server intended to simplify web services deployments; it is not intended to be deployed on the public internet and no real security measures to speak of.
The Docker image ``ocrmypdf`` provides a web service front-end
that allows files to be submitted over HTTP and the results "downloaded".
This is an HTTP server intended to simplify web services deployments; it
is not intended to be deployed on the public internet and has no real
security measures to speak of.
In addition, the following third-party integrations are available:
* `Nextcloud OCR <https://github.com/janis91/ocr>`_ is a free software plugin for the Nextcloud private cloud software
- `Nextcloud OCR <https://github.com/janis91/ocr>`__ is a free software
plugin for the Nextcloud private cloud software
OCRmyPDF is not designed to be secure against malware-bearing PDFs (see `Using OCRmyPDF online <ocr-service>`_). Users should ensure they comply with OCRmyPDF's licenses and the licenses of all dependencies. In particular, OCRmyPDF requires Ghostscript, which is licensed under AGPLv3.
OCRmyPDF is not designed to be secure against malware-bearing PDFs (see
`Using OCRmyPDF online <ocr-service>`__). Users should ensure they
comply with OCRmyPDF's licenses and the licenses of all dependencies. In
particular, OCRmyPDF requires Ghostscript, which is licensed under
AGPLv3.
.. |image| image:: images/bitmap_vs_svg.svg

View File

@@ -1,35 +1,55 @@
.. _jbig2:
============================
Installing the JBIG2 encoder
============================
Most Linux distributions do not include a JBIG2 encoder since JBIG2 encoding was patented for a long time. All known JBIG2 US patents have expired as of 2017, but it is possible that unknown patents exist.
Most Linux distributions do not include a JBIG2 encoder since JBIG2
encoding was patented for a long time. All known JBIG2 US patents have
expired as of 2017, but it is possible that unknown patents exist.
JBIG2 encoding is recommended for OCRmyPDF and is used to losslessly create smaller PDFs. If JBIG2 encoding not available, lower quality encodings will be used.
JBIG2 encoding is recommended for OCRmyPDF and is used to losslessly
create smaller PDFs. If JBIG2 encoding is not available, lower quality
encodings will be used.
JBIG2 decoding is not patented and is performed automatically by most PDF viewers. It is widely supported has been part of the PDF specification since 2001.
JBIG2 decoding is not patented and is performed automatically by most
PDF viewers. It is widely supported and has been part of the PDF
specification since 2001.
On macOS, Homebrew packages jbig2enc and OCRmyPDF includes it by default. The Docker image for OCRmyPDF also builds its own JBIG2 encoder from source.
On macOS, Homebrew packages jbig2enc and OCRmyPDF includes it by
default. The Docker image for OCRmyPDF also builds its own JBIG2 encoder
from source.
For all other Linux, you must build a JBIG2 encoder from source:
.. code-block:: bash
git clone https://github.com/agl/jbig2enc
cd jbig2enc
./autogen.sh
./configure && make
[sudo] make install
git clone https://github.com/agl/jbig2enc
cd jbig2enc
./autogen.sh
./configure && make
[sudo] make install
.. _jbig2-lossy:
Lossy mode JBIG2
----------------
================
OCRmyPDF provides lossy mode JBIG2 as an advanced feature. Users should `review the technical concerns with JBIG2 in lossy mode <https://abbyy.technology/en:kb:tip:jbig2_compression_and_ocr>`_ and decide if this feature is acceptable for their use case.
OCRmyPDF provides lossy mode JBIG2 as an advanced feature. Users should
`review the technical concerns with JBIG2 in lossy
mode <https://abbyy.technology/en:kb:tip:jbig2_compression_and_ocr>`__
and decide if this feature is acceptable for their use case.
JBIG2 lossy mode does achieve higher compression ratios than any other monochrome (bitonal) compression technology; for large text documents the savings are considerable. JBIG2 lossless still gives great compression ratios and is a major improvement over the older CCITT G4 standard. As explained above, there is some risk of substitution errors.
JBIG2 lossy mode does achieve higher compression ratios than any other
monochrome (bitonal) compression technology; for large text documents
the savings are considerable. JBIG2 lossless still gives great
compression ratios and is a major improvement over the older CCITT G4
standard. As explained above, there is some risk of substitution errors.
To turn on JBIG2 lossy mode, add the argument ``--jbig2-lossy``. ``--optimize {1,2,3}`` are necessary for the argument to take effect also required. Also, a JBIG2 encoder must be installed as described in the previous section.
To turn on JBIG2 lossy mode, add the argument ``--jbig2-lossy``.
``--optimize {1,2,3}`` is also required for the argument to take
effect. Also, a JBIG2 encoder must be installed as described in
the previous section.
*Due to an oversight, ocrmypdf v7.0 and v7.1 used lossy mode by default.*
*Due to an oversight, ocrmypdf v7.0 and v7.1 used lossy mode by
default.*

View File

@@ -1,16 +1,29 @@
.. _lang-packs:
====================================
Installing additional language packs
====================================
OCRmyPDF uses Tesseract for OCR, and relies on its language packs for languages other than English.
OCRmyPDF uses Tesseract for OCR, and relies on its language packs for all languages.
On most platforms, English is installed with Tesseract by default, but not always.
Tesseract supports `most languages <https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages>`_.
Tesseract supports `most
languages <https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages>`__.
Languages are identified by standardized three-letter codes (called ISO 639-2 Alpha-3).
Tesseract's documentation also lists the three-letter code for your language.
Some are anglicized, e.g. Spanish is ``spa`` rather than ``esp``, while others
are not, e.g. German is ``deu`` and French is ``fra``.
For Linux users, you can often find packages that provide language packs:
After you have installed a language pack, you can use it with ``ocrmypdf -l <language>``,
for example ``ocrmypdf -l spa``. For multilingual documents, you can specify
all languages to be expected, e.g. ``ocrmypdf -l eng+fra`` for English and French.
English is assumed by default unless other language(s) are specified.
For Linux users, you can often find packages that provide language
packs:
Debian and Ubuntu users
-----------------------
=======================
.. code-block:: bash
@@ -20,11 +33,13 @@ Debian and Ubuntu users
# Install Chinese Simplified language pack
apt-get install tesseract-ocr-chi-sim
You can then pass the ``-l LANG`` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple
languages can be requested using either ``-l eng+fre`` (English and French) or ``-l eng -l fre``.
You can then pass the ``-l LANG`` argument to OCRmyPDF to give a hint as
to what languages it should search for. Multiple languages can be
requested using either ``-l eng+fra`` (English and French) or
``-l eng -l fra``.
Fedora users
------------
============
.. code-block:: bash
@@ -34,16 +49,28 @@ Fedora users
# Install Chinese Simplified language pack
dnf install tesseract-langpack-chi_sim
You can then pass the ``-l LANG`` argument to OCRmyPDF to give a hint as to
what languages it should search for. Multiple languages can be requested using
either ``-l eng+fre`` (English and French) or ``-l eng -l fre``.
You can then pass the ``-l LANG`` argument to OCRmyPDF to give a hint as
to what languages it should search for. Multiple languages can be
requested using either ``-l eng+fra`` (English and French) or
``-l eng -l fra``.
macOS users
-----------
===========
You can install additional language packs by :ref:`installing Tesseract using Homebrew with all language packs <macos-all-languages>`.
You can install additional language packs by
:ref:`installing Tesseract using Homebrew with all language packs <macos-all-languages>`.
Docker users
------------
============
Users of the OCRmyPDF Docker image should install language packs into a derived Docker image as :ref:`described in that section <docker-lang-packs>`.
Users of the OCRmyPDF Docker image should install language packs into a
derived Docker image as
:ref:`described in that section <docker-lang-packs>`.
Windows users
=============
The Tesseract installer provided by Chocolatey currently includes only the English language.
To install other languages, download the respective language pack (``.traineddata`` file)
from https://github.com/tesseract-ocr/tessdata/ and place it in
``C:\\Program Files\\Tesseract-OCR\\tessdata`` (or wherever Tesseract OCR is installed).

75
docs/optimizer.rst Normal file
View File

@@ -0,0 +1,75 @@
================
PDF optimization
================
OCRmyPDF includes an image-oriented PDF optimizer. By default, the optimizer
runs with safe settings with the goal of improving compression at no loss of
quality. At higher optimization levels, lossy optimizations may be applied and
tuned. Optimization occurs after OCR, and only if OCR succeeded. It does not
perform other possible optimizations such as deduplicating resources,
consolidating fonts, simplifying vector drawings, or anything of that nature.
Optimization ranges from ``-O0`` through ``-O3``, where ``0`` disables
optimization and ``3`` implements all options. ``1``, the default, performs only
safe and lossless optimizations. (This is similar to GCC's optimization
parameter.) The exact type of optimizations performed will vary over time.
PDF optimization requires third-party, optional tools for certain optimizations.
If these are not installed or cannot be found by OCRmyPDF, optimization will not
be as good.
Optimizations that always occur
===============================
OCRmyPDF will automatically replace obsolete or inferior compression schemes
such as RLE or LZW with superior schemes such as Deflate, and convert
monochrome images to CCITT G4. Since this is harmless it always occurs and there
is no way to disable it. Other non-image compressed objects are compressed as
well.
Fast web view
=============
OCRmyPDF automatically optimizes PDFs for "fast web view" in Adobe Acrobat's
parlance, or equivalently, linearizes PDFs so that the resources they reference
are presented in the order a viewer needs them for sequential display. This
reduces the latency of viewing a PDF both online and from local storage. This
actually slightly increases the file size.
To disable this optimization and all others, use ``ocrmypdf --optimize 0 ...``
or the shorthand ``-O0``.
Lossless optimizations
======================
At optimization level ``-O1`` (the default), OCRmyPDF will also attempt lossless
image optimization.
If a JBIG2 encoder is available, then monochrome images will be converted to
JBIG2, with the potential for huge savings on large black and white images,
since JBIG2 is far more efficient than any other monochrome (bi-level)
compression. (All known US patents related to JBIG2 have probably expired, but
it remains the responsibility of the user to supply a JBIG2 encoder such as
`jbig2enc <https://github.com/agl/jbig2enc>`__. OCRmyPDF does not implement
JBIG2 encoding on its own.)
OCRmyPDF currently does not attempt to recompress losslessly compressed objects
more aggressively.
Lossy optimizations
===================
At optimization levels ``-O2`` and ``-O3``, OCRmyPDF will attempt some lossy
image optimization.
If ``pngquant`` is installed, OCRmyPDF will use it to quantize paletted
images to reduce their size.
The quality of JPEGs may be lowered, on the assumption that a lower quality
image may be suitable for storage after OCR.
It is not possible to optimize all image types. Uncommon image types may be
skipped by the optimizer.
OCRmyPDF provides :ref:`lossy mode JBIG2 <jbig2-lossy>` as an advanced feature
that additionally requires the argument ``--jbig2-lossy``.

161
docs/pdfsecurity.rst Normal file
View File

@@ -0,0 +1,161 @@
===================
PDF security issues
===================
OCRmyPDF should only be used on PDFs you trust. It is not designed to
protect you against malware.
Recognizing that many users have an interest in handling PDFs and
applying OCR to PDFs they did not generate themselves, this article
discusses the security implications of PDFs and how users can protect
themselves.
The disclaimer applies: this software has no warranties of any kind.
PDFs may contain malware
========================
PDF is a rich, complex file format. The official PDF 1.7 specification,
ISO 32000:2008, is hundreds of pages long and references several annexes
each of which is similar in length. PDFs can contain video, audio, XML,
JavaScript and other programming, and forms. In some cases, they can
open internet connections to pre-selected URLs. All of these are possible
attack vectors.
In short, PDFs `may contain
viruses <https://security.stackexchange.com/questions/64052/can-a-pdf-file-contain-a-virus>`__.
This
`article <https://theinvisiblethings.blogspot.ca/2013/02/converting-untrusted-pdfs-into-trusted.html>`__
describes a high-paranoia method which allows potentially hostile PDFs
to be viewed and rasterized safely in a disposable virtual machine. A
trusted PDF created in this manner is converted to images, losing all
information that made it searchable and all compression. OCRmyPDF
could be used to restore searchability.
How OCRmyPDF processes PDFs
===========================
OCRmyPDF must open and interpret your PDF in order to insert an OCR
layer. First, it runs all PDFs through
`pikepdf <https://github.com/pikepdf/pikepdf>`__, a library based on
`qpdf <https://github.com/qpdf/qpdf>`__, a program that repairs PDFs
with syntax errors. This is done because, in the author's experience, a
significant number of PDFs in the wild especially those created by
scanners are not well-formed files. qpdf makes it more likely that
OCRmyPDF will succeed, but offers no security guarantees. qpdf is also
used to split the PDF into single page PDFs.
Finally, OCRmyPDF rasterizes each page of the PDF using
`Ghostscript <http://ghostscript.com/>`__ in ``-dSAFER`` mode.
Depending on the options specified, OCRmyPDF may graft the OCR layer
into the existing PDF or it may essentially reconstruct ("re-fry") a
visually identical PDF that may be quite different at the binary level.
That said, OCRmyPDF is not a tool designed for sanitizing PDFs.
.. _ocr-service:
Using OCRmyPDF online or as a service
=====================================
OCRmyPDF is not designed for use as a public web service where a
malicious user could upload a chosen PDF. In particular, it is not
necessarily secure against PDF malware or PDFs that cause denial of
service. OCRmyPDF relies on Ghostscript, and therefore, if deployed
online one should be prepared to comply with Ghostscript's Affero GPL
license, and any other licenses.
Setting aside these concerns, a side effect of OCRmyPDF is it may
incidentally sanitize PDFs that contain certain types of malware. It
repairs the PDF with pikepdf/libqpdf, which could correct malformed PDF
structures that are part of an attack. When PDF/A output is selected
(the default), the input PDF is partially reconstructed by Ghostscript.
When ``--force-ocr`` is used, all pages are rasterized and reconverted
to PDF, which could remove malware in embedded images.
OCRmyPDF should be relatively safe to use in a trusted intranet, with
some considerations:
Limiting CPU usage
------------------
OCRmyPDF will attempt to use all available CPUs and storage, so
executing ``nice ocrmypdf`` or limiting the number of jobs with the
``-j`` argument may ensure the server remains available. Another option
would be to run OCRmyPDF jobs inside a Docker container, a virtual machine,
or a cloud instance, which can impose its own limits on CPU usage and be
terminated "from orbit" if it fails to complete.
Temporary storage requirements
------------------------------
OCRmyPDF will use a large amount of temporary storage for its work,
proportional to the total number of pixels needed to rasterize the PDF.
The raster image of an 8.5×11" color page at 300 DPI takes 25 MB
uncompressed; OCRmyPDF saves its intermediates as PNG, but that still
means it requires about 9 MB per intermediate based on average
compression ratios. Multiple intermediates per page are also required,
depending on the command line given. A rule of thumb would be to allow
100 MB of temporary storage per page in a file, meaning that small
cloud servers or small VM partitions should be provisioned with plenty
of extra space if, say, a 500 page file might be sent.
To check temporary storage usage on actual files, run
``ocrmypdf -k ...`` which will preserve and print the path to temporary
storage when the job is done.
To change where temporary files are stored, change the ``TMPDIR``
environment variable for ocrmypdf's environment. (Python's
``tempfile.gettempdir()`` returns the root directory in which temporary
files will be stored.) For example, one could redirect ``TMPDIR`` to a
large RAM disk to avoid wear on HDD/SSD and potentially improve
performance. On Amazon Web Services, ``TMPDIR`` can be set to `ephemeral
storage <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html>`__.
Timeouts
--------
To prevent excessively long OCR jobs consider setting
``--tesseract-timeout`` and/or ``--skip-big`` arguments. ``--skip-big``
is particularly helpful if your PDFs include documents such as reports
on standard page sizes with large images attached - often large images
are not worth OCR'ing anyway.
Commercial alternatives
-----------------------
The author also provides professional services that include OCR and
building databases around PDFs, and is happy to provide consultation.
Abbyy Cloud OCR is a viable commercial alternative with a web services
API.
Password protection, digital signatures and certification
=========================================================
Password protected PDFs usually have two passwords, an owner and a user
password. When the user password is set to empty, PDF readers will open
the file automatically and mark it as "(SECURED)". While not as
reliable as a digital signature, this indicates that whoever set the
password approved of the file at that time. When the user password is
set, the document cannot be viewed without the password.
Either way, OCRmyPDF does not remove passwords from PDFs and exits with
an error on encountering them.
``qpdf`` can remove passwords. If the owner and user password are set, a
password is required for ``qpdf``. If only the owner password is set, then the
password can be stripped, even if one does not have the owner password.
After OCR is applied, password protection is not permitted on PDF/A
documents but the file can be converted to regular PDF.
Many programs exist which are capable of inserting an image of someone's
signature. On its own, this offers no security guarantees. It is trivial
to remove the signature image and apply it to other files. This practice
offers no real security.
Important documents can be digitally signed and certified to attest to
their authorship. OCRmyPDF cannot do this. Open source tools such as
pdfbox (Java) have this capability as does Adobe Acrobat.

22
docs/performance.rst Normal file
View File

@@ -0,0 +1,22 @@
===========
Performance
===========
Some users have noticed that current versions of OCRmyPDF do not run as quickly
as some older versions (specifically 6.x and older). This is because OCRmyPDF
added image optimization as a postprocessing step, and it is enabled by default.
Speed
=====
If running OCRmyPDF quickly is your main goal, you can use settings such as:
* ``--optimize 0`` to disable file size optimization
* ``--output-type pdf`` to disable PDF/A generation
* ``--fast-web-view 0`` to disable fast web view optimization
* ``--skip-big`` to skip large images, if some pages have large images
You can also avoid:
* ``--force-ocr``
* Image preprocessing

200
docs/plugins.rst Normal file
View File

@@ -0,0 +1,200 @@
=======
Plugins
=======
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL
NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and
"OPTIONAL" in this document are to be interpreted as described in
RFC 2119.
You can use plugins to customize the behavior of OCRmyPDF at certain points of
interest.
Currently, it is possible to:
- add new command line arguments
- override the decision for whether or not to perform OCR on a particular file
- modify the image that is about to be sent for OCR
- modify the page image before it is converted to PDF
- replace the Tesseract OCR with another OCR engine that has similar behavior
- replace Ghostscript with another PDF to image converter (rasterizer) or
PDF/A generator
OCRmyPDF plugins are based on the Python ``pluggy`` package and conform to its
conventions. Note that plugins installed as setuptools entrypoints are
not checked currently, because OCRmyPDF assumes you may not want to enable
plugins for all files.
Script plugins
==============
Script plugins may be called from the command line, by specifying the name of a file.
Script plugins may be convenient for informal or "one-off" plugins, when a certain
batch of files needs a special processing step for example.
.. code-block:: bash
ocrmypdf --plugin ocrmypdf_example_plugin.py input.pdf output.pdf
Multiple plugins may be installed by issuing the ``--plugin`` argument multiple times.
Packaged plugins
================
Packaged plugins may be installed into the same virtual environment as OCRmyPDF
is installed into. They may be invoked using Python standard module naming.
If you are intending to distribute a plugin, please package it.
.. code-block:: bash
ocrmypdf --plugin ocrmypdf_fancypants.pockets.contents input.pdf output.pdf
OCRmyPDF does not automatically import plugins, because the assumption is that
plugins affect different files differently and you may not want them activated
all the time. The command line or ``ocrmypdf.ocr(plugin='...')`` must call
for them.
Third parties that wish to distribute packages for ocrmypdf should package them
as packaged plugins, and these modules should begin with the name ``ocrmypdf_``
similar to ``pytest`` packages such as ``pytest-cov`` (the package) and
``pytest_cov`` (the module).
.. note::
We strongly recommend plugin authors name their plugins with the prefix
``ocrmypdf-`` (for the package name on PyPI) and ``ocrmypdf_`` (for the
module), just like pytest plugins.
Setuptools plugins
==================
You can also create a plugin that OCRmyPDF will always automatically load if both are
installed in the same virtual environment, using a setuptools entrypoint.
Your package's ``setup.py`` would need to contain the following, for a plugin
named ``ocrmypdf-exampleplugin``:
.. code-block:: python
# sample ./setup.py file
from setuptools import setup
setup(
name="ocrmypdf-exampleplugin",
packages=["exampleplugin"],
# the following makes a plugin available to ocrmypdf
entry_points={"ocrmypdf": ["exampleplugin = exampleplugin.pluginmodule"]},
)
Plugin requirements
===================
OCRmyPDF generally uses multiple worker processes. When a new worker is started,
Python will import all plugins again, including all plugins that were imported earlier.
This means that the global state of a plugin in one worker will not be shared with
other workers. As such, plugin hook implementations should be stateless, relying
only on their inputs. Hook implementations may use their input parameters
to obtain a reference to shared state prepared by another hook implementation.
Plugins must expect that other instances of the plugin will be running
simultaneously.
The ``context`` object that is passed to many hooks can be used to share information
about a file being worked on. Plugins must write private, plugin-specific data to
a subfolder named ``{options.work_folder}/ocrmypdf-plugin-name``. Plugins MAY
read and write files in ``options.work_folder``, but should be aware that their
semantics are subject to change.
OCRmyPDF will delete ``options.work_folder`` when it has finished OCRing
a file, unless invoked with ``--keep-temporary-files``.
The documentation for some plugin hooks contains a detailed description of the
execution context in which they will be called.
Plugins should be prepared to work whether executed in worker threads or worker
processes. Generally, OCRmyPDF uses processes, but has a semi-hidden threaded
argument that simplifies debugging.
Plugin hooks
============
A plugin may provide the following hooks. Hooks must be decorated with
``ocrmypdf.hookimpl``, for example:
.. code-block:: python
from ocrmypdf import hookimpl
@hookimpl
def add_options(parser):
pass
The following is a complete list of hooks that are available, and when
they are called.
.. _firstresult:
**Note on firstresult hooks**
If multiple plugins install implementations for this hook, they will be called in
the reverse of the order in which they are installed (i.e., last plugin wins).
When each hook implementation is called in order, the first implementation that
returns a value other than ``None`` will "win" and prevent execution of all other
hooks. As such, you cannot "chain" a series of plugin filters together in this
way. Instead, a single hook implementation should be responsible for any such
chaining operations.
Custom command line arguments
-----------------------------
.. autofunction:: ocrmypdf.pluginspec.add_options
.. autofunction:: ocrmypdf.pluginspec.check_options
Execution and progress reporting
--------------------------------
.. autoclass:: ocrmypdf.pluginspec.Executor
:members:
.. autofunction:: ocrmypdf.pluginspec.get_logging_console
.. autofunction:: ocrmypdf.pluginspec.get_executor
.. autofunction:: ocrmypdf.pluginspec.get_progressbar_class
Applying special behavior before processing
-------------------------------------------
.. autofunction:: ocrmypdf.pluginspec.validate
PDF page to image
-----------------
.. autofunction:: ocrmypdf.pluginspec.rasterize_pdf_page
Modifying intermediate images
-----------------------------
.. autofunction:: ocrmypdf.pluginspec.filter_ocr_image
.. autofunction:: ocrmypdf.pluginspec.filter_page_image
.. autofunction:: ocrmypdf.pluginspec.filter_pdf_page
OCR engine
----------
.. autofunction:: ocrmypdf.pluginspec.get_ocr_engine
.. autoclass:: ocrmypdf.pluginspec.OcrEngine
:members:
.. automethod:: __str__
.. autoclass:: ocrmypdf.pluginspec.OrientationConfidence
PDF/A production
----------------
.. autofunction:: ocrmypdf.pluginspec.generate_pdfa

View File

File diff suppressed because it is too large Load Diff

View File

@@ -1,79 +0,0 @@
PDF security issues
===================
OCRmyPDF should only be used on PDFs you trust. It is not designed to protect you against malware.
Recognizing that many users have an interest in handling PDFs and applying OCR to PDFs they did not generate themselves, this article discusses the security implications of PDFs and how users can protect themselves.
The disclaimer applies: this software has no warranties of any kind.
PDFs may contain malware
------------------------
PDF is a rich, complex file format. The official PDF 1.7 specification, ISO 32000:2008, is hundreds of pages long and references several annexes each of which are similar in length. PDFs can contain video, audio, XML, JavaScript and other programming, and forms. In some cases, they can open internet connections to pre-selected URLs. All of these possible attack vectors.
In short, PDFs `may contain viruses <https://security.stackexchange.com/questions/64052/can-a-pdf-file-contain-a-virus>`_.
This `article <https://theinvisiblethings.blogspot.ca/2013/02/converting-untrusted-pdfs-into-trusted.html>`_ describes a high-paranoia method which allows potentially hostile PDFs to be viewed and rasterized safely in a disposable virtual machine. A trusted PDF created in this manner is converted to images and loses all information making it searchable and losing all compression. OCRmyPDF could be used restore searchability.
How OCRmyPDF processes PDFs
---------------------------
OCRmyPDF must open and interpret your PDF in order to insert an OCR layer. First, it runs all PDFs through `pikepdf <https://github.com/pikepdf/pikepdf>`_, a library based on `qpdf <https://github.com/qpdf/qpdf>`_, a program that repairs PDFs with syntax errors. This is done because, in the author's experience, a significant number of PDFs in the wild especially those created by scanners are not well-formed files. qpdf makes it more likely that OCRmyPDF will succeed, but offers no security guarantees. qpdf is also used to split the PDF into single page PDFs.
Finally, OCRmyPDF rasterizes each page of the PDF using `Ghostscript <http://ghostscript.com/>`_ in ``-dSAFER`` mode.
Depending on the options specified, OCRmyPDF may graft the OCR layer into the existing PDF or it may essentially reconstruct ("re-fry") a visually identical PDF that may be quite different at the binary level. That said, OCRmyPDF is not a tool designed for sanitizing PDFs.
.. _ocr-service:
Using OCRmyPDF online or as a service
-------------------------------------
OCRmyPDF is not designed for use as a public web service where a malicious user could upload a chosen PDF. In particular, it is not necessarily secure against PDF malware or PDFs that cause denial of service. OCRmyPDF relies on Ghostscript, and therefore, if deployed online one should be prepared to comply with Ghostscript's Affero GPL license, OCRmyPDF's GPL license, and any other licenses.
Setting aside these concerns, a side effect of OCRmyPDF is it may incidentally sanitize PDFs that contain certain types of malware. It runs ``qpdf`` to repair the PDF, which could correct malformed PDF structures that are part of an attack. When PDF/A output is selected (the default), the input PDF is partially reconstructed by Ghostscript. When ``--force-ocr`` is used, all pages are rasterized and reconverted to PDF, which could remove malware in embedded images.
OCRmyPDF should be relatively safe to use in a trusted intranet, with some considerations:
Limiting CPU usage
^^^^^^^^^^^^^^^^^^
OCRmyPDF will attempt to use all available CPUs and storage, so executing ``nice ocrmypdf`` or limiting the number of jobs with the ``-j`` argument may ensure the server remains available. Another option would be run OCRmyPDF jobs inside a Docker container, a virtual machine, or a cloud instance, which can impose its own limits on CPU usage and be terminated "from orbit" if it fails to complete.
Temporary storage requirements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
OCRmyPDF will use a large amount of temporary storage for its work, proportional to the total number of pixels needed to rasterize the PDF. The raster image of a 8.5×11" color page at 300 DPI takes 25 MB uncompressed; OCRmyPDF saves its intermediates as PNG, but that still means it requires about 9 MB per intermediate based on average compression ratios. Multiple intermediates per page are also required, depending on the command line given. A rule of thumb would be to allow 100 MB of temporary storage per page in a file meaning that a small cloud servers or small VM partitions should be provisioned with plenty of extra space, if say, a 500 page file might be sent.
To check temporary storage usage on actual files, run ``ocrmypdf -k ...`` which will preserve and print the path to temporary storage when the job is done.
To change where temporary files are stored, change the ``TMPDIR`` environment variable for ocrmypdf's environment. (Python's ``tempfile.gettempdir()`` returns the root directory in which temporary files will be stored.) For example, one could redirect ``TMPDIR`` to a large RAM disk to avoid wear on HDD/SSD and potentially improve performance. On Amazon Web Services, ``TMPDIR`` can be set to `empheral storage <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html>`_.
Timeouts
^^^^^^^^
To prevent excessively long OCR jobs consider setting ``--tesseract-timeout`` and/or ``--skip-big`` arguments. ``--skip-big`` is particularly helpful if your PDFs include documents such as reports on standard page sizes with large images attached - often large images are not worth OCR'ing anyway.
Commercial alternatives
^^^^^^^^^^^^^^^^^^^^^^^
The author also provides professional services that include OCR and building databases around PDFs, and is happy to provide consultation.
Abbyy Cloud OCR is a viable commercial alternative with a web services API.
Password protection, digital signatures and certification
---------------------------------------------------------
Password protected PDFs usually have two passwords, and owner and user password. When the user password is set to empty, PDF readers will open the file automatically and marked it as "(SECURED)". While not as reliable as a digital signature, this indicates that whoever set the password approved of the file at that time. When the user password is set, the document cannot be viewed without the password.
Either way, OCRmyPDF does not remove passwords from PDFs and exits with an error on encountering them.
``qpdf``, one of OCRmyPDF's dependencies, can remove passwords. If the owner and user password are set, a password is required for ``qpdf``. If only the owner password is set, then the password can be stripped, even if one does not have the owner password.
After OCR is applied, password protection is not permitted on PDF/A documents but the file can be converted to regular PDF.
Many programs exist which are capable of inserting an image of someone's signature. On its own, this offers no security guarantees. It is trivial to remove the signature image and apply it to other files. This practice offers no real security.
Important documents can be digitally signed and certified to attest to their authorship. OCRmyPDF cannot do this. Open source tools such as pdfbox (Java) have this capability as does Adobe Acrobat.

68
misc/batch.py Normal file
View File

@@ -0,0 +1,68 @@
#!/usr/bin/env python3
# Copyright 2016 findingorder: https://github.com/findingorder
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# This script must be edited to meet your needs.
import logging
import os
import sys
import ocrmypdf
# pylint: disable=logging-format-interpolation
# pylint: disable=logging-not-lazy
# Directory containing this script; used for the start banner and as the
# default location of the log file.
script_dir = os.path.dirname(os.path.realpath(__file__))
print(script_dir + '/batch.py: Start')

# argv[1] (optional): root of the directory tree to scan.
# Defaults to the current directory.
if len(sys.argv) > 1:
    start_dir = sys.argv[1]
else:
    start_dir = '.'

# argv[2] (optional): path of the log file.
# Defaults to ocr-tree.log next to this script.
if len(sys.argv) > 2:
    log_file = sys.argv[2]
else:
    log_file = script_dir + '/ocr-tree.log'

# filemode='w' truncates the log file on every run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    filename=log_file,
    filemode='w',
)

ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)

for dir_name, subdirs, file_list in os.walk(start_dir):
    logging.info(dir_name + '\n')
    # Deliberately no os.chdir(dir_name): os.walk() yields paths relative to
    # the original working directory when start_dir is relative (the default
    # '.'), so changing directory mid-walk breaks both the next chdir and the
    # walk itself. Each file is addressed by its joined path instead.
    for filename in file_list:
        file_ext = os.path.splitext(filename)[1]
        if file_ext == '.pdf':
            full_path = os.path.join(dir_name, filename)
            print(full_path)
            # Same path for input and output: the PDF is OCRed in place.
            result = ocrmypdf.ocr(full_path, full_path, deskew=True)
            if result == ocrmypdf.ExitCode.already_done_ocr:
                print("Skipped document because it already contained text")
            elif result == ocrmypdf.ExitCode.ok:
                print("OCR complete")
            logging.info(result)

View File

@@ -1,9 +1,58 @@
# ocrmypdf completion -*- shell-script -*-
# Copyright 2019 Frank Pille
# Copyright 2020 Alex Willner
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
set -o errexit
_ocrmypdf()
{
local cur prev cword words split
_init_completion -s || return
# bash-completion 1.3 (the version shipped by Homebrew on macOS) does not
# provide _init_completion, so only call it when that function is defined
# and otherwise fall back to _get_comp_words_by_ref - see #502
if declare -F _init_completion >/dev/null 2>&1; then
    _init_completion -s || return
else
    COMPREPLY=()
    _get_comp_words_by_ref cur prev words cword
fi
if [[ $cur == -* ]]; then
COMPREPLY=( $( compgen -W '--language --image-dpi --output-type
--sidecar --version --jobs --quiet --verbose --title --author
--subject --keywords --rotate-pages --remove-background --deskew
--clean --clean-final --unpaper-args --oversample --remove-vectors
--threshold --force-ocr --skip-text --redo-ocr
--skip-big --jpeg-quality --png-quality --jbig2-lossy
--max-image-mpixels --tesseract-config --tesseract-pagesegmode
--help --tesseract-oem --pdf-renderer --tesseract-timeout
--rotate-pages-threshold --pdfa-image-compression --user-words
--user-patterns --keep-temporary-files --output-type
--no-progress-bar --pages --fast-web-view' \
-- "$cur" ) )
return
else
_filedir
return
fi
case $prev in
--version|-h|--help)
@@ -49,39 +98,23 @@ _ocrmypdf()
return
;;
-v|--verbose)
COMPREPLY=( $( compgen -W '{1..9}' -- "$cur" ) ) # max level ?
COMPREPLY=( $( compgen -W '{0..2}' -- "$cur" ) ) # max level ?
return
;;
--tesseract-pagesegmode)
COMPREPLY=( $( compgen -W '{1..13}' -- "$cur" ) )
return
;;
--sidecar|--title|--author|--subject|--keywords|--unpaper-args)
--sidecar|--title|--author|--subject|--keywords|--unpaper-args|--pages|--fast-web-view)
# argument required but no completions available
return
;;
esac
$split && return
if [[ $cur == -* ]]; then
COMPREPLY=( $( compgen -W '--language --image-dpi --output-type
--sidecar --version --jobs --quiet --verbose --title --author
--subject --keywords --rotate-pages --remove-background --deskew
--clean --clean-final --unpaper-args --oversample --remove-vectors
--mask-barcodes --threshold --force-ocr --skip-text --redo-ocr
--skip-big --jpeg-quality --png-quality --jbig2-lossy
--max-image-mpixels --tesseract-config --tesseract-pagesegmode
--help --tesseract-oem --pdf-renderer --tesseract-timeout
--rotate-pages-threshold --pdfa-image-compression --user-words
--user-patterns --keep-temporary-files --flowchart --output-type' \
-- "$cur" ) )
return
else
_filedir
return
fi
} &&
complete -F _ocrmypdf ocrmypdf
set +o errexit
# ex: filetype=sh

View File

@@ -1,15 +1,34 @@
complete -c ocrmypdf -l version
complete -c ocrmypdf -l help
# Copyright 2020 James R. Barlow
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
complete -c ocrmypdf -l sidecar -r -d "write OCR to text file"
complete -c ocrmypdf -s q -l quiet
complete -c ocrmypdf -x -n '__fish_is_first_arg' -l version
complete -c ocrmypdf -x -n '__fish_is_first_arg' -s h -s "?" -l help
complete -c ocrmypdf -r -l sidecar -d "write OCR to text file"
complete -c ocrmypdf -x -s q -l quiet
complete -c ocrmypdf -s r -l rotate-pages -d "rotate pages to correct orientation"
complete -c ocrmypdf -s d -l deskew -d "fix small horizontal alignment skew"
complete -c ocrmypdf -s c -l clean -d "clean document images before OCR"
complete -c ocrmypdf -s i -l clean-final -d "clean document images and keep result"
complete -c ocrmypdf -l remove-vectors -d "don't send vector objects to OCR"
complete -c ocrmypdf -l mask-barcodes -d "mask barcodes from OCR"
complete -c ocrmypdf -l threshold -d "threshold images before OCR"
complete -c ocrmypdf -s f -l force-ocr -d "OCR documents that already have printable text"
@@ -18,8 +37,14 @@ complete -c ocrmypdf -l redo-ocr -d "redo OCR on any pages that seem to have OCR
complete -c ocrmypdf -s k -l keep-temporary-files -d "keep temporary files (debug)"
complete -c ocrmypdf -x -s l -l language -d 'language'
complete -c ocrmypdf -x -s l -l language -a '(tesseract --list-langs)'
# Print the language codes known to the installed tesseract, one per line,
# for use as completion candidates of -l/--language.
function __fish_ocrmypdf_languages
    # Use `2>/dev/null` rather than the legacy `^/dev/null`: the caret form of
    # stderr redirection is no longer recognised by current fish releases and
    # would be passed to tesseract as a literal argument.
    # Command substitution already splits the output into one element per line.
    set -l langs (tesseract --list-langs 2>/dev/null)
    # The first element is skipped, as in the original `[2..-1]` slice
    # (presumably tesseract's header line - verify against your version).
    for lang in $langs[2..-1]
        echo $lang
    end
end
complete -c ocrmypdf -x -s l -l language -a '(__fish_ocrmypdf_languages)' -d "language"
complete -c ocrmypdf -x -l image-dpi -d "assume this DPI if input image DPI is unknown"
@@ -34,10 +59,11 @@ complete -c ocrmypdf -x -l output-type -a '(__fish_ocrmypdf_output_type)' -d "se
# Print the candidate values for the PDF renderer option, one per line, as
# "value<TAB>description" pairs; each description is passed through `_`.
# NOTE(review): two "hocr" entries appear here; this looks like the old and
# new wording of the same line from the diff - confirm only one is intended.
function __fish_ocrmypdf_pdf_renderer
echo -e "auto\t"(_ "auto select PDF renderer")
echo -e "hocr\t"(_ "use hocr renderer")
echo -e "hocr\t"(_ "use hOCR renderer")
echo -e "hocrdebug\t"(_ "uses hOCR renderer in debug mode, showing recognized text")
echo -e "sandwich\t"(_ "use sandwich renderer")
end
complete -c ocrmypdf -x -l pdf-render -a '(__fish_ocrmypdf_pdf_renderer)' -d "select PDF renderer options"
complete -c ocrmypdf -x -l pdf-renderer -a '(__fish_ocrmypdf_pdf_renderer)' -d "select PDF renderer options"
function __fish_ocrmypdf_optimize
echo -e "0\t"(_ "do not optimize")
@@ -47,8 +73,23 @@ function __fish_ocrmypdf_optimize
end
complete -c ocrmypdf -x -s O -l optimize -a '(__fish_ocrmypdf_optimize)' -d "select optimization level"
# Print the candidate values for -v/--verbose (0, 1, 2), one per line, as
# "value<TAB>description" pairs; each description is passed through `_`.
function __fish_ocrmypdf_verbose
echo -e "0\t"(_ "standard output messages")
echo -e "1\t"(_ "troubleshooting output messages")
echo -e "2\t"(_ "debugging output messages")
end
complete -c ocrmypdf -x -s v -l verbose -a '(__fish_ocrmypdf_verbose)' -d "set verbosity level"
complete -c ocrmypdf -x -l no-progress-bar -d "disable the progress bar"
# Emit the candidate values for --pdfa-image-compression as
# "value<TAB>description" lines.
function __fish_ocrmypdf_pdfa_compression
    echo -e "auto\t"(_ "let Ghostscript decide how to compress images")
    echo -e "jpeg\t"(_ "convert color and grayscale images to JPEG")
    echo -e "lossless\t"(_ "convert color and grayscale images to lossless (PNG)")
end
complete -c ocrmypdf -x -l pdfa-image-compression -a '(__fish_ocrmypdf_pdfa_compression)' -d "set PDF/A image compression options"
complete -c ocrmypdf -x -s j -l jobs -d "how many worker processes to use"
complete -c ocrmypdf -x -s v -a '(seq 1 9)'
complete -c ocrmypdf -x -l title -d "set metadata"
complete -c ocrmypdf -x -l author -d "set metadata"
complete -c ocrmypdf -x -l subject -d "set metadata"
@@ -60,11 +101,39 @@ complete -c ocrmypdf -x -l jpeg-quality -d "JPEG quality [0..100]"
complete -c ocrmypdf -x -l png-quality -d "PNG quality [0..100]"
complete -c ocrmypdf -x -l jbig2-lossy -d "enable lossy JBIG2 (see docs)"
complete -c ocrmypdf -x -l max-image-mpixels -d "image decompression bomb threshold"
complete -c ocrmypdf -x -l pages -d "apply OCR to only the specified pages"
complete -c ocrmypdf -x -l tesseract-config -d "set custom tesseract config file"
complete -c ocrmypdf -x -l tesseract-pagesegmode -d "set tesseract --psm"
complete -c ocrmypdf -x -l tesseract-oem -d "set tesseract --oem"
# Emit the candidate values for --tesseract-pagesegmode (tesseract --psm)
# as "value<TAB>description" lines, one per page segmentation mode 0-13.
function __fish_ocrmypdf_tesseract_pagesegmode
    echo -e "0\t"(_ "orientation and script detection (OSD) only")
    echo -e "1\t"(_ "automatic page segmentation with OSD")
    echo -e "2\t"(_ "automatic page segmentation, but no OSD, or OCR")
    echo -e "3\t"(_ "fully automatic page segmentation, but no OSD (default)")
    echo -e "4\t"(_ "assume a single column of text of variable sizes")
    echo -e "5\t"(_ "assume a single uniform block of vertically aligned text")
    echo -e "6\t"(_ "assume a single uniform block of text")
    echo -e "7\t"(_ "treat the image as a single text line")
    echo -e "8\t"(_ "treat the image as a single word")
    echo -e "9\t"(_ "treat the image as a single word in a circle")
    echo -e "10\t"(_ "treat the image as a single character")
    echo -e "11\t"(_ "sparse text - find as much text as possible in no particular order")
    echo -e "12\t"(_ "sparse text with OSD")
    echo -e "13\t"(_ "raw line - treat the image as a single text line")
end
complete -c ocrmypdf -x -l tesseract-pagesegmode -a '(__fish_ocrmypdf_tesseract_pagesegmode)' -d "set tesseract --psm"
# Emit the candidate values for --tesseract-oem (tesseract --oem) as
# "value<TAB>description" lines, one per OCR engine mode 0-3.
function __fish_ocrmypdf_tesseract_oem
    echo -e "0\t"(_ "legacy engine only")
    echo -e "1\t"(_ "neural nets LSTM engine only")
    echo -e "2\t"(_ "legacy + LSTM engines")
    echo -e "3\t"(_ "default, based on what is available")
end
complete -c ocrmypdf -x -l tesseract-oem -a '(__fish_ocrmypdf_tesseract_oem)' -d "set tesseract --oem"
complete -c ocrmypdf -x -l tesseract-timeout -d "maximum number of seconds to wait for OCR"
complete -c ocrmypdf -x -l rotate-pages-threshold -d "page rotation confidence"
complete -c ocrmypdf -x -l pdfa-image-compression -a 'auto jpeg lossless' -d "set PDF/A image compression options"
complete -c ocrmypdf -x -a "(__fish_complete_suffix .pdf)"
complete -c ocrmypdf -r -l user-words -d "specify location of user words file"
complete -c ocrmypdf -r -l user-patterns -d "specify location of user patterns file"
# --fast-web-view takes a size threshold in MB (description typo "size if" fixed).
complete -c ocrmypdf -x -l fast-web-view -d "if file size is above this amount in MB, linearize PDF"
complete -c ocrmypdf -x -a "(__fish_complete_suffix .pdf; __fish_complete_suffix .PDF; __fish_complete_suffix .jpg; __fish_complete_suffix .png)"

View File

@@ -0,0 +1,15 @@
---
# Example Compose service that runs the watcher script (watcher.py) from
# the jbarlow83/ocrmypdf image.
version: "3.3"
services:
  ocrmypdf:
    restart: always
    container_name: ocrmypdf
    image: jbarlow83/ocrmypdf
    volumes:
      # Host directories on the left; /input and /output are the defaults
      # watcher.py uses for OCR_INPUT_DIRECTORY and OCR_OUTPUT_DIRECTORY.
      - "/media/scan:/input"
      - "/mnt/scan:/output"
    environment:
      # Read by watcher.py at start-up.
      # NOTE(review): confirm how watcher.py parses this value; a plain
      # bool() of any non-empty string, including "0", is truthy.
      - OCR_OUTPUT_DIRECTORY_YEAR_MONTH=0
    # Placeholder: replace with the numeric uid:gid that should own the output.
    user: "<SET TO YOUR USER ID>:<SET TO YOUR GROUP ID>"
    # Run the watcher instead of the image's default entrypoint.
    entrypoint: python3
    command: watcher.py

84
misc/example_plugin.py Normal file
View File

@@ -0,0 +1,84 @@
# © 2020 James R Barlow: https://github.com/jbarlow83
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
An example of an OCRmyPDF plugin.
This plugin adds two new command line arguments
--grayscale-ocr: converts the image to grayscale before performing OCR on it
(This is occasionally useful for images whose color confounds OCR. It only
affects the image shown to OCR. The image is not saved.)
--mono-page: converts all pages in the output file to black and white
To use this from the command line:
ocrmypdf --plugin path/to/example_plugin.py --mono-page input.pdf output.pdf
To use this as an API:
import ocrmypdf
ocrmypdf.ocr('input.pdf', 'output.pdf',
plugins=['path/to/example_plugin.py'], mono_page=True
)
"""
import logging
from PIL import Image
from ocrmypdf import hookimpl
log = logging.getLogger(__name__)
@hookimpl
def add_options(parser):
    """Register this plugin's two boolean command line switches on ``parser``."""
    for flag in ('--grayscale-ocr', '--mono-page'):
        parser.add_argument(flag, action='store_true')
@hookimpl
def prepare(options):
    """Hook placeholder: this example needs no preparation, so it does nothing."""
    return None
@hookimpl
def validate(pdfinfo, options):
    """Hook placeholder: this example performs no validation, so it does nothing."""
    return None
@hookimpl
def filter_ocr_image(page, image):
    """Return the image that should be handed to OCR.

    When ``--grayscale-ocr`` was given, a grayscale (mode ``'L'``) copy of
    ``image`` is returned; otherwise ``image`` is returned unchanged.
    """
    if not page.options.grayscale_ocr:
        return image
    log.info("graying")
    return image.convert('L')
@hookimpl
def filter_page_image(page, image_filename):
    """Rewrite the page image on disk and return the path of the result.

    With ``--mono-page`` the image is converted to 1-bit black and white and
    saved over ``image_filename``. Without it, the image is re-saved next to
    the original under the same name with a ``.jpg`` suffix.
    """
    if not page.options.mono_page:
        output = image_filename.with_suffix('.jpg')
        with Image.open(image_filename) as im:
            im.save(output)
        return output

    with Image.open(image_filename) as im:
        mono = im.convert('1')
        mono.save(image_filename)
    return image_filename

92
misc/synology.py Normal file
View File

@@ -0,0 +1,92 @@
#!/bin/env python3
# Copyright 2017 github.com/Enantiomerie
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# This script must be edited to meet your needs.
import logging
import os
import shutil
import subprocess
import sys
import time
# pylint: disable=logging-format-interpolation
# pylint: disable=logging-not-lazy
# Write a fresh, timestamped log file next to this script on every run.
script_dir = os.path.dirname(os.path.realpath(__file__))
timestamp = time.strftime("%Y-%m-%d-%H%M_")
log_file = script_dir + '/' + timestamp + 'ocrmypdf.log'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(message)s',
    filename=log_file,
    filemode='w',
)

# Usage: synology.py START_DIR ARCHIVE_DIR
# The archive directory is needed for every processed file, so check for it
# before any OCR work is started rather than failing half-way through.
if len(sys.argv) < 3:
    logging.error('usage: %s START_DIR ARCHIVE_DIR', sys.argv[0])
    sys.exit('usage: ' + sys.argv[0] + ' START_DIR ARCHIVE_DIR')

# Use absolute paths and never change the working directory: os.walk() must
# not be combined with os.chdir() when it was given a relative path.
start_dir = os.path.abspath(sys.argv[1])
full_path_ocr_archive = os.path.abspath(sys.argv[2])
full_path_archive = os.path.join(full_path_ocr_archive, 'no_ocr')
# shutil.move() renames the source to the destination path when the
# destination directory does not exist, so each original would overwrite the
# previous one. Make sure both archive directories exist first.
os.makedirs(full_path_archive, exist_ok=True)

for dir_name, subdirs, file_list in os.walk(start_dir):
    logging.info(dir_name)
    for filename in file_list:
        file_stem, file_ext = os.path.splitext(filename)
        if file_ext != '.pdf':
            continue
        full_path = os.path.join(dir_name, filename)
        timestamp_ocr = time.strftime("%Y-%m-%d-%H%M_OCR_")
        filename_ocr = timestamp_ocr + file_stem + '.pdf'
        full_path_ocr = os.path.join(dir_name, filename_ocr)
        # create string for pdf processing
        # the script is processed as root user via cron
        # The PDF is streamed through the container on stdin/stdout ('-' '-').
        cmd = [
            'docker',
            'run',
            '--rm',
            '-i',
            'jbarlow83/ocrmypdf',
            '--deskew',
            '-',
            '-',
        ]
        logging.info(cmd)
        with open(full_path, 'rb') as input_file, open(
            full_path_ocr, 'wb'
        ) as output_file:
            proc = subprocess.run(
                cmd,
                stdin=input_file,
                stdout=output_file,
                stderr=subprocess.PIPE,
                check=False,
                text=True,
                errors='ignore',
            )
        logging.info(proc.stderr)
        if proc.returncode != 0:
            # Make failures visible in the log; the files are still archived
            # exactly as before.
            logging.error(
                'ocrmypdf exited with code %d for %s', proc.returncode, full_path
            )
        os.chmod(full_path_ocr, 0o664)
        os.chmod(full_path, 0o664)
        shutil.move(full_path_ocr, full_path_ocr_archive)
        shutil.move(full_path, full_path_archive)
logging.info('Finished.\n')

166
misc/watcher.py Normal file
View File

@@ -0,0 +1,166 @@
# Copyright (C) 2019 Ian Alexander: https://github.com/ianalexander
# Copyright (C) 2020 James R Barlow: https://github.com/jbarlow83
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import json
import logging
import os
import sys
import time
from datetime import datetime
from pathlib import Path
import pikepdf
from watchdog.events import PatternMatchingEventHandler
from watchdog.observers import Observer
from watchdog.observers.polling import PollingObserver
import ocrmypdf
# pylint: disable=logging-format-interpolation
def getenv_bool(name, default=''):
    """Read environment variable ``name`` as a boolean flag.

    An unset or empty variable, and the usual "off" spellings (``0``,
    ``false``, ``no``, ``n``, ``off``; case-insensitive), are False. Any
    other value is True, so values that enabled a flag before still do.
    """
    value = os.getenv(name, default).strip().lower()
    return value not in ('', '0', 'false', 'no', 'n', 'off')


# Watcher configuration, read once from the environment at import time.
INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')
OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')
# Boolean flags go through getenv_bool(): bool() of a non-empty string is
# always True, which made "OCR_...=0" switch a flag *on*.
OUTPUT_DIRECTORY_YEAR_MONTH = getenv_bool('OCR_OUTPUT_DIRECTORY_YEAR_MONTH')
ON_SUCCESS_DELETE = getenv_bool('OCR_ON_SUCCESS_DELETE')
DESKEW = getenv_bool('OCR_DESKEW')
# Extra keyword arguments for ocrmypdf.ocr(), given as a JSON object.
OCR_JSON_SETTINGS = json.loads(os.getenv('OCR_JSON_SETTINGS', '{}'))
POLL_NEW_FILE_SECONDS = int(os.getenv('OCR_POLL_NEW_FILE_SECONDS', '1'))
USE_POLLING = getenv_bool('OCR_USE_POLLING')
LOGLEVEL = os.getenv('OCR_LOGLEVEL', 'INFO')
PATTERNS = ['*.pdf', '*.PDF']
log = logging.getLogger('ocrmypdf-watcher')
def get_output_dir(root, basename):
    """Return the path under ``root`` where ``basename`` should be written.

    When OUTPUT_DIRECTORY_YEAR_MONTH is enabled the file goes into a
    ``<root>/<year>/<month>`` subdirectory named after today's date, which is
    created if necessary; otherwise it goes directly into ``root``.

    Args:
        root: Base output directory.
        basename: File name of the output file.

    Returns:
        pathlib.Path of the output file.
    """
    # Always build on ``root`` so both branches honor the caller's argument
    # instead of one of them silently falling back to the global setting.
    output_directory = Path(root)
    if OUTPUT_DIRECTORY_YEAR_MONTH:
        today = datetime.today()
        output_directory = output_directory / str(today.year) / f'{today.month:02d}'
        # exist_ok makes a separate exists() check unnecessary.
        output_directory.mkdir(parents=True, exist_ok=True)
    return output_directory / basename
def wait_for_file_ready(file_path):
    """Wait until ``file_path`` can be opened as a PDF.

    The watchdog event can arrive before the file is completely written (this
    has been seen with Docker), in which case pikepdf fails to open it. Up to
    five attempts are made, sleeping POLL_NEW_FILE_SECONDS after each failure.

    Returns:
        True once the file opened successfully, False if every attempt failed.
    """
    for _attempt in range(5):
        try:
            pdf = pikepdf.open(file_path)
        except (FileNotFoundError, pikepdf.PdfError) as e:
            log.info(f"File {file_path} is not ready yet")
            log.debug("Exception was", exc_info=e)
            time.sleep(POLL_NEW_FILE_SECONDS)
            continue
        pdf.close()
        return True
    return False
def execute_ocrmypdf(file_path):
    """Run OCR on one newly detected file.

    The output location comes from get_output_dir(). Nothing is done if the
    input never becomes readable. The input is deleted afterwards only when
    OCR returned exit code 0 and ON_SUCCESS_DELETE is enabled.
    """
    file_path = Path(file_path)
    output_path = get_output_dir(OUTPUT_DIRECTORY, file_path.name)

    log.info("-" * 20)
    log.info(f'New file: {file_path}. Waiting until fully loaded...')
    ready = wait_for_file_ready(file_path)
    if not ready:
        log.info(f"Gave up waiting for {file_path} to become ready")
        return

    log.info(f'Attempting to OCRmyPDF to: {output_path}')
    # OCR_JSON_SETTINGS supplies any further keyword arguments.
    exit_code = ocrmypdf.ocr(
        input_file=file_path,
        output_file=output_path,
        deskew=DESKEW,
        **OCR_JSON_SETTINGS,
    )

    delete_input = exit_code == 0 and ON_SUCCESS_DELETE
    if not delete_input:
        log.info('OCR is done')
        return
    log.info(f'OCR is done. Deleting: {file_path}')
    file_path.unlink()
class HandleObserverEvent(PatternMatchingEventHandler):
    """Event handler that runs OCR on files matching the handler's patterns."""

    def on_any_event(self, event):
        """Process the file for 'created' events; ignore every other event type."""
        if event.event_type == 'created':
            execute_ocrmypdf(event.src_path)
def main():
    """Configure logging, report the configuration and watch INPUT_DIRECTORY.

    Exits with status 1 if OCR_JSON_SETTINGS tries to set the input or output
    file. Otherwise an observer is started and the function blocks until it is
    interrupted from the keyboard.
    """
    if LOGLEVEL != 'DEBUG':
        verbosity = ocrmypdf.Verbosity.default
    else:
        verbosity = ocrmypdf.Verbosity.debug
    ocrmypdf.configure_logging(verbosity=verbosity, manage_root_logger=True)
    log.setLevel(LOGLEVEL)

    log.info(
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {INPUT_DIRECTORY}\n"
        f"Output Directory: {OUTPUT_DIRECTORY}\n"
        f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"
    )
    log.debug(
        f"INPUT_DIRECTORY: {INPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY: {OUTPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {OUTPUT_DIRECTORY_YEAR_MONTH}\n"
        f"ON_SUCCESS_DELETE: {ON_SUCCESS_DELETE}\n"
        f"DESKEW: {DESKEW}\n"
        f"ARGS: {OCR_JSON_SETTINGS}\n"
        f"POLL_NEW_FILE_SECONDS: {POLL_NEW_FILE_SECONDS}\n"
        f"USE_POLLING: {USE_POLLING}\n"
        f"LOGLEVEL: {LOGLEVEL}"
    )

    # The watcher chooses input and output files itself.
    reserved_keys = ('input_file', 'output_file')
    if any(key in OCR_JSON_SETTINGS for key in reserved_keys):
        log.error('OCR_JSON_SETTINGS should not specify input file or output file')
        sys.exit(1)

    handler = HandleObserverEvent(patterns=PATTERNS)
    observer = PollingObserver() if USE_POLLING else Observer()
    observer.schedule(handler, INPUT_DIRECTORY, recursive=True)
    observer.start()
    try:
        # Events are handled by the observer; just keep this thread alive.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()


if __name__ == "__main__":
    main()

View File

@@ -23,21 +23,22 @@ to emphasize that SaaS deployments should make sure they comply with
Ghostscript's license as well as OCRmyPDF's.
"""
import os
import shlex
from subprocess import PIPE, run
from tempfile import TemporaryDirectory
from flask import (
Flask,
Response,
flash,
request,
redirect,
url_for,
abort,
flash,
redirect,
request,
send_from_directory,
url_for,
)
from subprocess import run, PIPE
from tempfile import TemporaryDirectory
from werkzeug.utils import secure_filename
import os
import shlex
app = Flask(__name__)
app.secret_key = "secret"

View File

@@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta"
[tool.black]
line-length = 88
py36 = true
target-version = ["py36", "py37", "py38"]
skip-string-normalization = true
include = '\.pyi?$'
exclude = '''
@@ -28,5 +28,6 @@ exclude = '''
| docs
| misc
| \.egg-info
| src/ocrmypdf/lib/_leptonica.py
)/
'''

View File

@@ -1,4 +0,0 @@
check-manifest >= 0.35
twine >= 1.8.1
coverage >= 4.5
GitPython == 2.1.3

View File

@@ -1,13 +1,12 @@
# requirements.txt can be used to replicate the developer's build environment
# setup.py lists a separate set of requirements that are looser to simplify
# installation
chardet == 3.0.4
cffi == 1.12.2
img2pdf == 0.3.3
pdfminer.six == 20181108
pikepdf == 1.3.0
Pillow >= 5.0.0, != 5.1.0 ; sys_platform == "darwin"
pycparser == 2.19
python-xmp-toolkit == 2.0.1
reportlab == 3.5.13
ruffus == 2.8.1
cffi == 1.14.5
coloredlogs == 15.0 # technically optional
img2pdf == 0.4.0
pdfminer.six == 20201018
pikepdf == 2.10.0
pluggy == 0.13.1
Pillow == 8.1.2
reportlab == 3.5.66
tqdm == 4.59.0

View File

@@ -1,8 +1,6 @@
pytest >= 4.4.1, < 5
pytest-helpers-namespace >= 2019.1.8
pytest-xdist == 1.28.0
pytest-cov >= 2.6.1
python-xmp-toolkit # requires apt-get install libexempi3
pytest >= 6.0.0
pytest-xdist >= 2.2.0
pytest-cov >= 2.11.1
python-xmp-toolkit == 2.0.1 # requires apt-get install libexempi3
# or brew install exempi
PyPDF2 >= 1.26.0
#PyMuPDF == 1.13.4 # optional

1
requirements/watcher.txt Normal file
View File

@@ -0,0 +1 @@
watchdog == 1.0.2

View File

@@ -0,0 +1 @@
Flask >= 1, < 2

View File

@@ -1,5 +1,5 @@
[bdist_wheel]
python-tag = py35
python-tag = py36
[aliases]
test=pytest
@@ -13,6 +13,10 @@ norecursedirs = lib .pc .git output cache resources
testpaths = tests
filterwarnings =
ignore:.*XMLParser.*:DeprecationWarning
markers =
slow
addopts =
-n auto
[isort]
multi_line_output=3
@@ -20,6 +24,33 @@ include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=88
known_first_party = ocrmypdf
known_third_party = PIL,_cffi_backend,cffi,flask,img2pdf,pdfminer,pikepdf,pkg_resources,pluggy,pytest,reportlab,setuptools,sphinx_rtd_theme,tqdm,watchdog,werkzeug
[metadata]
license_file = LICENSE
[coverage:paths]
source =
src/ocrmypdf
[coverage:run]
branch = true
parallel = true
concurrency = multiprocessing
[coverage:report]
# Regexes for lines to exclude from consideration
exclude_lines =
# Have to re-enable the standard pragma
pragma: no cover
# Don't complain if tests don't hit defensive assertion code:
raise AssertionError
raise NotImplementedError
# Don't complain if non-runnable code isn't run:
if 0:
if False:
if __name__ == .__main__.:
if TYPE_CHECKING:

View File

@@ -2,53 +2,21 @@
# -*- coding: utf-8 -*-
# © 2015 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from __future__ import print_function, unicode_literals
import sys
from setuptools import find_packages, setup
if sys.version_info < (3, 6):
print("Python 3.6 or newer is required", file=sys.stderr)
sys.exit(1)
from setuptools import setup, find_packages
from subprocess import STDOUT, check_output, CalledProcessError
from collections.abc import Mapping
import re
# pylint: disable=w0613
command = next((arg for arg in sys.argv[1:] if not arg.startswith('-')), '')
if command.startswith('install') or command in [
'check',
'test',
'nosetests',
'easy_install',
]:
forced = '--force' in sys.argv
if forced:
print("The argument --force is deprecated. Please discontinue use.")
if 'upload' in sys.argv[1:]:
print('Use twine to upload the package - setup.py upload is insecure')
sys.exit(1)
tests_require = open('requirements/test.txt', encoding='utf-8').read().splitlines()
@@ -64,20 +32,23 @@ setup(
long_description_content_type='text/markdown',
url='https://github.com/jbarlow83/OCRmyPDF',
author='James R. Barlow',
author_email='jim@purplerock.ca',
author_email='james@purplerock.ca',
packages=find_packages('src', exclude=["tests", "tests.*"]),
package_dir={'': 'src'},
keywords=['PDF', 'OCR', 'optical character recognition', 'PDF/A', 'scanning'],
classifiers=[
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Development Status :: 5 - Production/Stable",
"Environment :: Console",
"Intended Audience :: End Users/Desktop",
"Intended Audience :: Science/Research",
"Intended Audience :: System Administrators",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
"Operating System :: MacOS :: MacOS X",
"Operating System :: Microsoft :: Windows :: Windows 10",
"Operating System :: POSIX",
"Operating System :: POSIX :: BSD",
"Operating System :: POSIX :: Linux",
@@ -88,28 +59,26 @@ setup(
python_requires=' >= 3.6',
setup_requires=[ # can be removed whenever we can drop pip 9 support
'cffi >= 1.9.1', # to build the leptonica module
'pytest-runner', # to enable python setup.py test
'setuptools_scm', # so that version will work
'setuptools_scm_git_archive', # enable version from github tarballs
],
use_scm_version={'version_scheme': 'post-release'},
cffi_modules=['src/ocrmypdf/lib/compile_leptonica.py:ffibuilder'],
install_requires=[
'chardet >= 3.0.4, < 4', # unlisted requirement of pdfminer.six 20181108
'cffi >= 1.9.1', # must be a setup and install requirement
'img2pdf >= 0.3.0, < 0.4', # pure Python, so track HEAD closely
'pdfminer.six == 20181108 ; sys_platform != "darwin"',
'pikepdf >= 1.3.0, < 2',
'Pillow >= 4.0.0, != 5.1.0 ; sys_platform == "darwin"',
# Pillow < 4 has BytesIO/TIFF bug w/img2pdf 0.2.3
# block 5.1.0, broken wheels
'reportlab >= 3.3.0', # oldest released version with sane image handling
'ruffus >= 2.7.0',
'coloredlogs >= 14.0', # strictly optional
'img2pdf >= 0.3.0, < 0.5', # pure Python, so track HEAD closely
'pdfminer.six >= 20191110, != 20200720, <= 20201018',
"pikepdf >= 2.10.0",
'Pillow >= 8.1.2',
'pluggy >= 0.13.0, < 1.0',
'reportlab >= 3.5.66',
'setuptools',
'tqdm >= 4',
],
extras_require={'pdfminer': ['pdfminer.six == 20181108']},
tests_require=tests_require,
entry_points={'console_scripts': ['ocrmypdf = ocrmypdf.__main__:run_pipeline']},
package_data={'ocrmypdf': ['data/sRGB.icc']},
entry_points={'console_scripts': ['ocrmypdf = ocrmypdf.__main__:run']},
package_data={'ocrmypdf': ['data/sRGB.icc', 'py.typed']},
include_package_data=True,
zip_safe=False,
project_urls={

35
src/ocrmypdf/RELEASE.md Normal file
View File

@@ -0,0 +1,35 @@
# Release checklist
## Patch release
- Check `pytest`
- Update release notes
## Minor release
## Major release
- Run `pre-commit autoupdate`
- Check README.md
- Check setup.py
- Are classifiers up to date?
- Is `python_requires` correct?
- Python 3.6 reaches EOL in December 2021 (2021-12). Could drop support then.
- Can we tighten any `install_requires` dependencies?
- Search for old version shims we can remove
- "shim"
- ` pikepdf.__version__`
- Search for deprecation: search all files for deprec*, etc.
- Check requirements/*
- Delete `tests/cache`, do `pytest --runslow`, and update cache.
- Do `pytest --cov-report html`

View File

@@ -1,46 +1,32 @@
# © 2017 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import pkg_resources
PROGRAM_NAME = 'ocrmypdf'
from pluggy import HookimplMarker as _HookimplMarker
# Official PEP 396
__version__ = pkg_resources.get_distribution('ocrmypdf').version
VERSION = __version__
from .exceptions import (
ExitCode,
from ocrmypdf import helpers, hocrtransform, leptonica, pdfa, pdfinfo
from ocrmypdf._concurrent import Executor
from ocrmypdf._jobcontext import PageContext, PdfContext
from ocrmypdf._version import PROGRAM_NAME, __version__
from ocrmypdf.api import Verbosity, configure_logging, ocr
from ocrmypdf.exceptions import (
BadArgsError,
PdfMergeFailedError,
MissingDependencyError,
UnsupportedImageFormatError,
DpiError,
OutputFileAccessError,
PriorOcrFoundError,
InputFileError,
SubprocessOutputError,
EncryptedPdfError,
ExitCode,
ExitCodeException,
InputFileError,
MissingDependencyError,
OutputFileAccessError,
PdfMergeFailedError,
PriorOcrFoundError,
SubprocessOutputError,
TesseractConfigError,
UnsupportedImageFormatError,
)
from ocrmypdf.pluginspec import OcrEngine, OrientationConfidence
from . import helpers
from . import hocrtransform
from . import leptonica
from . import pdfa
from . import pdfinfo
hookimpl = _HookimplMarker('ocrmypdf')

View File

File diff suppressed because it is too large Load Diff

133
src/ocrmypdf/_concurrent.py Normal file
View File

@@ -0,0 +1,133 @@
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import threading
from abc import ABC, abstractmethod
from typing import Callable, Iterable, Optional
def _task_noop(*_args, **_kwargs):
return
class NullProgressBar:
    """Progress bar stand-in that accepts the same calls but displays nothing."""

    def __init__(self, **kwargs):
        """Accept and ignore any keyword arguments."""

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # False: never suppress an exception raised inside the with block.
        return False

    def update(self, _arg=None):
        """Ignore the progress update."""
        return None
class Executor(ABC):
    """Abstract base for objects that run a batch of tasks with progress reporting.

    Calling an instance fills in no-op defaults for omitted callbacks and then
    delegates to ``_execute`` while holding the class-level ``pool_lock``.
    """

    pool_lock = threading.Lock()
    pbar_class = NullProgressBar

    def __init__(self, *, pbar_class=None):
        """Optionally override the progress bar class for this instance."""
        if pbar_class:
            self.pbar_class = pbar_class

    def __call__(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        tqdm_kwargs: dict,
        worker_initializer: Optional[Callable] = None,
        task: Optional[Callable] = None,
        task_arguments: Optional[Iterable] = None,
        task_finished: Optional[Callable] = None,
    ) -> None:
        """
        Set up parallel execution and progress reporting.

        Args:
            use_threads: If ``False``, the workload is the sort that will
                benefit from running in a multiprocessing context (for
                example, it uses Python heavily, and parallelizing it with
                threads is not expected to be performant).
            max_workers: The maximum number of workers that should be run.
            tqdm_kwargs: Arguments to set up the progress bar.
            worker_initializer: Called when a worker is initialized, in the
                worker's execution context. If the child workers are
                processes, it must be possible to marshall/pickle the worker
                initializer. ``functools.partial`` can be used to bind
                parameters.
            task: Called when the worker starts a new task, in the worker's
                execution context. Must be possible to marshall to the worker.
            task_arguments: An iterable that generates a group of parameters
                for each task. This runs in the parent's context, but the
                parameters must be marshallable to the worker.
            task_finished: Called when a worker finishes a task, in the
                parent's context.
        """
        if not task_arguments:
            return  # Nothing to do!

        # Any callback that was not supplied falls back to the no-op task.
        with self.pool_lock:
            self._execute(
                use_threads=use_threads,
                max_workers=max_workers,
                tqdm_kwargs=tqdm_kwargs,
                worker_initializer=worker_initializer or _task_noop,
                task=task or _task_noop,
                task_arguments=task_arguments,
                task_finished=task_finished or _task_noop,
            )

    @abstractmethod
    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        tqdm_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):
        """Custom executors should override this method."""
def setup_executor(plugin_manager) -> Executor:
    """Ask the plugin hooks for a progress bar class and an executor built with it."""
    hooks = plugin_manager.hook
    progressbar_class = hooks.get_progressbar_class()
    return hooks.get_executor(progressbar_class=progressbar_class)
class SerialExecutor(Executor):
    """Implements a purely sequential executor using the parallel protocol.

    The current process/thread will be the worker that executes all tasks
    in order. As such, ``worker_initializer`` will never be called.
    """

    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        tqdm_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):  # pylint: disable=unused-argument
        with self.pbar_class(**tqdm_kwargs) as pbar:
            # map() is lazy, so each task still runs immediately before its
            # own task_finished callback.
            for result in map(task, task_arguments):
                task_finished(result, pbar)

View File

@@ -0,0 +1,8 @@
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Manage third party executables"""

View File

@@ -0,0 +1,270 @@
# © 2017 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Interface to Ghostscript executable"""
import logging
import os
import re
from io import BytesIO
from os import fspath
from pathlib import Path
from shutil import which
from subprocess import PIPE, CalledProcessError
from typing import Optional
from PIL import Image
from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError
from ocrmypdf.helpers import Resolution
from ocrmypdf.subprocess import get_version, run, run_polling_stderr
log = logging.getLogger(__name__)
# Shown when no Ghostscript executable can be located (wording fixed: the
# message previously read "ocrmypdf find can't Ghostscript").
missing_gs_error = """
---------------------------------------------------------------------
This error normally occurs when ocrmypdf can't find Ghostscript.
Please ensure Ghostscript is installed and its location is added to
the system PATH environment variable.
For details see:
https://ocrmypdf.readthedocs.io/en/latest/installation.html
---------------------------------------------------------------------
"""

# On Windows the console executable is gswin64c or gswin32c rather than gs;
# prefer the 64-bit one and fail at import time if neither is on the PATH.
_gswin = None
if os.name == 'nt':
    _gswin = which('gswin64c')
    if not _gswin:
        _gswin = which('gswin32c')
        if not _gswin:
            raise MissingDependencyError(missing_gs_error)
    # Keep only the program name without directory and extension.
    _gswin = Path(_gswin).stem

# Name of the Ghostscript program used by the rest of this module.
GS = _gswin if _gswin else 'gs'
del _gswin
def version():
    """Return the version reported for the Ghostscript program named by ``GS``."""
    gs_version = get_version(GS)
    return gs_version
def jpeg_passthrough_available() -> bool:
    """Returns True if the installed version of Ghostscript supports JPEG passthru

    Prior to 9.23, Ghostscript decoded and re-encoded JPEGs internally. In 9.23
    it gained the ability to keep JPEGs unmodified. However, the 9.23
    implementation was buggy and would delete the last two bytes of images in
    some cases, as reported here.
    https://bugs.ghostscript.com/show_bug.cgi?id=699216

    The issue was fixed for 9.24, hence that is the first version we consider
    the feature available. (Ghostscript 9.24 has its own problems and is
    blacklisted.)
    """
    # Compare the numeric components rather than the raw strings: as text,
    # "10.00.0" sorts before "9.24" and "9.05" sorts after it.
    installed = tuple(int(part) for part in re.findall(r'\d+', str(version())))
    return installed >= (9, 24)
def _gs_error_reported(stream) -> bool:
return True if re.search(r'error', stream, flags=re.IGNORECASE) else False
def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int = 1,
    page_dpi: Optional[Resolution] = None,
    rotation: Optional[int] = None,
    filter_vector: bool = False,
):
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units.

    Ghostscript writes the image to stdout; it is then optionally rotated and
    saved to ``output_file`` with ``page_dpi`` (``raster_dpi`` if not given)
    recorded as its DPI.

    Raises:
        SubprocessOutputError: if Ghostscript exits with a failure status.
    """
    raster_dpi = raster_dpi.round(6)
    page_dpi = page_dpi or raster_dpi

    args_gs = [
        GS,
        '-dQUIET',
        '-dSAFER',
        '-dBATCH',
        '-dNOPAUSE',
        f'-sDEVICE={raster_device}',
        f'-dFirstPage={pageno}',
        f'-dLastPage={pageno}',
        f'-r{raster_dpi.x:f}x{raster_dpi.y:f}',
    ]
    if filter_vector:
        args_gs.append('-dFILTERVECTOR')
    args_gs.extend(
        [
            '-o',
            '-',
            '-sstdout=%stderr',
            '-dAutoRotatePages=/None',  # Probably has no effect on raster
            '-f',
            fspath(input_file),
        ]
    )

    try:
        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        log.error(e.stderr.decode(errors='replace'))
        raise SubprocessOutputError('Ghostscript rasterizing failed')

    # Ghostscript can exit successfully and still report errors on stderr.
    stderr = p.stderr.decode(errors='replace')
    if _gs_error_reported(stderr):
        log.error(stderr)

    # rotation is a clockwise angle and Image.ROTATE_* is
    # counterclockwise so this cancels out the rotation
    transposes = {
        90: Image.ROTATE_90,
        180: Image.ROTATE_180,
        270: Image.ROTATE_270,
    }
    with Image.open(BytesIO(p.stdout)) as im:
        if rotation is not None:
            log.debug("Rotating output by %i", rotation)
            if rotation in transposes:
                im = im.transpose(transposes[rotation])
            # A quarter turn swaps the axes, so the DPI is flipped as well.
            if rotation % 180 == 90:
                page_dpi = page_dpi.flip_axis()
        im.save(fspath(output_file), dpi=page_dpi)
class GhostscriptFollower:
    """Line callback that drives a progress bar from Ghostscript's output.

    Until a progress bar exists, each line is checked for the
    "Processing pages A through B." announcement; the bar is then created with
    ``total=B``. Afterwards every line that starts with "Page <n>" advances the
    bar by one step. With no ``progressbar_class`` the callback does nothing.
    """

    re_process = re.compile(r"Processing pages \d+ through (\d+).")
    re_page = re.compile(r"Page (\d+)")

    def __init__(self, progressbar_class):
        """Remember the progress bar factory; no bar is created yet."""
        self.count = 0
        self.progressbar_class = progressbar_class
        self.progressbar = None

    def __call__(self, line):
        """Inspect one line of Ghostscript output."""
        if not self.progressbar_class:
            return

        text = line.strip()
        if self.progressbar:
            # Bar already running: only per-page lines matter.
            if self.re_page.match(text):
                self.progressbar.update()
            return

        announcement = self.re_process.match(text)
        if announcement:
            self.count = int(announcement.group(1))
            self.progressbar = self.progressbar_class(
                total=self.count, desc="PDF/A conversion", unit='page'
            )
def generate_pdfa(
    pdf_pages,
    output_file: os.PathLike,
    *,
    compression: str,
    pdf_version: str = '1.5',
    pdfa_part: str = '2',
    progressbar_class=None,
):
    """Run Ghostscript's pdfwrite device over the inputs to produce a PDF/A.

    Args:
        pdf_pages: Iterable of input paths. Each is converted with ``fspath``
            and appended to the Ghostscript command line in order.
        output_file: Destination file. It is opened for binary writing and
            connected to Ghostscript's standard output (``-o -``).
        compression: ``'jpeg'`` forces DCT encoding of color and gray images,
            ``'lossless'`` forces Flate encoding, and any other value lets
            Ghostscript pick the filter itself.
        pdf_version: Value passed as ``-dCompatibilityLevel``.
        pdfa_part: Value passed as ``-dPDFA``.
        progressbar_class: Optional progress bar factory handed to
            ``GhostscriptFollower``, which receives Ghostscript's stderr lines.

    Raises:
        SubprocessOutputError: If Ghostscript exits with a failure status.
    """
    # Ghostscript's compression is all or nothing. We can either force all images
    # to JPEG, force all to Flate/PNG, or let it decide how to encode the images.
    # In most cases it's best to let it decide.
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    strategy = 'LeaveColorUnchanged'
    # Older versions of Ghostscript expect a leading slash in
    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
    # git commit fe1c025d.
    # NOTE(review): if version() returns a plain string, these comparisons are
    # lexicographic (e.g. '10.0' < '9.19' would be true) - confirm the return
    # type of version() before relying on them for two-digit major versions.
    strategy = ('/' + strategy) if version() < '9.19' else strategy

    if version() == '9.23':
        # 9.23: added JPEG passthrough as a new feature, but with a bug that
        # incorrectly formats some images. Fixed as of 9.24. So we disable this
        # feature for 9.23.
        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
        compression_args.append('-dPassThroughJPEGImages=false')

    # nb no need to specify ProcessColorModel when ColorConversionStrategy
    # is set; see:
    # https://bugs.ghostscript.com/show_bug.cgi?id=699392
    args_gs = (
        [
            GS,
            "-dBATCH",
            "-dNOPAUSE",
            "-dSAFER",
            "-dCompatibilityLevel=" + str(pdf_version),
            "-sDEVICE=pdfwrite",
            "-dAutoRotatePages=/None",
            "-sColorConversionStrategy=" + strategy,
        ]
        + compression_args
        + [
            "-dJPEGQ=95",
            "-dPDFA=" + pdfa_part,
            "-dPDFACompatibilityPolicy=1",
            "-o",
            "-",
            "-sstdout=%stderr",
        ]
    )
    args_gs.extend(fspath(s) for s in pdf_pages)  # Stringify Path objs

    try:
        with Path(output_file).open('wb') as output:
            p = run_polling_stderr(
                args_gs,
                stdout=output,
                stderr=PIPE,
                check=True,
                text=True,
                encoding='utf-8',
                errors='replace',
                callback=GhostscriptFollower(progressbar_class),
            )
    except CalledProcessError as e:
        # Ghostscript does not change return code when it fails to create
        # PDF/A - check PDF/A status elsewhere
        log.error(e.stderr)
        raise SubprocessOutputError('Ghostscript PDF/A rendering failed') from e
    else:
        stderr = p.stderr
        # If there is an error we log the whole stderr, except for filtering
        # duplicates.
        if _gs_error_reported(stderr):
            last_part = None
            repcount = 0
            for part in stderr.split('****'):
                if part == last_part:
                    repcount += 1
                    continue
                if repcount > 1:
                    log.error(f"(previous error message repeated {repcount} times)")
                # Always start a fresh count for the new message so that a
                # single repeat of one message is not added to the next one.
                repcount = 0
                log.error(part)
                last_part = part
            # Report repeats of the final message too; the loop only reports
            # a run when a different message follows it.
            if repcount > 1:
                log.error(f"(previous error message repeated {repcount} times)")

View File

@@ -1,28 +1,18 @@
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
from functools import lru_cache
from subprocess import PIPE, run
from . import get_version
from ..exceptions import MissingDependencyError
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Interface to jbig2 executable"""
from subprocess import PIPE
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.subprocess import get_version, run
@lru_cache(maxsize=1)
def version():
    """Look up the version of the ``jbig2`` program through ``get_version``.

    The regex captures the dotted number that follows ``jbig2enc`` in the
    program's version output.
    """
    pattern = r'jbig2enc (\d+(\.\d+)*).*'
    return get_version('jbig2', regex=pattern)
@@ -51,9 +41,17 @@ def convert_group(*, cwd, infiles, out_prefix):
return proc
def convert_group_mp(args):
    """Call ``convert_group`` from a single positional sequence.

    ``args`` is indexed as ``(cwd, infiles, out_prefix)``; any further
    elements are ignored.
    """
    cwd = args[0]
    infiles = args[1]
    out_prefix = args[2]
    return convert_group(cwd=cwd, infiles=infiles, out_prefix=out_prefix)
def convert_single(*, cwd, infile, outfile):
    """Run ``jbig2 -p`` on one file and capture its standard output.

    Args:
        cwd: Working directory for the subprocess.
        infile: Input passed to ``jbig2``.
        outfile: File opened for binary writing that receives the program's
            standard output.

    Returns:
        The completed process object returned by ``run``.

    Raises:
        Whatever ``check_returncode()`` raises for a non-zero exit status.
    """
    command = ['jbig2', '-p', infile]
    with open(outfile, 'wb') as sink:
        completed = run(command, cwd=cwd, stdout=sink, stderr=PIPE)
    completed.check_returncode()
    return completed
def convert_single_mp(args):
    """Call ``convert_single`` from a single positional sequence.

    ``args`` is indexed as ``(cwd, infile, outfile)``; any further elements
    are ignored.
    """
    cwd = args[0]
    infile = args[1]
    outfile = args[2]
    return convert_single(cwd=cwd, infile=infile, outfile=outfile)

View File

@@ -0,0 +1,65 @@
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Interface to pngquant executable"""
from contextlib import contextmanager
from io import BytesIO
from pathlib import Path
from subprocess import PIPE
from PIL import Image
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.subprocess import get_version, run
def version():
    """Look up the version of the ``pngquant`` program through ``get_version``.

    The regex captures the leading dotted number of the version output.
    """
    pattern = r'(\d+(\.\d+)*).*'
    return get_version('pngquant', regex=pattern)
def available():
    """Report whether pngquant can be used.

    Returns:
        False if ``version()`` raises ``MissingDependencyError``, True if it
        completes normally.
    """
    try:
        version()
    except MissingDependencyError:
        return False
    else:
        return True
@contextmanager
def input_as_png(input_file: Path):
    """Yield a binary stream containing ``input_file`` as PNG data.

    A file whose name ends in ``.png`` is opened and yielded as-is. Any other
    file is loaded as an image and re-encoded as PNG into an in-memory buffer,
    which is yielded rewound to its start.
    """
    if input_file.name.endswith('.png'):
        with open(input_file, 'rb') as png_file:
            yield png_file
        return

    with Image.open(input_file) as image:
        buffer = BytesIO()
        image.save(buffer, format='png')
        buffer.seek(0)
        yield buffer
def quantize(input_file: Path, output_file: Path, quality_min: int, quality_max: int):
    """Quantize an image with pngquant, streaming data through stdin/stdout.

    Args:
        input_file: Image to quantize; supplied to pngquant via
            ``input_as_png``.
        output_file: Receives pngquant's standard output, but only when the
            program exits with status 0. Otherwise it is left untouched.
        quality_min: Lower bound of the ``--quality`` range.
        quality_max: Upper bound of the ``--quality`` range.
    """
    with input_as_png(input_file) as input_stream:
        quality_range = f'{quality_min}-{quality_max}'
        command = [
            'pngquant',
            '--force',
            '--skip-if-larger',
            '--quality',
            quality_range,
            '--',  # pngquant: stop processing arguments
            '-',  # pngquant: stream input and output
        ]
        result = run(
            command, stdin=input_stream, stdout=PIPE, stderr=PIPE, check=False
        )

    if result.returncode != 0:
        return
    # input_file could be the same as output_file, so the write is deferred
    # until the input stream has been closed.
    output_file.write_bytes(result.stdout)
def quantize_mp(args):
    """Call ``quantize`` with the elements of ``args`` as positional arguments."""
    positional = tuple(args)
    return quantize(*positional)

View File

@@ -0,0 +1,342 @@
# © 2017 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Interface to Tesseract executable"""
import logging
import os
import re
import shutil
from collections import namedtuple
from distutils.version import StrictVersion
from os import fspath
from pathlib import Path
from subprocess import PIPE, STDOUT, CalledProcessError, TimeoutExpired
from typing import List, Optional
from PIL import Image
from ocrmypdf.exceptions import (
MissingDependencyError,
SubprocessOutputError,
TesseractConfigError,
)
from ocrmypdf.subprocess import get_version, run
# Module-level logger; the Tesseract-specific adapter below wraps it.
log = logging.getLogger(__name__)
# Result of orientation detection: the detected angle and its confidence.
OrientationConfidence = namedtuple('OrientationConfidence', ('angle', 'confidence'))
# hOCR document for a page with no recognized text. The two positional format
# fields {0} and {1} fill the right and bottom edge of the page bounding box;
# _generate_null_hocr supplies the image width and height.
HOCR_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='tesseract 4.0.0' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "_blank.png"; bbox 0 0 {0} {1}; ppageno 0'>
</div>
</body>
</html>
"""
class TesseractLoggerAdapter(logging.LoggerAdapter):
    """Logger adapter that tags every message as coming from Tesseract."""

    def process(self, msg, kwargs):
        """Prefix ``msg`` with ``[tesseract]`` and attach this adapter's ``extra``.

        ``kwargs`` is modified in place and returned alongside the new message.
        """
        kwargs['extra'] = self.extra
        tagged = '[tesseract] %s' % (msg)
        return tagged, kwargs
class TesseractVersion(StrictVersion):
    """``StrictVersion`` variant that accepts Tesseract-style version strings.

    The pattern allows an optional hyphen before the prerelease tag, the tags
    alpha/beta/rc/dev with an optional number, and an optional
    ``-<n>-g<hex>`` suffix for untagged git builds.
    """

    version_re = re.compile(
        r'''
^(\d+) \. (\d+) (\. (\d+))? # groups: 1/major, 2/minor, 3/[skip], 4/patch
[-]? # optional hyphen separator
(?:(alpha|beta|rc|dev)[.\-\ ]?(\d+)?)? # 5/prerelease, 6/prerelease_num
(?:-(\d+)-g[0-9a-f]+)? # untagged git version
$
''',
        re.VERBOSE | re.ASCII,
    )

    def parse(self, vstring):
        """Parse ``vstring``, tolerating a prerelease tag without a number.

        When the base parser fails because the prerelease number is missing
        (it calls ``int()`` on ``None``), the string is parsed again with a
        ``0`` appended. Any other ``TypeError`` is re-raised instead of being
        silently dropped, so a failed parse can never leave behind an object
        with no version data.

        Raises:
            TypeError: If ``vstring`` cannot be parsed for a reason other than
                a missing prerelease number (for example, it is not a string).
            ValueError: Propagated from the base parser when the string does
                not match ``version_re``.
        """
        try:
            super().parse(vstring)
        except TypeError as e:
            if 'int() argument must be a string' not in str(e):
                raise
            super().parse(vstring + '0')
def version():
    """Look up the version of the ``tesseract`` program through ``get_version``.

    The regex captures everything after ``tesseract`` and one whitespace
    character in the program's version output.
    """
    pattern = r'tesseract\s(.+)'
    return get_version('tesseract', regex=pattern)
def has_user_words():
    """Report whether this Tesseract supports ``--user-words``.

    The option is missing in 4.0 and present from 4.1 on. It also exists in
    3.x, but 3.x is no longer supported.
    """
    # NOTE(review): if version() returns a plain string this comparison is
    # lexicographic rather than numeric - confirm against get_version().
    minimum = '4.1'
    return version() >= minimum
def get_languages():
    """Return the set of languages listed by ``tesseract --list-langs``.

    The first line of the output is treated as a header and dropped; every
    remaining line, stripped of whitespace, becomes one entry.

    Raises:
        MissingDependencyError: If Tesseract exits with a failure status, or
            if any output line starts with ``Error``.
    """

    def lang_error(output):
        # Build the message shown to the user, ending with Tesseract's output.
        preamble = (
            "Tesseract failed to report available languages.\n"
            "Output from Tesseract:\n"
            "-----------\n"
        )
        return preamble + output

    command = ['tesseract', '--list-langs']
    try:
        output = run(
            command,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            logs_errors_to_stdout=True,
            check=True,
        ).stdout
    except CalledProcessError as e:
        raise MissingDependencyError(lang_error(e.output)) from e

    listing = output.splitlines()
    if any(entry.startswith('Error') for entry in listing):
        raise MissingDependencyError(lang_error(output))

    _header, *names = listing
    return {name.strip() for name in names}
def tess_base_args(langs: List[str], engine_mode: Optional[int]) -> List[str]:
    """Build the leading part of a Tesseract command line.

    Args:
        langs: Language codes; when non-empty they are joined with ``+`` and
            passed after ``-l``.
        engine_mode: Passed after ``--oem`` unless it is None.

    Returns:
        A new list beginning with ``'tesseract'``.
    """
    language_part = ['-l', '+'.join(langs)] if langs else []
    engine_part = ['--oem', str(engine_mode)] if engine_mode is not None else []
    return ['tesseract'] + language_part + engine_part
def get_orientation(input_file: Path, engine_mode: Optional[int], timeout: float):
    """Ask Tesseract's OSD mode (``--psm 0``) for the page orientation.

    Args:
        input_file: Image to analyze.
        engine_mode: Passed to ``tess_base_args``.
        timeout: Passed to ``run`` as the subprocess timeout.

    Returns:
        An ``OrientationConfidence``. The result is angle 0 with confidence 0
        when Tesseract times out, or when it fails while reporting
        "Too few characters. Skipping this page" or "Image too large".

    Raises:
        SubprocessOutputError: For any other Tesseract failure.
    """
    command = tess_base_args(['osd'], engine_mode)
    command = command + ['--psm', '0', fspath(input_file), 'stdout']

    try:
        completed = run(
            command, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True
        )
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        recoverable = (
            b'Too few characters. Skipping this page' in e.output
            or b'Image too large' in e.output
        )
        if recoverable:
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e

    # Collect "key: value" lines. Only lines that split into exactly two
    # parts are kept; a line containing two or more colons is ignored.
    osd = {}
    for raw_line in completed.stdout.decode().splitlines():
        fields = raw_line.strip().split(':', maxsplit=2)
        if len(fields) == 2:
            key, value = fields
            osd[key.strip()] = value.strip()

    angle = int(osd.get('Orientation in degrees', 0))
    confidence = float(osd.get('Orientation confidence', 0))
    return OrientationConfidence(angle=angle, confidence=confidence)
def tesseract_log_output(stream):
    """Forward Tesseract's captured output to the log, one line at a time.

    Each line is classified by its content: banner and pixReadMem warnings are
    dropped, a few known messages are rewritten or ignored, and everything
    else is logged as error, warning or info depending on the words it
    contains.

    Args:
        stream: Bytes captured from Tesseract. Empty or None input is ignored.
            Decoding is retried as UTF-8 with undecodable bytes dropped if the
            default decode fails.

    Raises:
        TesseractConfigError: If a line reports ``parameter not found: ``; the
            text after ``found: `` is passed as the error argument.
    """
    if not stream:
        return

    tlog = TesseractLoggerAdapter(
        log, extra=log.extra if hasattr(log, 'extra') else None
    )

    try:
        text = stream.decode()
    except UnicodeDecodeError:
        text = stream.decode('utf-8', 'ignore')

    dropped_prefixes = ("Tesseract Open Source", "Warning in pixReadMem")
    for line in text.splitlines():
        if line.startswith(dropped_prefixes):
            continue

        lowered = line.lower()
        if 'diacritics' in line:
            tlog.warning("lots of diacritics - possibly poor OCR")
        elif line.startswith('OSD: Weak margin'):
            tlog.warning("unsure about page orientation")
        elif 'Error in pixScanForForeground' in line:
            # Appears to be spurious/problem with nonwhite borders
            continue
        elif 'Error in boxClipToRectangle' in line:
            # Always appears with pixScanForForeground message
            continue
        elif 'parameter not found: ' in lowered:
            tlog.error(line.strip())
            problem = line.split('found: ')[1]
            raise TesseractConfigError(problem)
        elif 'error' in lowered or 'exception' in lowered:
            tlog.error(line.strip())
        elif 'warning' in lowered:
            tlog.warning(line.strip())
        elif 'read_params_file' in lowered:
            tlog.error(line.strip())
        else:
            tlog.info(line.strip())
def page_timedout(timeout):
    """Warn that OCR of a page timed out, unless ``timeout`` equals 0."""
    if timeout != 0:
        log.warning("[tesseract] took too long to OCR - skipping")
def _generate_null_hocr(output_hocr, output_text, image):
    """Write an hOCR file with no text, sized like ``image``.

    Args:
        output_hocr: Path that receives ``HOCR_TEMPLATE`` filled with the
            image's width and height.
        output_text: Path that receives the text ``[skipped page]``.
        image: Image file whose pixel size is used for the page box.
    """
    with Image.open(image) as opened:
        width, height = opened.size

    output_hocr.write_text(HOCR_TEMPLATE.format(width, height), encoding='utf-8')
    output_text.write_text('[skipped page]', encoding='utf-8')
def generate_hocr(
    *,
    input_file: Path,
    output_hocr: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Run Tesseract on an image to produce hOCR and a sidecar text file.

    Args:
        input_file: Image to analyze.
        output_hocr: hOCR file to generate; its name without suffix is the
            output prefix given to Tesseract.
        output_text: Where the sidecar text ends up.
        languages: Languages passed to ``tess_base_args``.
        engine_mode: Engine mode passed to ``tess_base_args``.
        tessconfig: Extra arguments appended to the command line.
        timeout: Subprocess timeout passed to ``run``.
        pagesegmode: Passed after ``--psm`` unless it is None.
        user_words: Passed after ``--user-words`` when truthy.
        user_patterns: Passed after ``--user-patterns`` when truthy.

    Raises:
        SubprocessOutputError: If Tesseract fails for a reason other than
            "Image too large".
    """
    prefix = output_hocr.with_suffix('')

    command = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        command += ['--psm', str(pagesegmode)]
    if user_words:
        command += ['--user-words', user_words]
    if user_patterns:
        command += ['--user-patterns', user_patterns]

    # Reminder: test suite tesseract test plugins will break after any changes
    # to the number or order of parameters here
    command += [input_file, prefix, 'hocr', 'txt'] + tessconfig

    try:
        completed = run(
            command, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True
        )
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(timeout)
        _generate_null_hocr(output_hocr, output_text, input_file)
        return
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' not in e.output:
            raise SubprocessOutputError() from e
        _generate_null_hocr(output_hocr, output_text, input_file)
        return

    tesseract_log_output(completed.stdout)
    # The sidecar text file will get the suffix .txt; rename it to
    # whatever caller wants it named
    sidecar = prefix.with_suffix('.txt')
    if sidecar.exists():
        shutil.move(sidecar, output_text)
def use_skip_page(output_pdf, output_text):
    """Mark a page as skipped.

    Writes ``[skipped page]`` to ``output_text`` and then makes ``output_pdf``
    a zero-byte file, which serves as the skip marker.
    """
    placeholder = '[skipped page]'
    output_text.write_text(placeholder, encoding='utf-8')
    # A 0 byte file to the output to indicate a skip
    output_pdf.write_bytes(b'')
def generate_pdf(
    *,
    input_file: Path,
    output_pdf: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Use Tesseract to render a text-only PDF and a sidecar text file.

    Args:
        input_file: Image to analyze.
        output_pdf: PDF to generate; its path without extension is the output
            prefix given to Tesseract.
        output_text: Where the OCR text file ends up.
        languages: List of languages to consider.
        engine_mode: Engine mode argument for Tesseract.
        tessconfig: Extra Tesseract configuration appended to the command line.
        timeout: Timeout in seconds.
        pagesegmode: Passed after ``--psm`` unless it is None.
        user_words: Passed after ``--user-words`` when truthy.
        user_patterns: Passed after ``--user-patterns`` when truthy.

    Raises:
        SubprocessOutputError: If Tesseract fails for a reason other than
            "Image too large".
    """
    command = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        command += ['--psm', str(pagesegmode)]
    command += ['-c', 'textonly_pdf=1']
    if user_words:
        command += ['--user-words', user_words]
    if user_patterns:
        command += ['--user-patterns', user_patterns]

    prefix = os.path.splitext(output_pdf)[0]  # Tesseract appends suffixes
    sidecar = prefix + '.txt'

    # Reminder: test suite tesseract test plugins might break after any changes
    # to the number or order of parameters here
    command += [input_file, prefix, 'pdf', 'txt'] + tessconfig

    try:
        completed = run(
            command, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True
        )
        captured = completed.stdout
        # Give the sidecar the name the caller asked for.
        if os.path.exists(sidecar):
            shutil.move(sidecar, output_text)
    except TimeoutExpired:
        page_timedout(timeout)
        use_skip_page(output_pdf, output_text)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' not in e.output:
            raise SubprocessOutputError() from e
        use_skip_page(output_pdf, output_text)
        return
    else:
        tesseract_log_output(captured)

View File

@@ -0,0 +1,134 @@
# © 2015 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# unpaper documentation:
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
"""Interface to unpaper executable"""
import logging
import os
import shlex
from decimal import Decimal
from pathlib import Path
from subprocess import PIPE, STDOUT
from tempfile import TemporaryDirectory
from typing import List, Optional, Tuple, Union
from PIL import Image
from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError
from ocrmypdf.subprocess import get_version
from ocrmypdf.subprocess import run as external_run
# A DPI value may be given either as a Decimal or as a float.
DecFloat = Union[Decimal, float]
# Module-level logger for the unpaper wrapper.
log = logging.getLogger(__name__)
def version() -> str:
    """Look up the version of the ``unpaper`` program through ``get_version``."""
    program = 'unpaper'
    return get_version(program)
def _setup_unpaper_io(tmpdir: Path, input_file: Path) -> Tuple[Path, Path]:
    """Choose the input and output file names to hand to unpaper.

    The image is converted when its mode is not one of ``1``, ``L`` or
    ``RGB``: a palette image with exactly two colors becomes ``1``, anything
    else becomes ``RGB``. The output suffix follows the resulting mode.

    Args:
        tmpdir: Directory in which the temporary files are placed.
        input_file: Image that unpaper should process.

    Returns:
        ``(input_pnm, output_pnm)``. The input is ``input_file`` itself when
        no conversion happened and it already has the ``.pnm`` suffix;
        otherwise it is a copy saved as ``input.pnm`` in ``tmpdir``.

    Raises:
        MissingDependencyError: If the conversion raises ``IOError`` or the
            resulting mode has no known suffix.
    """
    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    with Image.open(input_file) as im:
        converted = False
        if im.mode not in SUFFIXES:
            log.info("Converting image to other colorspace")
            try:
                two_color_palette = im.mode == 'P' and len(im.getcolors()) == 2
                im = im.convert(mode='1' if two_color_palette else 'RGB')
            except IOError as e:
                raise MissingDependencyError(
                    "Could not convert image with type " + im.mode
                ) from e
            converted = True

        try:
            suffix = SUFFIXES[im.mode]
        except KeyError:
            raise MissingDependencyError(
                "Failed to convert image to a supported format."
            ) from None

        reuse_input = not converted and input_file.suffix == '.pnm'
        if reuse_input:
            # No changes, PNM input, just use the file we already have
            input_pnm = input_file
        else:
            input_pnm = tmpdir / 'input.pnm'
            im.save(input_pnm, format='PPM')

        output_pnm = tmpdir / f'output{suffix}'
    return input_pnm, output_pnm
def run(
    input_file: Path, output_file: Path, *, dpi: DecFloat, mode_args: List[str]
) -> None:
    """Run unpaper on an image and save the result with the given DPI.

    Args:
        input_file: Image to process.
        output_file: Where the processed image is saved.
        dpi: Resolution; rounded to 6 places for the ``--dpi`` argument and
            recorded unrounded on both axes of the saved image.
        mode_args: Additional unpaper arguments placed after ``--dpi``.

    Raises:
        SubprocessOutputError: If unpaper's output file cannot be opened or
            saved afterwards.
    """
    command = ['unpaper', '-v', '--dpi', str(round(dpi, 6))] + mode_args

    with TemporaryDirectory() as workdir:
        input_pnm, output_pnm = _setup_unpaper_io(Path(workdir), input_file)

        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        command += [os.fspath(input_pnm), os.fspath(output_pnm)]
        external_run(
            command,
            close_fds=True,
            check=True,
            stderr=STDOUT,  # unpaper writes logging output to stdout and stderr
            stdout=PIPE,  # and cannot send file output to stdout
            cwd=workdir,
            logs_errors_to_stdout=True,
        )

        try:
            with Image.open(output_pnm) as result:
                result.save(output_file, dpi=(dpi, dpi))
        except (FileNotFoundError, OSError):
            raise SubprocessOutputError(
                "unpaper: failed to produce the expected output file. "
                + " Called with: "
                + str(command)
            ) from None
def validate_custom_args(args: str) -> List[str]:
    """Split a ``--unpaper-args`` string and reject anything path-like.

    Args:
        args: Argument string, split with shell-like quoting rules.

    Returns:
        The list of individual arguments.

    Raises:
        ValueError: If any argument contains ``/`` or is exactly ``.`` or
            ``..``.
    """
    unpaper_args = shlex.split(args)
    for candidate in unpaper_args:
        if '/' in candidate or candidate in ('.', '..'):
            raise ValueError('No filenames allowed in --unpaper-args')
    return unpaper_args
def clean(
    input_file: Path,
    output_file: Path,
    *,
    dpi: DecFloat,
    unpaper_args: Optional[List[str]] = None,
):
    """Clean an image with unpaper.

    Args:
        input_file: Image to clean.
        output_file: Where the cleaned image is saved.
        dpi: Resolution forwarded to ``run``.
        unpaper_args: Arguments to use instead of the defaults. None or an
            empty list selects the defaults below.
    """
    default_args = [
        '--layout',
        'none',
        '--mask-scan-size',
        '100',  # don't blank out narrow columns
        '--no-border-align',  # don't align visible content to borders
        '--no-mask-center',  # don't center visible content within page
        '--no-grayfilter',  # don't remove light gray areas
        '--no-blackfilter',  # don't remove solid black areas
        '--no-deskew',  # don't deskew
    ]
    chosen_args = unpaper_args if unpaper_args else default_args
    run(input_file, output_file, dpi=dpi, mode_args=chosen_args)

314
src/ocrmypdf/_graft.py Normal file
View File

@@ -0,0 +1,314 @@
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
import uuid
from contextlib import suppress
from pathlib import Path
from typing import Optional
import pikepdf
from pikepdf.objects import Dictionary, Name
# Module-level logger for the grafting step.
log = logging.getLogger(__name__)
# OcrGrafter saves and reopens its working PDF when its emplacement counter
# is a multiple of this value.
MAX_REPLACE_PAGES = 100
def _ensure_dictionary(obj, name):
if name not in obj:
obj[name] = Dictionary({})
return obj[name]
def _update_resources(*, obj, font, font_key, procset):
    """Register the font and procset in the resources of ``obj``.

    ``obj`` can be a page or a Form XObject. Its ``/Resources`` and the nested
    ``/Font`` dictionary are created when missing.

    Args:
        obj: Object whose resources are updated.
        font: Font object stored under ``font_key``.
        font_key: Resource name for the font. Nothing is stored when it is
            None or already present in the font dictionary.
        procset: When truthy, assigned to ``/ProcSet``.
    """
    resources = _ensure_dictionary(obj, Name.Resources)
    fonts = _ensure_dictionary(resources, Name.Font)

    font_missing = font_key is not None and font_key not in fonts
    if font_missing:
        fonts[font_key] = font

    # Reassign /ProcSet to one that just lists everything - ProcSet is
    # obsolete and doesn't matter but recommended for old viewer support
    if procset:
        resources['/ProcSet'] = procset
def strip_invisible_text(pdf, page):
    """Rewrite the content stream of ``page`` without invisible text objects.

    Every ``BT`` ... ``ET`` text object whose last ``Tr`` operand is 3 is
    dropped; all other instructions are kept in order. The rebuilt stream
    replaces ``page.Contents``.

    Args:
        pdf: The Pdf that will own the new content stream.
        page: The page to rewrite; its contents are coalesced first.
    """
    page.page_contents_coalesce()

    begin_text = pikepdf.Operator('BT')
    end_text = pikepdf.Operator('ET')
    set_render_mode = pikepdf.Operator('Tr')
    inline_image = pikepdf.Operator('INLINE IMAGE')

    kept = []  # instructions that survive
    pending = []  # instructions of the text object currently being read
    inside_text = False
    render_mode = 0

    for operands, operator in pikepdf.parse_content_stream(page, ''):
        instruction = (operands, operator)
        if inside_text:
            if operator == set_render_mode:
                render_mode = operands[0]
            pending.append(instruction)
            if operator == end_text:
                inside_text = False
                # Keep the whole text object unless it was rendered invisibly.
                if render_mode != 3:
                    kept.extend(pending)
                pending.clear()
        elif operator == begin_text:
            inside_text = True
            render_mode = 0
            pending.append(instruction)
        else:
            kept.append(instruction)

    def as_bytes(operand):
        # Operands that cannot unparse themselves are written via str().
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode('ascii')

    def serialize(operands, operator):
        if operator == inline_image:
            return operands[0].unparse()
        return b' '.join(as_bytes(op) for op in operands) + b' ' + operator.unparse()

    content_stream = b'\n'.join(
        serialize(operands, operator) for operands, operator in kept
    )
    page.Contents = pikepdf.Stream(pdf, content_stream)
class OcrGrafter:
    """Combine replacement page images and OCR text layers into the base PDF.

    The base PDF is opened from ``context.origin`` and updated page by page
    through ``graft_page``. ``finalize`` saves the result to the
    ``graft_layers.pdf`` path provided by the context.
    """

    def __init__(self, context):
        """Open the base PDF and set up shared state.

        Args:
            context: Supplies ``origin``, ``pdfinfo``, ``options`` and
                ``get_path``.
        """
        self.context = context
        self.path_base = context.origin
        self.pdf_base = pikepdf.open(self.path_base)
        self.font, self.font_key = None, None
        self.pdfinfo = context.pdfinfo
        self.output_file = context.get_path('graft_layers.pdf')
        self.procset = self.pdf_base.make_indirect(
            pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
        )
        self.emplacements = 1
        self.interim_count = 0

    def graft_page(
        self,
        *,
        pageno: int,
        image: Optional[Path],
        textpdf: Optional[Path],
        autorotate_correction: int,
    ):
        """Update one page of the base PDF.

        Args:
            pageno: Zero-based index of the page in the base PDF.
            image: Optional PDF whose first page replaces the page content.
                Ignored when it resolves to the base file itself.
            textpdf: Optional PDF whose first page holds the OCR text layer.
            autorotate_correction: Rotation correction, in degrees, that was
                determined for this page.
        """
        if textpdf and not self.font:
            self.font, self.font_key = self._find_font(textpdf)

        emplaced_page = False
        content_rotation = self.pdfinfo[pageno].rotation
        path_image = Path(image).resolve() if image else None
        if path_image is not None and path_image != self.path_base:
            # We are updating the old page with a rasterized PDF of the new
            # page (without changing objgen, to preserve references)
            log.debug("Emplacement update")
            with pikepdf.open(image) as pdf_image:
                self.emplacements += 1
                foreign_image_page = pdf_image.pages[0]
                # Append to obtain a local copy, emplace it over the target
                # page, then remove the temporary appended page again.
                self.pdf_base.pages.append(foreign_image_page)
                local_image_page = self.pdf_base.pages[-1]
                self.pdf_base.pages[pageno].emplace(local_image_page)
                del self.pdf_base.pages[-1]
            emplaced_page = True

        # Calculate if the text is misaligned compared to the content
        if emplaced_page:
            content_rotation = autorotate_correction
        text_rotation = autorotate_correction
        text_misaligned = (text_rotation - content_rotation) % 360
        log.debug(
            f"Text rotation: (text, autorotate, content) -> text misalignment = "
            f"({text_rotation}, {autorotate_correction}, {content_rotation}) -> {text_misaligned}"
        )

        if textpdf and self.font:
            # Graft the text layer onto this page, whether new or old, possibly
            # rotating the text layer by the amount it is misaligned.
            strip_old = self.context.options.redo_ocr
            self._graft_text_layer(
                page_num=pageno + 1,
                textpdf=textpdf,
                font=self.font,
                font_key=self.font_key,
                text_rotation=text_misaligned,
                procset=self.procset,
                strip_old_text=strip_old,
            )

        # Correct the overall page rotation if needed, now that the text and content
        # are aligned
        page_rotation = (content_rotation - autorotate_correction) % 360
        self.pdf_base.pages[pageno].Rotate = page_rotation
        log.debug(
            f"Page rotation: (content, auto) -> page = "
            f"({content_rotation}, {autorotate_correction}) -> {page_rotation}"
        )

        # Only check the counter when this call actually changed it. Otherwise
        # a counter resting on a multiple of MAX_REPLACE_PAGES would trigger a
        # full save and reload for every following page that is not emplaced.
        if emplaced_page and self.emplacements % MAX_REPLACE_PAGES == 0:
            self.save_and_reload()

    def save_and_reload(self):
        """Save and reload the Pdf.

        This will keep a lid on our memory usage for very large files. Attach
        the font to page 1 even if page 1 doesn't use it, so we have a way to get it
        back.
        """
        page0 = self.pdf_base.pages[0]
        _update_resources(
            obj=page0, font=self.font, font_key=self.font_key, procset=self.procset
        )

        # We cannot read and write the same file, that will corrupt it
        # but we don't want to keep more copies than we need to. Delete
        # intermediates.
        # {interim_count} is the opened file we were updating
        # {interim_count - 1} can be deleted
        # {interim_count + 1} is the new file we will produce and open
        old_file = self.output_file.with_suffix(f'.working{self.interim_count - 1}.pdf')
        if not self.context.options.keep_temporary_files:
            with suppress(FileNotFoundError):
                old_file.unlink()

        next_file = self.output_file.with_suffix(
            f'.working{self.interim_count + 1}.pdf'
        )
        self.pdf_base.save(next_file)
        self.pdf_base.close()

        self.pdf_base = pikepdf.open(next_file)
        self.procset = self.pdf_base.pages[0].Resources.ProcSet
        self.font, self.font_key = None, None  # Ensure we reacquire this information
        self.interim_count += 1

    def finalize(self):
        """Save the base PDF to the output file, close it, and return the path."""
        self.pdf_base.save(self.output_file)
        self.pdf_base.close()
        return self.output_file

    def _find_font(self, text):
        """Copy a font from the filename text into pdf_base.

        Looks on the first page of the text PDF for a font resource named
        ``/f-0-0`` or ``/F1``, in that order.

        Returns:
            ``(font, font_key)``. Both are None when the file is missing,
            cannot be opened as a PDF, or its first page has no usable
            resources.
        """
        font, font_key = None, None
        possible_font_names = ('/f-0-0', '/F1')
        try:
            with pikepdf.open(text) as pdf_text:
                try:
                    pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {})
                except (AttributeError, IndexError, KeyError):
                    return None, None
                pdf_text_font = None
                for f in possible_font_names:
                    pdf_text_font = pdf_text_fonts.get(f, None)
                    if pdf_text_font is not None:
                        font_key = f
                        break
                if pdf_text_font:
                    font = self.pdf_base.copy_foreign(pdf_text_font)
                return font, font_key
        except (FileNotFoundError, pikepdf.PdfError):
            # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
            return None, None

    def _graft_text_layer(
        self,
        *,
        page_num: int,
        textpdf: Path,
        font: pikepdf.Object,
        font_key: pikepdf.Object,
        procset: pikepdf.Object,
        text_rotation: int,
        strip_old_text: bool,
    ):
        """Insert the text layer from text page 0 on to pdf_base at page_num.

        Args:
            page_num: One-based page number in the base PDF.
            textpdf: PDF holding the text layer. A zero-byte file is skipped.
            font: Font object to register on the page and the new XObject.
            font_key: Resource name under which ``font`` is registered.
            procset: ProcSet assigned to the page's resources.
            text_rotation: Clockwise angle by which the text is rotated to
                match the page content.
            strip_old_text: When true, existing invisible text is removed from
                the page before the new layer is added.
        """
        log.debug("Grafting")
        if Path(textpdf).stat().st_size == 0:
            return

        # This is a pointer indicating a specific page in the base file
        with pikepdf.open(textpdf) as pdf_text:
            pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

            base_page = self.pdf_base.pages.p(page_num)

            # The text page always will be oriented up by this stage but the original
            # content may have a rotation applied. Wrap the text stream with a rotation
            # so it will be oriented the same way as the rest of the page content.
            # (Previous versions of OCRmyPDF rotated the content layer to match
            # the text.)
            mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
            wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            # From here on mediabox refers to the base page, not the text page.
            mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
            wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
            untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
            corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
            # -rotation because the input is a clockwise angle and this formula
            # uses CCW
            text_rotation = -text_rotation % 360
            rotate = pikepdf.PdfMatrix().rotated(text_rotation)

            # Because of rounding of DPI, we might get a text layer that is not
            # identically sized to the target page. Scale to adjust. Normally this
            # is within 0.998.
            if text_rotation in (90, 270):
                wt, ht = ht, wt
            scale_x = wp / wt
            scale_y = hp / ht

            # log.debug('%r', scale_x, scale_y)
            scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

            # Translate the text so it is centered at (0, 0), rotate it there, adjust
            # for a size different between initial and text PDF, then untranslate, and
            # finally move the lower left corner to match the mediabox
            ctm = translate @ rotate @ scale @ untranslate @ corner

            base_resources = _ensure_dictionary(base_page, Name.Resources)
            base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
            # A random name for the new Form XObject.
            text_xobj_name = Name('/' + str(uuid.uuid4()))
            xobj = self.pdf_base.make_stream(pdf_text_contents)
            base_xobjs[text_xobj_name] = xobj
            xobj.Type = Name.XObject
            xobj.Subtype = Name.Form
            xobj.FormType = 1
            xobj.BBox = mediabox
            _update_resources(
                obj=xobj, font=font, font_key=font_key, procset=[Name.PDF]
            )

            pdf_draw_xobj = (
                (b'q %s cm\n' % ctm.encode()) + (b'%s Do\n' % text_xobj_name) + b'\nQ\n'
            )
            new_text_layer = pikepdf.Stream(self.pdf_base, pdf_draw_xobj)

            if strip_old_text:
                strip_invisible_text(self.pdf_base, base_page)

            base_page.page_contents_add(new_text_layer, prepend=True)

            _update_resources(
                obj=base_page, font=font, font_key=font_key, procset=procset
            )

View File

@@ -1,83 +1,103 @@
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import os
import shutil
import sys
from contextlib import suppress
from multiprocessing.managers import SyncManager
from argparse import Namespace
from copy import copy
from pathlib import Path
from typing import Iterator
from .pdfinfo import PdfInfo
from pluggy import PluginManager
from ocrmypdf.pdfinfo import PdfInfo
from ocrmypdf.pdfinfo.info import PageInfo
class JobContext:
"""Holds our context for a particular run of the pipeline
class PdfContext:
"""Holds the context for a particular run of the pipeline."""
A multiprocessing manager effectively creates a separate process
that keeps the master job context object. Other threads access
job context via multiprocessing proxy objects.
options: Namespace #: The specified options for processing this PDF.
origin: Path #: The filename of the original input file.
pdfinfo: PdfInfo #: Detailed data for this PDF.
plugin_manager: PluginManager #: PluginManager for processing the current PDF.
While this would naturally lend itself @property's it seems to make
a little more sense to use functions to make it explicitly that the
invocation requires marshalling data across a process boundary.
def __init__(
self,
options: Namespace,
work_folder: Path,
origin: Path,
pdfinfo: PdfInfo,
plugin_manager,
):
self.options = options
self.work_folder = work_folder
self.origin = origin
self.pdfinfo = pdfinfo
self.plugin_manager = plugin_manager
def get_path(self, name: str) -> Path:
"""Generate a ``Path`` for an intermediate file involved in processing.
The path will be in a temporary folder that is common for all processing
of this particular PDF.
"""
return self.work_folder / name
def get_page_contexts(self) -> Iterator['PageContext']:
"""Get all ``PageContext`` for this PDF."""
npages = len(self.pdfinfo)
for n in range(npages):
yield PageContext(self, n)
class PageContext:
"""Holds our context for a page.
Must be picklable, so stores only intrinsic/simple data elements or those
capable of serializing themselves via ``__getstate__``.
"""
def __init__(self):
self.pdfinfo = None
self.options = None
self.work_folder = None
self.rotations = {}
options: Namespace #: The specified options for processing this PDF.
origin: Path #: The filename of the original input file.
pageno: int #: This page number (zero-based).
pageinfo: PageInfo #: Information on this page.
plugin_manager: PluginManager #: PluginManager for processing the current PDF.
def generate_pdfinfo(self, infile):
self.pdfinfo = PdfInfo(infile)
def __init__(self, pdf_context: PdfContext, pageno):
self.work_folder = pdf_context.work_folder
self.origin = pdf_context.origin
self.options = pdf_context.options
self.pageno = pageno
self.pageinfo = pdf_context.pdfinfo[pageno]
self.plugin_manager = pdf_context.plugin_manager
def get_pdfinfo(self):
"What we know about the input PDF"
return self.pdfinfo
def get_path(self, name: str) -> Path:
"""Generate a ``Path`` for a file that is part of processing this page.
def set_pdfinfo(self, pdfinfo):
self.pdfinfo = pdfinfo
The path will be based in a common temporary folder and have a prefix based
on the page number.
"""
return self.work_folder / ("%06d_%s" % (self.pageno + 1, name))
def get_options(self):
return self.options
def __getstate__(self):
state = self.__dict__.copy()
def set_options(self, options):
self.options = options
def get_work_folder(self):
return self.work_folder
def set_work_folder(self, work_folder):
self.work_folder = work_folder
def get_rotation(self, pageno):
return self.rotations.get(pageno, 0)
def set_rotation(self, pageno, value):
self.rotations[pageno] = value
state['options'] = copy(self.options)
if not isinstance(state['options'].input_file, (str, bytes, os.PathLike)):
state['options'].input_file = 'stream'
if not isinstance(state['options'].output_file, (str, bytes, os.PathLike)):
state['options'].output_file = 'stream'
return state
class JobContextManager(SyncManager):
pass
def cleanup_working_files(work_folder, options):
def cleanup_working_files(work_folder: Path, options: Namespace):
if options.keep_temporary_files:
print(f"Temporary working files saved at:\n{work_folder}", file=sys.stderr)
print(f"Temporary working files retained at:\n{work_folder}", file=sys.stderr)
else:
with suppress(FileNotFoundError):
shutil.rmtree(work_folder)
shutil.rmtree(work_folder, ignore_errors=True)

50
src/ocrmypdf/_logging.py Normal file
View File

@@ -0,0 +1,50 @@
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
import sys
from contextlib import suppress
from tqdm import tqdm
class PageNumberFilter(logging.Filter):
    """Logging filter that normalizes ``record.pageno`` for use in format strings.

    It never rejects a record; it only rewrites the ``pageno`` attribute so a
    formatter containing ``%(pageno)s`` always has something to print.
    """

    def filter(self, record):
        """Rewrite ``record.pageno`` in place and accept the record.

        An integer becomes a 5-wide right-aligned number followed by a space,
        a missing or ``None`` value becomes the empty string, and any other
        value is left untouched.
        """
        current = getattr(record, 'pageno', None)
        if current is None:
            record.pageno = ''
        elif isinstance(current, int):
            record.pageno = f'{current:5d} '
        return True
class TqdmConsole:
    """File-like wrapper that sends log output through ``tqdm.write``.

    Routing messages through tqdm lets it print them above an active progress
    bar and then redraw the bar, instead of the message overwriting the bar.

    Python 3.6 occasionally emits empty messages; those are dropped.
    """

    def __init__(self, file):
        self.file = file
        # Remember whether the blank-message workaround is needed.
        self.py36 = sys.version_info[0:2] == (3, 6)

    def write(self, msg):
        """Write ``msg`` (trailing whitespace stripped) followed by a newline."""
        # When no progress bar is active, tqdm.write() routes to print()
        if self.py36 and msg.strip() == '':
            return
        tqdm.write(msg.rstrip(), end='\n', file=self.file)

    def flush(self):
        """Flush the wrapped file, ignoring objects that have no ``flush``."""
        try:
            self.file.flush()
        except AttributeError:
            pass

View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,122 @@
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
import importlib
import importlib.util
import pkgutil
import sys
from pathlib import Path
from typing import List, Tuple, Union
import pluggy
import ocrmypdf.builtin_plugins
from ocrmypdf import pluginspec
from ocrmypdf.cli import get_parser, plugins_only_parser
class OcrmypdfPluginManager(pluggy.PluginManager):
    """pluggy.PluginManager that can fork.

    Capable of reconstructing itself in child workers: pickling stores only
    the constructor arguments, and unpickling re-runs ``__init__`` (and
    therefore ``setup_plugins``) with them.

    Arguments:
        *args: Positional arguments forwarded to ``pluggy.PluginManager``.
        plugins: Extra plugins to register. A ``Path`` or a string ending in
            ``.py`` is imported by filename; any other string is imported as
            a dotted module name.
        builtins: If true, register every module found in
            ``ocrmypdf.builtin_plugins``.
        **kwargs: Keyword arguments forwarded to ``pluggy.PluginManager``.
    """

    def __init__(
        self,
        *args,
        plugins: List[Union[str, Path]],
        builtins: bool = True,
        **kwargs,
    ):
        # Keep the constructor arguments so __getstate__ can hand them to
        # __setstate__ for reconstruction.
        self.__init_args = args
        self.__init_kwargs = kwargs
        self.__plugins = plugins
        self.__builtins = builtins
        super().__init__(*args, **kwargs)
        self.setup_plugins()

    def __getstate__(self):
        """Return only the constructor arguments as the pickled state."""
        state = dict(
            init_args=self.__init_args,
            plugins=self.__plugins,
            builtins=self.__builtins,
            init_kwargs=self.__init_kwargs,
        )
        return state

    def __setstate__(self, state):
        """Rebuild the manager by calling ``__init__`` with the saved arguments."""
        self.__init__(
            *state['init_args'],
            plugins=state['plugins'],
            builtins=state['builtins'],
            **state['init_kwargs'],
        )

    def setup_plugins(self):
        """Add the hook specifications and register plugins from all sources."""
        self.add_hookspecs(pluginspec)
        # 1. Register builtins
        if self.__builtins:
            for module in sorted(
                pkgutil.iter_modules(ocrmypdf.builtin_plugins.__path__)
            ):
                name = f'ocrmypdf.builtin_plugins.{module.name}'
                module = importlib.import_module(name)
                self.register(module)
        # 2. Install semfree if needed
        try:
            # Import only to probe availability; the name itself is not used.
            # pylint: disable=import-outside-toplevel
            from multiprocessing.synchronize import SemLock
            del SemLock
        except ImportError:
            self.register(importlib.import_module('ocrmypdf.extra_plugins.semfree'))
        # 3. Register setuptools plugins
        self.load_setuptools_entrypoints('ocrmypdf')
        # 4. Register plugins specified on command line
        for name in self.__plugins:
            if isinstance(name, Path) or name.endswith('.py'):
                # Import by filename
                module_name = Path(name).stem
                spec = importlib.util.spec_from_file_location(module_name, name)
                module = importlib.util.module_from_spec(spec)
                # Publish in sys.modules under the file's stem before executing it.
                sys.modules[module_name] = module
                spec.loader.exec_module(module)
            else:
                # Import by dotted module name
                module = importlib.import_module(name)
            self.register(module)
def get_plugin_manager(plugins: List[Union[str, Path]], builtins=True):
    """Build the plugin manager for the ``ocrmypdf`` project.

    Arguments:
        plugins: Additional plugins to register, as paths or module names.
        builtins: Whether the builtin plugins should be registered.

    Returns:
        A newly constructed ``OcrmypdfPluginManager``.
    """
    return OcrmypdfPluginManager(
        project_name='ocrmypdf', plugins=plugins, builtins=builtins
    )
def get_parser_options_plugins(
    args,
) -> Tuple[argparse.ArgumentParser, argparse.Namespace, pluggy.PluginManager]:
    """Parse command line arguments in two passes so plugins can add options.

    Arguments:
        args: The argument list to parse.

    Returns:
        A tuple of the full parser, the parsed options and the plugin manager.
    """
    # First pass: only find out which plugins were requested; other
    # arguments are tolerated and ignored here.
    pre_options, _unused = plugins_only_parser.parse_known_args(args=args)
    plugin_manager = get_plugin_manager(pre_options.plugins)
    parser = get_parser()
    # Let every registered plugin contribute its own arguments before the
    # real parse.
    plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member
    options = parser.parse_args(args=args)
    return parser, options, plugin_manager

426
src/ocrmypdf/_sync.py Normal file
View File

@@ -0,0 +1,426 @@
# © 2016 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
import logging.handlers
import os
import sys
import threading
from functools import partial
from pathlib import Path
from tempfile import mkdtemp
from typing import List, NamedTuple, Optional, Tuple
import PIL
from ocrmypdf._concurrent import Executor, setup_executor
from ocrmypdf._graft import OcrGrafter
from ocrmypdf._jobcontext import PageContext, PdfContext, cleanup_working_files
from ocrmypdf._logging import PageNumberFilter
from ocrmypdf._pipeline import (
convert_to_pdfa,
copy_final,
create_ocr_image,
create_pdf_page_from_image,
create_visible_page_jpg,
generate_postscript_stub,
get_orientation_correction,
get_pdfinfo,
is_ocr_required,
merge_sidecars,
metadata_fixup,
ocr_engine_hocr,
ocr_engine_textonly_pdf,
optimize_pdf,
preprocess_clean,
preprocess_deskew,
preprocess_remove_background,
rasterize,
rasterize_preview,
render_hocr_page,
should_visible_page_image_use_jpg,
triage,
validate_pdfinfo_options,
)
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf._validation import (
check_requested_output_file,
create_input_file,
report_output_file_size,
)
from ocrmypdf.exceptions import ExitCode, ExitCodeException
from ocrmypdf.helpers import (
NeverRaise,
available_cpu_count,
check_pdf,
pikepdf_enable_mmap,
samefile,
)
from ocrmypdf.pdfa import file_claims_pdfa
log = logging.getLogger(__name__)
class PageResult(NamedTuple):  # pylint: disable=inherit-non-class
    """Outputs of processing one page, as returned by ``exec_page_sync``."""

    # Page number copied from the PageContext that produced this result.
    pageno: int
    # Page PDF built from the visible image; None when not produced.
    pdf_page_from_image: Optional[Path]
    # OCR output for the page; None when OCR was not required.
    ocr: Optional[Path]
    # Text output for the page; None when OCR was not required.
    text: Optional[Path]
    # Orientation correction determined for the page (0 if none).
    orientation_correction: int
# Thread-local storage for the page number currently being processed.
tls = threading.local()
# Only sets the attribute for the thread that runs this module-level code.
tls.pageno = None
old_factory = logging.getLogRecordFactory()
def record_factory(*args, **kwargs):
    """Create a log record and tag it with the current thread's page number."""
    record = old_factory(*args, **kwargs)
    # Threads that never assigned tls.pageno have no such attribute.
    if hasattr(tls, 'pageno'):
        record.pageno = tls.pageno
    return record
# Installed at import time; affects every record created in this process.
logging.setLogRecordFactory(record_factory)
def preprocess(
    page_context: PageContext,
    image: Path,
    remove_background: bool,
    deskew: bool,
    clean: bool,
) -> Path:
    """Run the enabled preprocessing steps on ``image`` and return the result.

    Steps run in a fixed order (background removal, deskew, clean) and each
    enabled step receives the output of the previous one. With every flag
    false the input image is returned unchanged.
    """
    pipeline = (
        (remove_background, preprocess_remove_background),
        (deskew, preprocess_deskew),
        (clean, preprocess_clean),
    )
    for enabled, step in pipeline:
        if enabled:
            image = step(image, page_context)
    return image
def make_intermediate_images(
    page_context: PageContext, orientation_correction: int
) -> Tuple[Path, Optional[Path]]:
    """Produce the image used for OCR and, if needed, the visible page image.

    Arguments:
        page_context: Context of the page being processed.
        orientation_correction: Correction passed to ``rasterize``.

    Returns:
        ``(ocr_image, preprocess_out)``. ``preprocess_out`` stays ``None``
        when a cleaning or vector-removal option is set and
        ``options.lossless_reconstruction`` is true.
    """
    options = page_context.options
    ocr_image = preprocess_out = None
    rasterize_out = rasterize(
        page_context.origin,
        page_context,
        correction=orientation_correction,
        remove_vectors=False,
    )
    if not any([options.clean, options.clean_final, options.remove_vectors]):
        # Simple case: one preprocessed image serves both purposes.
        ocr_image = preprocess_out = preprocess(
            page_context,
            rasterize_out,
            options.remove_background,
            options.deskew,
            clean=False,
        )
    else:
        if not options.lossless_reconstruction:
            # Visible image: cleaned only when --clean-final was requested.
            preprocess_out = preprocess(
                page_context,
                rasterize_out,
                options.remove_background,
                options.deskew,
                clean=options.clean_final,
            )
        if options.remove_vectors:
            # Separate rasterization for OCR, with vectors removed.
            rasterize_ocr_out = rasterize(
                page_context.origin,
                page_context,
                correction=orientation_correction,
                remove_vectors=True,
                output_tag='_ocr',
            )
        else:
            rasterize_ocr_out = rasterize_out
        if (
            preprocess_out
            and rasterize_ocr_out == rasterize_out
            and options.clean == options.clean_final
        ):
            # Optimization: image for OCR is identical to presentation image
            ocr_image = preprocess_out
        else:
            ocr_image = preprocess(
                page_context,
                rasterize_ocr_out,
                options.remove_background,
                options.deskew,
                clean=options.clean,
            )
    return ocr_image, preprocess_out
def exec_page_sync(page_context: PageContext):
    """Run the whole per-page pipeline for one page and return a ``PageResult``.

    Raises:
        NotImplementedError: if ``options.pdf_renderer`` is neither an
            ``hocr*`` renderer nor ``sandwich``.
    """
    options = page_context.options
    # Log records show page numbers one-based; page_context.pageno is zero-based.
    tls.pageno = page_context.pageno + 1
    if not is_ocr_required(page_context):
        # Nothing to do for this page: return an empty result.
        return PageResult(
            pageno=page_context.pageno,
            pdf_page_from_image=None,
            ocr=None,
            text=None,
            orientation_correction=0,
        )
    orientation_correction = 0
    if options.rotate_pages:
        # Rasterize
        rasterize_preview_out = rasterize_preview(page_context.origin, page_context)
        orientation_correction = get_orientation_correction(
            rasterize_preview_out, page_context
        )
    ocr_image, preprocess_out = make_intermediate_images(
        page_context, orientation_correction
    )
    ocr_image_out = create_ocr_image(ocr_image, page_context)
    pdf_page_from_image_out = None
    if not options.lossless_reconstruction:
        # The visible page is rebuilt from the preprocessed image.
        assert preprocess_out
        visible_image_out = preprocess_out
        if should_visible_page_image_use_jpg(page_context.pageinfo):
            visible_image_out = create_visible_page_jpg(visible_image_out, page_context)
        # Give plugins a chance to replace the visible image; a falsy hook
        # result keeps the current image.
        filtered_image = page_context.plugin_manager.hook.filter_page_image(
            page=page_context, image_filename=visible_image_out
        )
        if filtered_image:
            visible_image_out = filtered_image
        pdf_page_from_image_out = create_pdf_page_from_image(
            visible_image_out, page_context, orientation_correction
        )
    if options.pdf_renderer.startswith('hocr'):
        (hocr_out, text_out) = ocr_engine_hocr(ocr_image_out, page_context)
        ocr_out = render_hocr_page(hocr_out, page_context)
    elif options.pdf_renderer == 'sandwich':
        (ocr_out, text_out) = ocr_engine_textonly_pdf(ocr_image_out, page_context)
    else:
        raise NotImplementedError(f"pdf_renderer {options.pdf_renderer}")
    return PageResult(
        pageno=page_context.pageno,
        pdf_page_from_image=pdf_page_from_image_out,
        ocr=ocr_out,
        text=text_out,
        orientation_correction=orientation_correction,
    )
def post_process(pdf_file, context: PdfContext, executor: Executor):
    """Apply the whole-document finishing steps and return the optimized PDF.

    When the requested output type starts with ``pdfa`` the file is first
    converted to PDF/A using a generated PostScript stub. Metadata is then
    fixed up and the result is passed to the optimizer.
    """
    current = pdf_file
    wants_pdfa = context.options.output_type.startswith('pdfa')
    if wants_pdfa:
        stub = generate_postscript_stub(context)
        current = convert_to_pdfa(current, stub, context)
    fixed = metadata_fixup(current, context)
    return optimize_pdf(fixed, context, executor)
def worker_init(max_pixels: int):
    """Initialize a worker: set Pillow's pixel limit and call ``pikepdf_enable_mmap``.

    Arguments:
        max_pixels: Value to assign to ``PIL.Image.MAX_IMAGE_PIXELS``.
    """
    # In Windows, child process will not inherit our change to this value in
    # the parent process, so ensure workers get it set. Not needed when running
    # threaded, but harmless to set again.
    PIL.Image.MAX_IMAGE_PIXELS = max_pixels
    pikepdf_enable_mmap()
def exec_concurrent(context: PdfContext, executor: Executor):
    """Execute the pipeline concurrently

    Runs ``exec_page_sync`` for every page through ``executor``, grafts each
    page result as it completes, then writes the sidecar (if requested) and
    the post-processed PDF to their destinations.
    """
    # Run exec_page_sync on every page context
    options = context.options
    # Never ask for more workers than there are pages.
    max_workers = min(len(context.pdfinfo), options.jobs)
    if max_workers > 1:
        log.info("Start processing %d pages concurrently", max_workers)
    # One slot per page, filled in as page results arrive.
    sidecars: List[Optional[Path]] = [None] * len(context.pdfinfo)
    ocrgraft = OcrGrafter(context)
    def update_page(result: PageResult, pbar):
        """Record one finished page and graft it into the output."""
        try:
            tls.pageno = result.pageno + 1
            sidecars[result.pageno] = result.text
            # The bar counts two ticks per page (see total/unit_scale below):
            # one when the result arrives, one after grafting.
            pbar.update()
            ocrgraft.graft_page(
                pageno=result.pageno,
                image=result.pdf_page_from_image,
                textpdf=result.ocr,
                autorotate_correction=result.orientation_correction,
            )
            pbar.update()
        finally:
            tls.pageno = None
    executor(
        use_threads=options.use_threads,
        max_workers=max_workers,
        tqdm_kwargs=dict(
            total=(2 * len(context.pdfinfo)),
            desc='OCR' if options.tesseract_timeout > 0 else 'Image processing',
            unit='page',
            unit_scale=0.5,
            disable=not options.progress_bar,
        ),
        # Hand the current pixel limit to each worker (see worker_init).
        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=exec_page_sync,
        task_arguments=context.get_page_contexts(),
        task_finished=update_page,
    )
    # Output sidecar text
    if options.sidecar:
        text = merge_sidecars(sidecars, context)
        # Copy text file to destination
        copy_final(text, options.sidecar, context)
    # Merge layers to one single pdf
    pdf = ocrgraft.finalize()
    # PDF/A and metadata
    log.info("Postprocessing...")
    pdf = post_process(pdf, context, executor)
    # Copy PDF file to destination
    copy_final(pdf, options.output_file, context)
def configure_debug_logging(log_filename: Path, prefix: str = ''):
    """
    Create a debug log file at a specified location.

    Arguments:
        log_filename: Where to put the log file.
        prefix: The logging domain prefix that should be sent to the log.
            The handler is attached to ``logging.getLogger(prefix)``.

    Returns:
        The ``logging.FileHandler`` that was attached, so the caller can
        close and remove it later.
    """
    # delay=True: the file is not opened until the first record is emitted.
    log_file_handler = logging.FileHandler(log_filename, delay=True)
    log_file_handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '[%(asctime)s] - %(name)s - %(levelname)7s -%(pageno)s %(message)s'
    )
    log_file_handler.setFormatter(formatter)
    # The filter rewrites record.pageno so %(pageno)s above formats cleanly.
    log_file_handler.addFilter(PageNumberFilter())
    logging.getLogger(prefix).addHandler(log_file_handler)
    return log_file_handler
def run_pipeline(options, *, plugin_manager, api=False):
    """Run the complete OCR pipeline for one input file.

    Arguments:
        options: Parsed options. ``options.jobs`` is filled in with the
            available CPU count when it is unset.
        plugin_manager: Plugin manager to use; when falsy, one is created
            from ``options.plugins``.
        api: When true, exceptions propagate to the caller instead of being
            logged and converted to exit codes, and no debug log file is
            created.

    Returns:
        An ``ExitCode`` member; ``ExitCode.ok`` on success.
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)
    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))
    debug_log_handler = None
    if (
        (options.keep_temporary_files or options.verbose >= 1)
        and not os.environ.get('PYTEST_CURRENT_TEST', '')
        and not api
    ):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            Path(work_folder) / "debug.log"
        )  # pragma: no cover
    pikepdf_enable_mmap()
    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)
        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )
        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
            check_pages=options.pages,
        )
        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)
        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)
        # Execute the pipeline
        exec_concurrent(context, executor)
        if options.output_file == '-':
            log.info("Output sent to stdout")
        elif (
            hasattr(options.output_file, 'writable') and options.output_file.writable()
        ):
            log.info("Output written to stream")
        elif samefile(options.output_file, os.devnull):
            pass  # Say nothing when sending to dev null
        else:
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(options.output_file)
                if pdfa_info['pass']:
                    log.info(
                        "Output file is a %s (as expected)", pdfa_info['conformance']
                    )
                else:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
            if not check_pdf(options.output_file):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, options.output_file)
    # In API mode NeverRaise is substituted for each exception class below, so
    # that these handlers do not intercept anything and exceptions propagate.
    except KeyboardInterrupt if not api else NeverRaise:
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (ExitCodeException if not api else NeverRaise) as e:
        if str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except Exception if not api else NeverRaise:  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        if debug_log_handler:
            try:
                # configure_debug_logging() was called with its default
                # prefix, so the handler lives on the root logger. Detach it
                # from there (not from this module's logger, which never had
                # it) so later records are not sent to a closed handler.
                logging.getLogger().removeHandler(debug_log_handler)
                debug_log_handler.close()
            except EnvironmentError as e:
                print(e, file=sys.stderr)
        cleanup_working_files(work_folder, options)
    return ExitCode.ok

431
src/ocrmypdf/_validation.py Normal file
View File

@@ -0,0 +1,431 @@
#!/usr/bin/env python3
# © 2015-17 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import locale
import logging
import os
import sys
import unicodedata
from pathlib import Path
from shutil import copyfileobj
from typing import List, Set, Tuple, Union
import pikepdf
import PIL
from ocrmypdf._exec import jbig2enc, pngquant, unpaper
from ocrmypdf._unicodefun import verify_python3_env
from ocrmypdf.exceptions import (
BadArgsError,
InputFileError,
MissingDependencyError,
OutputFileAccessError,
)
from ocrmypdf.helpers import (
is_file_writable,
is_iterable_notstr,
monotonic,
safe_symlink,
)
from ocrmypdf.subprocess import check_external_program
# -------------
# External dependencies
HOCR_OK_LANGS = frozenset(['eng', 'deu', 'spa', 'ita', 'por'])
DEFAULT_LANGUAGE = 'eng' # Enforce English hegemony
log = logging.getLogger(__name__)
# --------
# Critical environment tests
verify_python3_env()
def check_platform():
    """Log an error when running in a 32-bit Python interpreter on Windows.

    Nothing is raised; the function only reports the problem.
    """
    if os.name == 'nt' and sys.maxsize <= 2 ** 32:  # pragma: no cover
        # 32-bit interpreter on Windows
        log.error(
            "You are running OCRmyPDF in a 32-bit (x86) Python interpreter. "
            "Please use a 64-bit (x86-64) version of Python."
        )
def check_options_languages(options, ocr_engine_languages):
    """Default the OCR language and verify the engine supports every request.

    When no language was requested, ``options.languages`` is set to the
    default language. If ``ocr_engine_languages`` is empty no verification is
    done.

    Raises:
        MissingDependencyError: a requested language is not in
            ``ocr_engine_languages``; the message lists the missing ones.
    """
    if not options.languages:
        options.languages = {DEFAULT_LANGUAGE}
        locale_name = locale.getlocale()[0]
        if locale_name and not locale_name.startswith('en'):
            log.debug("No language specified; assuming --language %s", DEFAULT_LANGUAGE)

    if not ocr_engine_languages:
        return

    if options.languages.issubset(ocr_engine_languages):
        return

    header = (
        "OCR engine does not have language data for the following "
        "requested languages: \n"
    )
    missing = options.languages - ocr_engine_languages
    raise MissingDependencyError(header + ''.join(lang + '\n' for lang in missing))
def check_options_output(options):
    """Validate renderer/language choice and derive ``lossless_reconstruction``.

    Side effects:
        Sets ``options.lossless_reconstruction`` to true only when none of
        deskew, clean_final, force_ocr and remove_background are set.

    Raises:
        BadArgsError: ``--redo-ocr`` was combined with an option that
            prevents lossless reconstruction.
    """
    all_langs_ok = options.languages.issubset(HOCR_OK_LANGS)
    if options.pdf_renderer.startswith('hocr') and not all_langs_ok:
        log.warning(
            "The 'hocr' PDF renderer is known to cause problems with one "
            "or more of the languages in your document.  Use "
            "--pdf-renderer auto (the default) to avoid this issue."
        )

    alters_page_image = any(
        (
            options.deskew,
            options.clean_final,
            options.force_ocr,
            options.remove_background,
        )
    )
    options.lossless_reconstruction = not alters_page_image

    if not options.lossless_reconstruction and options.redo_ocr:
        raise BadArgsError(
            "--redo-ocr is not currently compatible with --deskew, "
            "--clean-final, and --remove-background"
        )
def check_options_sidecar(options):
    """Resolve the default sidecar filename and reject conflicting paths.

    A sidecar value of ``'\\0'`` means that a sidecar was requested without a
    filename; in that case the name is derived from the output file by
    appending ``.txt``.

    Raises:
        BadArgsError: no sidecar filename was given while output goes to
            stdout, or the sidecar equals the input or output file.
    """
    if options.sidecar == '\0':
        if options.output_file == '-':
            raise BadArgsError(
                "--sidecar filename must be specified when output file is stdout."
            )
        # os.fspath() lets a pathlib.Path output file work here too; plain
        # string concatenation would raise TypeError for a Path.
        options.sidecar = os.fspath(options.output_file) + '.txt'
    if options.sidecar == options.input_file or options.sidecar == options.output_file:
        raise BadArgsError(
            "--sidecar file must be different from the input and output files"
        )
def check_options_preprocessing(options):
    """Validate the cleaning options and the custom unpaper arguments.

    ``--clean-final`` implies ``--clean``. When cleaning is enabled the
    ``unpaper`` program is checked for, and custom unpaper arguments are
    validated and replaced by their validated form.

    Raises:
        BadArgsError: ``--unpaper-args`` was given without ``--clean``, or
            the custom arguments failed validation.
    """
    if options.clean_final:
        options.clean = True

    if options.unpaper_args and not options.clean:
        raise BadArgsError("--clean is required for --unpaper-args")

    if not options.clean:
        return

    check_external_program(
        program='unpaper',
        package='unpaper',
        version_checker=unpaper.version,
        need_version='6.1',
        required_for=['--clean, --clean-final'],
    )
    try:
        if options.unpaper_args:
            validated = unpaper.validate_custom_args(options.unpaper_args)
            options.unpaper_args = validated
    except Exception as e:
        raise BadArgsError("--unpaper-args: " + str(e)) from e
def _pages_from_ranges(ranges: str) -> Set[int]:
    """Convert a page range string such as ``"1, 3-5"`` to zero-based page numbers.

    Arguments:
        ranges: Comma separated one-based pages and inclusive ``start-end``
            subranges; spaces are ignored. A non-string iterable is
            converted to a set as-is.

    Returns:
        The set of zero-based page numbers.

    Raises:
        BadArgsError: a group is not numeric, a subrange is empty, nothing
            recognizable was given, or a page number is less than 1.
    """
    if is_iterable_notstr(ranges):
        return set(ranges)
    pages: List[int] = []
    page_groups = ranges.replace(' ', '').split(',')
    for group in page_groups:
        if not group:
            continue
        start, dash, end = group.partition('-')
        try:
            if not dash:
                # Single page number
                pages.append(int(group) - 1)
                continue
            # "1-2-3" leaves end == "2-3", which int() rejects below.
            new_pages = list(range(int(start) - 1, int(end)))
        except ValueError:
            # Report every malformed group as a bad argument rather than
            # letting a bare ValueError escape to the caller.
            raise BadArgsError(f"invalid page range '{group}'") from None
        if not new_pages:
            raise BadArgsError(f"invalid page subrange '{start}-{end}'")
        pages.extend(new_pages)
    if not pages:
        raise BadArgsError(
            f"The string of page ranges '{ranges}' did not contain any recognizable "
            f"page ranges."
        )
    if not monotonic(pages):
        log.warning(
            "List of pages to process contains duplicate pages, or pages that are "
            "out of order"
        )
    if any(page < 0 for page in pages):
        raise BadArgsError("pages refers to a page number less than 1")
    log.debug("OCRing only these pages: %s", pages)
    return set(pages)
def check_options_ocr_behavior(options):
    """Enforce mutually exclusive OCR modes and normalize ``options.pages``.

    Raises:
        BadArgsError: more than one of ``--force-ocr``, ``--skip-text`` and
            ``--redo-ocr`` was selected.
    """
    modes = (options.force_ocr, options.skip_text, options.redo_ocr)
    selected = sum(1 for mode in modes if mode)
    if selected >= 2:
        raise BadArgsError("Choose only one of --force-ocr, --skip-text, --redo-ocr.")

    if options.pages:
        options.pages = _pages_from_ranges(options.pages)
def check_options_optimizing(options):
    """Check external optimizers and warn about options that have no effect.

    For ``--optimize`` 2 or higher, pngquant and jbig2 are checked for. With
    ``--optimize 0`` a warning is logged if any quality option was given.
    """
    if options.optimize >= 2:
        check_external_program(
            program='pngquant',
            package='pngquant',
            version_checker=pngquant.version,
            need_version='2.0.1',
            required_for='--optimize {2,3}',
        )
        # Although we use JBIG2 for optimize=1, don't nag about it unless the
        # user is asking for more optimization
        check_external_program(
            program='jbig2',
            package='jbig2enc',
            version_checker=jbig2enc.version,
            need_version='0.28',
            required_for='--optimize {2,3} | --jbig2-lossy',
            recommended=not options.jbig2_lossy,
        )

    if options.optimize == 0:
        quality_options = [
            options.jbig2_lossy,
            options.png_quality,
            options.jpeg_quality,
        ]
        if any(quality_options):
            log.warning(
                "The arguments --jbig2-lossy, --png-quality, and --jpeg-quality "
                "will be ignored because --optimize=0."
            )
def check_options_advanced(options):
    """Warn when ``--pdfa-image-compression`` is set for a non-PDF/A output."""
    if options.pdfa_image_compression == 'auto':
        return
    if options.output_type.startswith('pdfa'):
        return
    log.warning(
        "--pdfa-image-compression argument only applies when "
        "--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
    )
def check_options_metadata(options):
    """Reject metadata strings that contain unsupported Unicode characters.

    The title, author, keywords and subject are examined; empty or missing
    values are skipped.

    Raises:
        ValueError: a character is in the private use category (``Co``) or
            has a code point of U+10000 or above.
    """
    candidates = (options.title, options.author, options.keywords, options.subject)
    for text in filter(None, candidates):
        for ch in text:
            unsupported = unicodedata.category(ch) == 'Co' or ord(ch) >= 0x10000
            if not unsupported:
                continue
            raise ValueError(
                "One of the metadata strings contains "
                "an unsupported Unicode character: '{}' (U+{})".format(
                    ch, hex(ord(ch))[2:].upper()
                )
            )
def check_options_pillow(options):
    """Set Pillow's global pixel limit from ``options.max_image_mpixels``.

    The option is given in megapixels and converted to pixels.
    """
    PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1_000_000)
    # A computed limit of 0 is replaced by None.
    # NOTE(review): presumably None turns off Pillow's size check - confirm.
    if PIL.Image.MAX_IMAGE_PIXELS == 0:
        PIL.Image.MAX_IMAGE_PIXELS = None
def _check_options(options, plugin_manager, ocr_engine_languages):
    """Run every option check in sequence, then the plugins' own checks.

    Arguments:
        options: Parsed options; several checks modify them in place.
        plugin_manager: Its ``check_options`` hook is called last.
        ocr_engine_languages: Languages the OCR engine supports, passed to
            ``check_options_languages``.
    """
    check_platform()
    check_options_languages(options, ocr_engine_languages)
    check_options_metadata(options)
    check_options_output(options)
    check_options_sidecar(options)
    check_options_preprocessing(options)
    check_options_ocr_behavior(options)
    check_options_optimizing(options)
    check_options_advanced(options)
    check_options_pillow(options)
    plugin_manager.hook.check_options(options=options)
def check_options(options, plugin_manager):
    """Validate ``options`` using the languages reported by the OCR engine plugin."""
    # Ask the OCR engine provided by the plugins which languages it has.
    ocr_engine_languages = plugin_manager.hook.get_ocr_engine().languages(options)
    _check_options(options, plugin_manager, ocr_engine_languages)
def check_closed_streams(options):  # pragma: no cover
    """Work around Python issue with multiprocessing forking on closed streams

    https://bugs.python.org/issue28326

    Attempting to fork/exec a new Python process when any of std{in,out,err}
    are closed or not flushable for some reason may raise an exception.
    Fix this by opening devnull if the handle seems to be closed. Do this
    globally to avoid tracking all the places that fork.

    Seems to be specific to multiprocessing.Process, not all Python process
    forkers.

    The error actually occurs when the stream object is not flushable,
    but replacing an open stream object that is not flushable with
    /dev/null is a bad idea since it will create a silent failure. Replacing
    a closed handle with /dev/null seems safe.

    Returns:
        True if the standard streams are usable (possibly after being
        replaced), False if a stream that the options rely on is closed.
    """
    if sys.version_info[0:3] >= (3, 6, 4):
        return True  # Issue fixed in Python 3.6.4+
    if sys.stderr is None:
        sys.stderr = open(os.devnull, 'w')
    if sys.stdin is None:
        if options.input_file == '-':
            log.error("Trying to read from stdin but stdin seems closed")
            return False
        sys.stdin = open(os.devnull, 'r')
    if sys.stdout is None:
        if options.output_file == '-':
            # Can't replace stdout if the user is piping
            # If this case can even happen, it must be some kind of weird
            # stream.
            log.error(
                "Output was set to stdout '-' but the stream attached to "
                "stdout does not support the flush() system call.  This "
                "will fail."
            )
            return False
        sys.stdout = open(os.devnull, 'w')
    return True
def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    """Make the input available as a file inside ``work_folder``.

    Standard input (``'-'``) and readable stream objects are copied to a new
    file; any other input is treated as a path and symlinked.

    Arguments:
        options: Options whose ``input_file`` is ``'-'``, a stream or a path.
        work_folder: Folder in which the working copy or link is created.

    Returns:
        ``(target, name)``: the path created in ``work_folder`` and a name
        for the original input (``"stdin"``, ``"stream"`` or the input path).

    Raises:
        InputFileError: the stream is not readable or the file was not found.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"
    elif hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError:
            msg = f"File not found - {options.input_file}"
            # The presence of /.dockerenv is used to detect a Docker
            # container, where a missing file is usually a sharing problem.
            if Path('/.dockerenv').exists():  # pragma: no cover
                msg += (
                    "\nDocker cannot access your working directory unless you "
                    "explicitly share it with the Docker container and set up "
                    "permissions correctly.\n"
                    "You may find it easier to use stdin/stdout:"
                    "\n"
                    "\tdocker run -i --rm jbarlow83/ocrmypdf - - <input.pdf >output.pdf\n"
                )
            raise InputFileError(msg)
def check_requested_output_file(options):
    """Verify that the requested output destination can be written.

    Raises:
        BadArgsError: output is stdout but stdout is a terminal.
        OutputFileAccessError: the output stream or file is not writable.
    """
    destination = options.output_file

    if destination == '-':
        if sys.stdout.isatty():
            raise BadArgsError(
                "Output was set to stdout '-' but it looks like stdout "
                "is connected to a terminal.  Please redirect stdout to a "
                "file."
            )
        return

    if hasattr(destination, 'writable'):
        if not destination.writable():
            raise OutputFileAccessError("Output stream is not writable")
        return

    if not is_file_writable(destination):
        raise OutputFileAccessError(
            f"Output file location ({destination}) is not a writable file."
        )
def report_output_file_size(options, input_file, output_file):
    """Warn when the output is much larger than the input and suggest why.

    Nothing is reported when either file cannot be found, when the input is
    smaller than 25000 bytes, or when the size ratio (after allowing for an
    estimated per-file and per-page overhead) is below 1.35.

    Arguments:
        options: Options used to explain possible causes of the growth.
        input_file: Path of the file that was processed.
        output_file: Path of the file that was produced.
    """
    try:
        output_size = Path(output_file).stat().st_size
        input_size = Path(input_file).stat().st_size
    except FileNotFoundError:
        return  # Outputting to stream or something
    with pikepdf.open(output_file) as p:
        # Overhead constants obtained by estimating amount of data added by OCR
        # PDF/A conversion, and possible XMP metadata addition, with compression
        FILE_OVERHEAD = 4000
        OCR_PER_PAGE_OVERHEAD = 3000
        reasonable_overhead = FILE_OVERHEAD + OCR_PER_PAGE_OVERHEAD * len(p.pages)
    if input_size < 25000:
        # Small inputs are never reported. Checking this before dividing also
        # avoids a ZeroDivisionError for an empty input file.
        return
    ratio = output_size / input_size
    reasonable_ratio = output_size / (input_size + reasonable_overhead)
    if reasonable_ratio < 1.35:
        return  # Seems fine
    reasons = []
    # A tuple rather than a set, so reasons appear in a stable order.
    image_preproc = (
        'deskew',
        'clean_final',
        'remove_background',
        'oversample',
        'force_ocr',
    )
    for arg in image_preproc:
        if getattr(options, arg, False):
            reasons.append(
                f"The argument --{arg.replace('_', '-')} was issued, causing transcoding."
            )
    if options.optimize == 0:
        reasons.append("Optimization was disabled.")
    else:
        image_optimizers = {
            'jbig2': jbig2enc.available(),
            'pngquant': pngquant.available(),
        }
        for name, available in image_optimizers.items():
            if not available:
                reasons.append(
                    f"The optional dependency '{name}' was not found, so some image "
                    f"optimizations could not be attempted."
                )
    if options.output_type.startswith('pdfa'):
        reasons.append("PDF/A conversion was enabled. (Try `--output-type pdf`.)")
    if options.plugins:
        reasons.append("Plugins were used.")
    if reasons:
        explanation = "Possible reasons for this include:\n" + '\n'.join(reasons) + "\n"
    else:
        explanation = "No reason for this increase is known.  Please report this issue."
    log.warning(
        f"The output file size is {ratio:.2f}× larger than the input file.\n"
        f"{explanation}"
    )

13
src/ocrmypdf/_version.py Normal file
View File

@@ -0,0 +1,13 @@
# © 2017 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import pkg_resources
# Name of the program.
PROGRAM_NAME = 'ocrmypdf'
# Official PEP 396
# The version is read from the installed 'ocrmypdf' distribution's metadata.
__version__ = pkg_resources.get_distribution('ocrmypdf').version

View File

@@ -1,332 +0,0 @@
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
from contextlib import suppress
from itertools import groupby
from pathlib import Path
import os
import pikepdf
from .exec import tesseract
from .helpers import flatten_groups, page_number
# Read once at import time from the _OCRMYPDF_MAX_REPLACE_PAGES environment
# variable; defaults to 100 when the variable is not set.
MAX_REPLACE_PAGES = int(os.environ.get('_OCRMYPDF_MAX_REPLACE_PAGES', 100))
def _update_page_resources(*, page, font, font_key, procset):
"""Update this page's fonts with a reference to the Glyphless font"""
if '/Resources' not in page:
page['/Resources'] = pikepdf.Dictionary({})
resources = page['/Resources']
try:
fonts = resources['/Font']
except KeyError:
fonts = pikepdf.Dictionary({})
if font_key is not None and font_key not in fonts:
fonts[font_key] = font
resources['/Font'] = fonts
# Reassign /ProcSet to one that just lists everything - ProcSet is
# obsolete and doesn't matter but recommended for old viewer support
resources['/ProcSet'] = procset
def strip_invisible_text(pdf, page, log):
    """Rewrite the page's content stream without its invisible text objects.

    The page's content streams are coalesced and parsed into instructions.
    Instructions outside a ``BT`` ... ``ET`` pair are always kept. Every text
    object is buffered from ``BT`` through ``ET`` and is kept only if the
    last ``Tr`` operand seen inside it (0 when there was none) is not 3.
    The surviving instructions are serialized, joined with newlines and
    stored as a new stream in ``page.Contents``.

    ``log`` is accepted but not used by this function.
    """
    begin_text = pikepdf.Operator('BT')
    end_text = pikepdf.Operator('ET')
    set_render_mode = pikepdf.Operator('Tr')
    inline_image = pikepdf.Operator('INLINE IMAGE')

    kept = []  # instructions that will be written back
    pending_text = []  # instructions of the text object being scanned
    inside_text = False
    render_mode = 0

    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        instruction = (operands, operator)
        if inside_text:
            if operator == set_render_mode:
                render_mode = operands[0]
            pending_text.append(instruction)
            if operator == end_text:
                inside_text = False
                # Render mode 3 marks the text object as one to be dropped
                if render_mode != 3:
                    kept.extend(pending_text)
                pending_text.clear()
        elif operator == begin_text:
            inside_text = True
            render_mode = 0
            pending_text.append(instruction)
        else:
            kept.append(instruction)

    def as_bytes(operand):
        # Operands without .unparse() fall back to their ASCII str() form
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode('ascii')

    serialized = []
    for operands, operator in kept:
        if operator == inline_image:
            serialized.append(operands[0].unparse())
        else:
            serialized.append(
                b' '.join(map(as_bytes, operands)) + b' ' + operator.unparse()
            )

    page.Contents = pikepdf.Stream(pdf, b'\n'.join(serialized))
def _weave_layers_graft(
    *, pdf_base, page_num, text, font, font_key, procset, rotation, strip_old_text, log
):
    """Insert the text layer from page 0 of ``text`` onto ``pdf_base`` at ``page_num``.

    Args:
        pdf_base: The open PDF that receives the text layer.
        page_num: 1-based page number in ``pdf_base`` (looked up with
            ``pdf_base.pages.p``).
        text: Filename of the single-page text PDF. A zero-length file means
            there is nothing to graft and the function returns immediately.
        font, font_key, procset: Forwarded to ``_update_page_resources``.
        rotation: Clockwise angle, in degrees, by which the text layer is
            misaligned relative to the page content.
        strip_old_text: If true, invisible text already on the page is removed
            with ``strip_invisible_text`` before the new layer is added.
        log: Logger used for debug output.
    """
    log.debug("Grafting")
    if Path(text).stat().st_size == 0:
        return

    # The context manager guarantees the text PDF is closed even when
    # something below raises.
    with pikepdf.open(text) as pdf_text:
        pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

        if not tesseract.has_textonly_pdf():
            # If we don't have textonly_pdf, edit the stream to delete the
            # instruction to draw the image Tesseract generated, which we do
            # not use.
            stream = bytearray(pdf_text_contents)
            pattern = b'/Im1 Do'
            idx = stream.find(pattern)
            # find() returns -1 when the pattern is absent; a slice assignment
            # at -1 would insert spaces near the end of the stream, so only
            # blank out the instruction when it is actually present.
            if idx >= 0:
                stream[idx : (idx + len(pattern))] = b' ' * len(pattern)
            pdf_text_contents = bytes(stream)

        # The specific page in the base file that receives the text
        base_page = pdf_base.pages.p(page_num)

        # The text page always will be oriented up by this stage but the
        # original content may have a rotation applied. Wrap the text stream
        # with a rotation so it will be oriented the same way as the rest of
        # the page content.
        mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
        wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

        mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
        wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

        translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
        untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
        # -rotation because the input is a clockwise angle and this formula
        # uses CCW
        rotation = -rotation % 360
        rotate = pikepdf.PdfMatrix().rotated(rotation)

        # Because of rounding of DPI, we might get a text layer that is not
        # identically sized to the target page. Scale to adjust. Normally this
        # is within 0.998.
        if rotation in (90, 270):
            wt, ht = ht, wt
        scale_x = wp / wt
        scale_y = hp / ht

        log.debug('%r', (scale_x, scale_y))
        scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

        # Translate the text so it is centered at (0, 0), rotate it there,
        # adjust for a size difference between initial and text PDF, then
        # untranslate
        ctm = translate @ rotate @ scale @ untranslate

        pdf_text_contents = (
            b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'
        )

        new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)

        if strip_old_text:
            strip_invisible_text(pdf_base, base_page, log)

        base_page.page_contents_add(new_text_layer, prepend=True)

        _update_page_resources(
            page=base_page, font=font, font_key=font_key, procset=procset
        )
def _find_font(text, pdf_base):
    """Look up a font in the PDF file named ``text`` and copy it into ``pdf_base``.

    The /Font resources of the first page are searched for the keys
    ``/f-0-0`` and ``/F1``, in that order. Returns ``(font, font_key)``, where
    ``font`` is the copy made with ``pdf_base.copy_foreign``. Returns
    ``(None, None)`` when the file is missing, cannot be opened as a PDF, has
    no usable first page, or has neither key.
    """
    font_key = None
    try:
        with pikepdf.open(text) as pdf_text:
            try:
                available_fonts = pdf_text.pages[0].Resources.get('/Font', {})
            except (AttributeError, IndexError, KeyError):
                return None, None

            candidate = None
            for name in ('/f-0-0', '/F1'):
                candidate = available_fonts.get(name, None)
                if candidate is not None:
                    font_key = name
                    break

            font = pdf_base.copy_foreign(candidate) if candidate else None
            return font, font_key
    except (FileNotFoundError, pikepdf.PdfError):
        # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
        return None, None
def weave_layers(infiles, output_file, log, context):
    """Apply text layer and/or image layer changes to the baseline file.

    ``infiles`` holds the main PDF to modify plus optional ``.text.pdf`` and
    ``.image-layer.pdf`` files, organized however ruffus organizes them. They
    are flattened, sorted and grouped by page number; entries for which
    ``page_number`` raises ValueError sort first (key -1) and the first of
    them is used as the base PDF.

    From ``.text.pdf`` we copy the content stream (which contains the
    Tesseract OCR results) and rotate it into place. The first time we do
    this we also copy the font, and then reference that font again.

    For ``.image-layer.pdf`` we check whether it resolves to the base file
    itself or is a new file. If it is a new file, the page is replaced.

    Each time the number of replaced pages reaches a multiple of
    ``MAX_REPLACE_PAGES``, the intermediate result is saved and reopened to
    avoid resource limits, since pikepdf/qpdf keep many file handles open in
    the background: when objects are copied from one file to another, qpdf
    does not actually copy the data until asked to write, so the source files
    must remain available until then.

    For completeness, a /ProcSet is set up on every page that receives text,
    although it is unlikely any PDF viewer cares about this anymore.

    Args:
        infiles: Nested collection of input filenames (see above).
        output_file: Filename of the final PDF; also the prefix of the
            ``_working<N>.pdf`` intermediate files.
        log: Logger used for debug output.
        context: Provides ``get_pdfinfo()``, ``get_rotation(page_index)`` and
            ``get_options()``.
    """

    def input_sorter(key):
        try:
            return page_number(key)
        except ValueError:
            return -1

    flat_inputs = sorted(flatten_groups(infiles), key=input_sorter)
    groups = groupby(flat_inputs, key=input_sorter)

    # Extract first item
    _, basegroup = next(groups)
    base = list(basegroup)[0]
    path_base = Path(base).resolve()
    pdf_base = pikepdf.open(path_base)
    font, font_key = None, None
    pdfinfo = context.get_pdfinfo()

    procset = pdf_base.make_indirect(
        pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
    )

    emplacements = 1
    interim_count = 0

    # Iterate rest
    for page_num, layers in groups:
        layers = list(layers)
        log.debug(page_num)
        log.debug(layers)
        text = next((ii for ii in layers if ii.endswith('.text.pdf')), None)
        image = next((ii for ii in layers if ii.endswith('.image-layer.pdf')), None)

        if text and not font:
            font, font_key = _find_font(text, pdf_base)

        emplaced_page = False
        content_rotation = pdfinfo[page_num - 1].rotation
        path_image = Path(image).resolve() if image else None
        if path_image is not None and path_image != path_base:
            # We are updating the old page with a rasterized PDF of the new
            # page (without changing objgen, to preserve references)
            log.debug("Emplacement update")
            with pikepdf.open(image) as pdf_image:
                emplacements += 1
                foreign_image_page = pdf_image.pages[0]
                pdf_base.pages.append(foreign_image_page)
                local_image_page = pdf_base.pages[-1]
                pdf_base.pages[page_num - 1].emplace(local_image_page)
                del pdf_base.pages[-1]
            emplaced_page = True

        autorotate_correction = context.get_rotation(page_num - 1)
        if emplaced_page:
            content_rotation = autorotate_correction
        text_rotation = autorotate_correction
        text_misaligned = (text_rotation - content_rotation) % 360
        log.debug(
            '%r',
            [text_rotation, autorotate_correction, text_misaligned, content_rotation],
        )

        if text and font:
            # Graft the text layer onto this page, whether new or old
            strip_old = context.get_options().redo_ocr
            _weave_layers_graft(
                pdf_base=pdf_base,
                page_num=page_num,
                text=text,
                font=font,
                font_key=font_key,
                rotation=text_misaligned,
                procset=procset,
                strip_old_text=strip_old,
                log=log,
            )

        # Correct the rotation if applicable
        pdf_base.pages[page_num - 1].Rotate = (
            content_rotation - autorotate_correction
        ) % 360

        # Only checkpoint on the iteration that actually performed an
        # emplacement. Without the emplaced_page guard the counter stays at a
        # multiple of MAX_REPLACE_PAGES on following pages that replace
        # nothing, and the whole file would be saved and reopened for each of
        # them.
        if emplaced_page and emplacements % MAX_REPLACE_PAGES == 0:
            # Periodically save and reload the Pdf object. This will keep a
            # lid on our memory usage for very large files. Attach the font to
            # page 1 even if page 1 doesn't use it, so we have a way to get it
            # back.
            # TODO refactor this to outside the loop
            page0 = pdf_base.pages[0]
            _update_page_resources(
                page=page0, font=font, font_key=font_key, procset=procset
            )

            # We cannot read and write the same file, that will corrupt it,
            # but we don't want to keep more copies than we need to. Delete
            # intermediates.
            # {interim_count} is the opened file we were updating
            # {interim_count - 1} can be deleted
            # {interim_count + 1} is the new file we will produce and open
            old_file = output_file + f'_working{interim_count - 1}.pdf'
            if not context.get_options().keep_temporary_files:
                with suppress(FileNotFoundError):
                    os.unlink(old_file)

            next_file = output_file + f'_working{interim_count + 1}.pdf'
            pdf_base.save(next_file)
            pdf_base.close()

            pdf_base = pikepdf.open(next_file)
            procset = pdf_base.pages[0].Resources.ProcSet
            font, font_key = None, None  # Ensure we reacquire this information
            interim_count += 1

    pdf_base.save(output_file)
    pdf_base.close()

340
src/ocrmypdf/api.py Normal file
View File

@@ -0,0 +1,340 @@
# © 2019 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
import os
import sys
import threading
from enum import IntEnum
from io import IOBase
from pathlib import Path
from typing import AnyStr, BinaryIO, Iterable, Optional, Union
from warnings import warn
from ocrmypdf._logging import ( # pylint: disable=unused-import
PageNumberFilter,
TqdmConsole,
)
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf._sync import run_pipeline
from ocrmypdf._validation import check_options
from ocrmypdf.cli import ArgumentParser, get_parser
from ocrmypdf.helpers import is_iterable_notstr
try:
import coloredlogs
except ModuleNotFoundError:
coloredlogs = None
# A file system path given as an os.PathLike object, str or bytes.
StrPath = Union[os.PathLike, AnyStr]
# Either a binary stream object or a file system path.
PathOrIO = Union[BinaryIO, StrPath]
# Held by ocr() while it sets up plugins, builds options and runs the
# pipeline, so concurrent ocr() calls in one process run one at a time.
_api_lock = threading.Lock()
class Verbosity(IntEnum):
    """Verbosity level for configure_logging."""

    #: Suppress most messages
    quiet = -1

    #: Default level of logging
    default = 0

    #: Output ocrmypdf debug messages
    debug = 1

    #: More detailed debugging from ocrmypdf and dependent modules
    debug_all = 2
def configure_logging(
    verbosity: Verbosity,
    *,
    progress_bar_friendly: bool = True,
    manage_root_logger: bool = False,
    plugin_manager=None,
):
    """Set up logging.

    Call this before :func:`ocrmypdf.ocr()` if you want ocrmypdf's output to
    look like that of the ocrmypdf command line interface. It registers a
    console log handler with a log filter and formatter, optionally enables
    color logging to standard error, and adjusts the log levels of third
    party libraries. The details are fine-tuned and subject to change. The
    ``verbosity`` argument is equivalent to the ``--verbose`` argument and
    applies those settings. Wrapper scripts that want to behave much like
    ocrmypdf should use this function; applications that manage their own
    logging probably should not.

    If this function is not called, ocrmypdf does not configure logging and
    the caller of ``ocrmypdf.ocr()`` is free to set up logging with the
    standard library's logging module. If it is called, the caller may still
    make further adjustments afterwards.

    In either case ocrmypdf logs under the ``"ocrmypdf"`` namespace. ocrmypdf
    also imports pdfminer, which logs under ``"pdfminer"``; a library user may
    wish to configure both. Note that pdfminer is extremely chatty at the log
    level ``logging.INFO``.

    This function does not set up the ``debug.log`` log file that the command
    line interface creates at certain verbosity levels. Applications should
    configure their own debug logging.

    Args:
        verbosity: Verbosity level.
        progress_bar_friendly: If True (the default), install a custom log handler
            that is compatible with progress bars and colored output.
        manage_root_logger: Configure the process's root logger.
        plugin_manager: The plugin manager, used for obtaining the custom log handler.

    Returns:
        The toplevel logger for ocrmypdf (or the root logger, if we are managing it).
    """
    logger_name = '' if manage_root_logger else 'ocrmypdf'
    log = logging.getLogger(logger_name)
    log.setLevel(logging.DEBUG)

    # Prefer the plugin-provided console handler; fall back to plain stderr
    console = None
    if plugin_manager and progress_bar_friendly:
        console = plugin_manager.hook.get_logging_console()
    if not console:
        console = logging.StreamHandler(stream=sys.stderr)

    if verbosity < 0:
        console_level = logging.ERROR
    elif verbosity >= 1:
        console_level = logging.DEBUG
    else:
        console_level = logging.INFO
    console.setLevel(console_level)
    console.addFilter(PageNumberFilter())

    fmt = (
        '%(levelname)7s %(name)s -%(pageno)s %(message)s'
        if verbosity >= 2
        else '%(pageno)s%(message)s'
    )

    # Colors need the optional coloredlogs module and a capable terminal
    use_colors = progress_bar_friendly if coloredlogs else False
    if use_colors and os.name == 'nt':
        use_colors = coloredlogs.enable_ansi_support()
    if use_colors:
        use_colors = coloredlogs.terminal_supports_colors()

    make_formatter = coloredlogs.ColoredFormatter if use_colors else logging.Formatter
    console.setFormatter(make_formatter(fmt=fmt))
    log.addHandler(console)

    if verbosity <= 1:
        # Quieten third party libraries unless full debugging was requested
        logging.getLogger('pdfminer').setLevel(logging.ERROR)
        logging.getLogger('PIL').setLevel(logging.INFO)

    if manage_root_logger:
        logging.captureWarnings(True)

    return log
def create_options(
    *, input_file: PathOrIO, output_file: PathOrIO, parser: ArgumentParser, **kwargs
):
    """Translate keyword arguments into a parsed options namespace.

    Each keyword with a non-None value is turned into its command line
    spelling (underscores become dashes) and the resulting argument list is
    parsed by ``parser``:

    * ``progress_bar`` and ``plugins`` bypass the parser and are set directly
      on the parsed namespace afterwards.
    * booleans add a bare flag when True and nothing when False;
    * iterables other than ``str`` repeat the flag once per element;
    * ``int``, ``float``, ``str`` and ``Path`` values follow their flag as text.

    Stream objects given as ``input_file`` or ``output_file`` are represented
    by a placeholder on the command line and restored on the namespace after
    parsing.

    Raises:
        TypeError: If a keyword value has a type that cannot be converted.
    """
    argv = []
    bypass_parser = []

    for arg, val in kwargs.items():
        if val is None:
            continue

        # These arguments get special handling, so they skip argparse
        if arg in {'progress_bar', 'plugins'}:
            bypass_parser.append((arg, val))
            continue

        flag = f"--{arg.replace('_', '-')}"

        if isinstance(val, bool):
            # Booleans are special: present only when True
            if val:
                argv.append(flag)
        elif is_iterable_notstr(val):
            for elem in val:
                argv.extend((flag, elem))
        elif isinstance(val, (int, float)):
            argv.extend((flag, str(val)))
        elif isinstance(val, str):
            argv.extend((flag, val))
        elif isinstance(val, Path):
            argv.extend((flag, str(val)))
        else:
            raise TypeError(f"{arg}: {val} ({type(val)})")

    positionals = (
        (input_file, 'stream://input_file'),
        (output_file, 'stream://output_file'),
    )
    for target, placeholder in positionals:
        if isinstance(target, (BinaryIO, IOBase)):
            argv.append(placeholder)
        else:
            argv.append(os.fspath(target))

    parser._api_mode = True
    options = parser.parse_args(argv)

    for keyword, val in bypass_parser:
        setattr(options, keyword, val)

    # Swap the placeholders back for the actual stream objects
    if options.input_file == 'stream://input_file':
        options.input_file = input_file
    if options.output_file == 'stream://output_file':
        options.output_file = output_file

    return options
def ocr(  # pylint: disable=unused-argument
    input_file: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: Optional[StrPath] = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[StrPath] = None,
    plugin_manager=None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.

    A few specific arguments are discussed here:

    Args:
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        input_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the input file. If the object
            appears to be a readable stream (with methods such as ``.read()``
            and ``.seek()``), the object will be read in its entirety and saved to
            a temporary file. If ``input_file`` is ``"-"``, standard input will be
            read.
        output_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the output file. If the object
            appears to be a writable stream (with methods such as ``.write()`` and
            ``.seek()``), the output will be written to this stream. If
            ``output_file`` is ``"-"``, the output will be written to ``sys.stdout``
            (provided that standard output does not seem to be a terminal device).
            When a stream is used as output, whether via a writable object or
            ``"-"``, some final validation steps are not performed (we do not read
            back the stream after it is written).
        plugins: A single plugin path or an iterable of them. Mutually
            exclusive with ``plugin_manager``.
        plugin_manager: An already constructed plugin manager to use instead
            of building one from ``plugins``.

    Raises:
        ValueError: If both ``plugins`` and ``plugin_manager`` are given.
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    if plugins and plugin_manager:
        raise ValueError("plugins= and plugin_manager are mutually exclusive")

    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, Path)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    # No new variable names should be assigned until these two steps are run,
    # because locals() is used to capture exactly the function's parameters.
    # plugin_manager is left out: it is not a command line option, and
    # create_options() raises TypeError for a value it cannot convert.
    create_options_kwargs = {
        k: v for k, v in locals().items() if k not in ('kwargs', 'plugin_manager')
    }
    create_options_kwargs.update(kwargs)

    parser = get_parser()
    create_options_kwargs['parser'] = parser

    with _api_lock:
        # We can't allow multiple ocrmypdf.ocr() threads to run in parallel, because
        # they might install different plugins, and generally speaking we have areas
        # of code that use global state.
        if not plugin_manager:
            plugin_manager = get_plugin_manager(plugins)
        plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

        if 'verbose' in kwargs:
            warn("ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().")

        options = create_options(**create_options_kwargs)
        check_options(options, plugin_manager)
        return run_pipeline(options=options, plugin_manager=plugin_manager, api=True)

View File

@@ -0,0 +1,9 @@
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# This file exists only to mark builtin_plugins as a package.
# The plugin manager will not load it, so anything defined here may not be
# processed as a module.

View File

@@ -0,0 +1,172 @@
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
import logging.handlers
import multiprocessing
import os
import queue
import signal
import sys
import threading
from contextlib import suppress
from multiprocessing import Pool as ProcessPool
from multiprocessing.pool import ThreadPool
from typing import Callable, Iterable, Union
from tqdm import tqdm
from ocrmypdf import Executor, hookimpl
from ocrmypdf._logging import TqdmConsole
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import remove_all_log_handlers
# Type of the log queue: StandardExecutor creates a queue.Queue when running
# with threads and a multiprocessing.Queue when running with processes.
Queue = Union[multiprocessing.Queue, queue.Queue]
def log_listener(q: Queue):
    """Forward log records from the workers to this process's loggers.

    Records are taken from ``q`` one at a time and handed to the logger that
    matches ``record.name``. A ``None`` item ends the loop. Any exception
    raised while fetching or handling a record is reported on standard error
    and the loop carries on.

    For simplicity this runs as a thread rather than a process. Only one
    process should actually write to sys.stderr or whatever we're using, so if
    this is made into a process the main application needs to be directed to
    it.

    See https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    """
    keep_listening = True
    while keep_listening:
        try:
            record = q.get()
            if record is not None:
                logging.getLogger(record.name).handle(record)
            else:
                # Sentinel: the executor asked us to stop
                keep_listening = False
        except Exception:  # pylint: disable=broad-except
            import traceback  # pylint: disable=import-outside-toplevel

            print("Logging problem", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
def process_sigbus(*args):
    """Signal handler installed for SIGBUS in worker processes.

    The handler arguments are ignored; the signal is always turned into an
    InputFileError.
    """
    message = "A worker process lost access to an input file"
    raise InputFileError(message)
def process_init(q: Queue, user_init: Callable[[], None], loglevel):
    """Initialize a process pool worker.

    Sets up signal handling, replaces the root logger's handlers with a single
    QueueHandler that sends records to ``q`` at level ``loglevel``, and then
    calls ``user_init``.
    """
    # SIGINT is ignored here; our parent process will kill us gracefully
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    # Route SIGBUS to process_sigbus so our parent process can abort somewhat
    # gracefully. Windows and Cygwin do not have SIGBUS, hence the guard.
    try:
        signal.signal(signal.SIGBUS, process_sigbus)
    except AttributeError:
        pass

    # Drop log handlers that belong to the parent process, then forward
    # everything to the parent through the queue
    root_logger = logging.getLogger()
    remove_all_log_handlers(root_logger)
    root_logger.setLevel(loglevel)
    root_logger.addHandler(logging.handlers.QueueHandler(q))

    user_init()
def thread_init(_queue: Queue, user_init: Callable[[], None], _loglevel):
    """Initialize a thread pool worker.

    The queue and log level arguments exist only so that the signature matches
    process_init; they are not used.
    """
    # As a thread, block SIGBUS so the main thread deals with it. Platforms
    # lacking pthread_sigmask or SIGBUS raise AttributeError, which is ignored.
    try:
        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGBUS})
    except AttributeError:
        pass

    user_init()
class StandardExecutor(Executor):
    """Executor that runs tasks in a thread pool or a process pool."""

    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        tqdm_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):
        """Run ``task`` over ``task_arguments`` in a pool of ``max_workers``.

        Results are consumed in completion order. For every result,
        ``task_finished(result, pbar)`` is called when ``task_finished`` is
        truthy; otherwise the progress bar is advanced by one. Log records
        from the workers travel through a queue to a listener thread in this
        process.
        """
        if use_threads:
            log_queue, pool_class, initializer = queue.Queue(-1), ThreadPool, thread_init
        else:
            log_queue = multiprocessing.Queue(-1)
            pool_class, initializer = ProcessPool, process_init

        # The log listener is always a thread, whichever kind of worker is
        # used. Note that it is started here, before the worker pool below is
        # constructed.
        listener = threading.Thread(target=log_listener, args=(log_queue,))
        listener.start()

        with self.pbar_class(**tqdm_kwargs) as pbar:
            root_level = logging.getLogger("").level
            pool = pool_class(
                processes=max_workers,
                initializer=initializer,
                initargs=(log_queue, worker_initializer, root_level),
            )
            try:
                for result in pool.imap_unordered(task, task_arguments):
                    if task_finished:
                        task_finished(result, pbar)
                    else:
                        pbar.update()
            except KeyboardInterrupt:
                # Terminate pool so we exit instantly
                pool.terminate()
                # Don't try listener.join() here, will deadlock
                raise
            except Exception:
                running_under_pytest = os.environ.get("PYTEST_CURRENT_TEST", "")
                if not running_under_pytest:
                    # Unless inside pytest, exit immediately because no one
                    # wants to wait for child processes to finalize results
                    # that will be thrown away. Inside pytest, we want child
                    # processes to exit cleanly so that they output any error
                    # messages or coverage data we need from them.
                    pool.terminate()
                raise
            finally:
                # The None sentinel tells the log listener to finish
                log_queue.put_nowait(None)
                pool.close()
                pool.join()
                listener.join()
@hookimpl
def get_executor(progressbar_class):
    """Hook implementation: build a StandardExecutor using ``progressbar_class``."""
    executor = StandardExecutor(pbar_class=progressbar_class)
    return executor
@hookimpl
def get_progressbar_class():
    """Hook implementation: the progress bar class is ``tqdm``."""
    progressbar_class = tqdm
    return progressbar_class
@hookimpl
def get_logging_console():
    """Hook implementation: a StreamHandler writing to stderr through TqdmConsole."""
    wrapped_stderr = TqdmConsole(sys.stderr)
    return logging.StreamHandler(stream=wrapped_stderr)

View File

@@ -0,0 +1,14 @@
# © 2021 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from ocrmypdf import hookimpl
@hookimpl
def filter_pdf_page(page, image_filename, output_pdf):  # pylint: disable=unused-argument
    """Default hook implementation: return ``output_pdf`` unchanged.

    ``page`` and ``image_filename`` are accepted but not used.
    """
    unchanged_pdf = output_pdf
    return unchanged_pdf

View File

@@ -0,0 +1,99 @@
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
from ocrmypdf import hookimpl
from ocrmypdf._exec import ghostscript
from ocrmypdf._validation import HOCR_OK_LANGS
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.subprocess import check_external_program
log = logging.getLogger(__name__)
def _gs_version_key(version):
    """Turn a dotted version string such as '9.53.3' into a tuple of ints.

    Only the leading run of dot-separated numbers is used, so any suffix is
    ignored. A string with no leading number gives an empty tuple, which
    compares lower than every real version.
    """
    import re  # pylint: disable=import-outside-toplevel

    match = re.match(r'\d+(?:\.\d+)*', str(version))
    if not match:
        return ()
    return tuple(int(part) for part in match.group(0).split('.'))


@hookimpl
def check_options(options):
    """Check that the installed Ghostscript can do what the options require.

    Raises MissingDependencyError for the Ghostscript versions 9.24 and 9.51
    and when ``--output-type pdfa-3`` is requested with Ghostscript older than
    9.19. Logs a warning when Ghostscript older than 9.20 is combined with
    PDF/A output and a language outside HOCR_OK_LANGS. An output type of
    ``pdfa`` is rewritten to ``pdfa-2``.

    Version thresholds are compared numerically. Comparing the version
    strings directly would rank '10.0' below '9.20'.
    """
    gs_version = ghostscript.version()
    check_external_program(
        program='gs',
        package='ghostscript',
        version_checker=gs_version,
        need_version='9.15',  # limited by Travis CI / Ubuntu 14.04 backports
    )
    if gs_version in ('9.24', '9.51'):
        raise MissingDependencyError(
            f"Ghostscript {gs_version} contains serious regressions and is not "
            "supported. Please upgrade to a newer version, or downgrade to the "
            "previous version."
        )

    gs_version_key = _gs_version_key(gs_version)

    # We have these constraints to check for.
    # 1. Ghostscript < 9.20 mangles multibyte Unicode
    # 2. hocr doesn't work on non-Latin languages (so don't select it)
    is_latin = options.languages.issubset(HOCR_OK_LANGS)
    if gs_version_key < (9, 20) and options.output_type != 'pdf' and not is_latin:
        # https://bugs.ghostscript.com/show_bug.cgi?id=696874
        # Ghostscript < 9.20 fails to encode multibyte characters properly
        log.warning(
            f"The installed version of Ghostscript ({gs_version}) does not work "
            "correctly with the OCR languages you specified. Use --output-type pdf or "
            "upgrade to Ghostscript 9.20 or later to avoid this issue."
        )

    if options.output_type == 'pdfa':
        options.output_type = 'pdfa-2'

    if options.output_type == 'pdfa-3' and gs_version_key < (9, 19):
        raise MissingDependencyError(
            "--output-type pdfa-3 requires Ghostscript 9.19 or later"
        )
@hookimpl
def rasterize_pdf_page(
    input_file,
    output_file,
    raster_device,
    raster_dpi,
    pageno,
    page_dpi,
    rotation,
    filter_vector,
):
    """Hook implementation: rasterize one PDF page with Ghostscript.

    All arguments are passed straight through to ``ghostscript.rasterize_pdf``.
    Returns ``output_file``.
    """
    raster_settings = {
        'raster_device': raster_device,
        'raster_dpi': raster_dpi,
        'pageno': pageno,
        'page_dpi': page_dpi,
        'rotation': rotation,
        'filter_vector': filter_vector,
    }
    ghostscript.rasterize_pdf(input_file, output_file, **raster_settings)
    return output_file
@hookimpl
def generate_pdfa(
    pdf_pages,
    pdfmark,
    output_file,
    compression,
    pdf_version,
    pdfa_part,
    progressbar_class,
):
    """Hook implementation: produce a PDF/A with Ghostscript.

    ``pdfmark`` is appended to the list of ``pdf_pages`` handed to
    ``ghostscript.generate_pdfa``; the remaining arguments are passed through
    unchanged. Returns ``output_file``.
    """
    inputs_with_pdfmark = list(pdf_pages) + [pdfmark]
    ghostscript.generate_pdfa(
        pdf_pages=inputs_with_pdfmark,
        output_file=output_file,
        compression=compression,
        pdf_version=pdf_version,
        pdfa_part=pdfa_part,
        progressbar_class=progressbar_class,
    )
    return output_file

View File

@@ -0,0 +1,179 @@
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
import os
from ocrmypdf import hookimpl
from ocrmypdf._exec import tesseract
from ocrmypdf.cli import numeric
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.helpers import clamp
from ocrmypdf.pluginspec import OcrEngine
from ocrmypdf.subprocess import check_external_program
log = logging.getLogger(__name__)
@hookimpl
def add_options(parser):
    """Hook implementation: register the "Tesseract" argument group.

    Adds --tesseract-config, --tesseract-pagesegmode, --tesseract-oem,
    --tesseract-timeout, --user-words and --user-patterns to ``parser``.
    """
    oem_help = (
        "Set Tesseract 4.0 OCR engine mode: "
        "0 - original Tesseract only; "
        "1 - neural nets LSTM only; "
        "2 - Tesseract + LSTM; "
        "3 - default."
    )
    timeout_help = (
        'Give up on OCR after the timeout, but copy the preprocessed page '
        'into the final output'
    )
    user_words_help = (
        "Specify the location of the Tesseract user words file. This is a "
        "list of words Tesseract should consider while performing OCR in "
        "addition to its standard language dictionaries. This can improve "
        "OCR quality especially for specialized and technical documents."
    )

    group = parser.add_argument_group("Tesseract", "Advanced control of Tesseract OCR")

    group.add_argument(
        '--tesseract-config',
        action='append',
        metavar='CFG',
        default=[],
        help="Additional Tesseract configuration files -- see documentation",
    )
    group.add_argument(
        '--tesseract-pagesegmode',
        action='store',
        type=int,
        metavar='PSM',
        choices=range(0, 14),
        help="Set Tesseract page segmentation mode (see tesseract --help)",
    )
    group.add_argument(
        '--tesseract-oem',
        action='store',
        type=int,
        metavar='MODE',
        choices=range(0, 4),
        help=oem_help,
    )
    group.add_argument(
        '--tesseract-timeout',
        default=180.0,
        type=numeric(float, 0),
        metavar='SECONDS',
        help=timeout_help,
    )
    group.add_argument('--user-words', metavar='FILE', help=user_words_help)
    group.add_argument(
        '--user-patterns',
        metavar='FILE',
        help="Specify the location of the Tesseract user patterns file.",
    )
@hookimpl
def check_options(options):
    """Hook implementation: validate the Tesseract-related options.

    Verifies that a new enough ``tesseract`` program is available, resolves a
    ``pdf_renderer`` of ``'auto'`` to ``'sandwich'``, and logs warnings for
    option combinations that will have no effect or may make processing fail.
    """
    check_external_program(
        program='tesseract',
        package={'linux': 'tesseract-ocr'},
        version_checker=tesseract.version,
        need_version='4.0.0-beta.1',  # using backport for Travis CI
        version_parser=tesseract.TesseractVersion,
    )

    # Decide on what renderer to use
    if options.pdf_renderer == 'auto':
        options.pdf_renderer = 'sandwich'

    user_lists_supported = tesseract.has_user_words()
    if not user_lists_supported and (options.user_words or options.user_patterns):
        log.warning(
            "Tesseract 4.0 ignores --user-words and --user-patterns, so these "
            "arguments have no effect."
        )

    ocr_disabling_modes = (0, 2)
    if options.tesseract_pagesegmode in ocr_disabling_modes:
        log.warning(
            "The --tesseract-pagesegmode argument you select will disable OCR. "
            "This may cause processing to fail."
        )
@hookimpl
def validate(pdfinfo, options):
    """Hook implementation: choose the OpenMP thread limit for Tesseract.

    Tesseract 4.x can be multithreaded, and we also run multiple workers. We
    want to manage how many threads it uses to avoid creating more total
    threads than cores. Performance testing shows we're better off
    parallelizing ocrmypdf and forcing Tesseract to be single threaded, which
    we get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count
    of the input file is small, then we allow Tesseract to use threads,
    subject to the constraint:
    (ocrmypdf workers) * (tesseract threads) <= max_workers.
    As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread
    system.

    If OMP_THREAD_LIMIT is already set to a decimal number it is left alone;
    otherwise it is set to ``options.jobs`` divided by the page count, clamped
    to the range 1..3.
    """
    preset_limit = os.environ.get('OMP_THREAD_LIMIT', '')
    # isdecimal() accepts exactly the digit strings that int() can parse;
    # isnumeric() also accepts characters such as '½' that int() rejects.
    if preset_limit.isdecimal():
        tess_threads = int(preset_limit)
    else:
        # Guard against a document that reports no pages, which would
        # otherwise cause a division by zero.
        page_count = max(len(pdfinfo), 1)
        tess_threads = clamp(options.jobs // page_count, 1, 3)
        os.environ['OMP_THREAD_LIMIT'] = str(tess_threads)

    log.debug("Using Tesseract OpenMP thread limit %d", tess_threads)
class TesseractOcrEngine(OcrEngine):
    """OcrEngine implementation that delegates to the ``tesseract`` module."""

    @staticmethod
    def version():
        """Return the version reported by ``tesseract.version()``."""
        return tesseract.version()

    @staticmethod
    def creator_tag(options):
        """Return the creator tag; the sandwich renderer adds a ``-PDF`` suffix."""
        if options.pdf_renderer == 'sandwich':
            tag = '-PDF'
        else:
            tag = ''
        return f"Tesseract OCR{tag} {TesseractOcrEngine.version()}"

    def __str__(self):
        return f"Tesseract OCR {TesseractOcrEngine.version()}"

    @staticmethod
    def languages(options):
        """Return the languages reported by ``tesseract.get_languages()``."""
        return tesseract.get_languages()

    @staticmethod
    def get_orientation(input_file, options):
        """Return Tesseract's orientation result for ``input_file``."""
        return tesseract.get_orientation(
            input_file,
            engine_mode=options.tesseract_oem,
            timeout=options.tesseract_timeout,
        )

    @staticmethod
    def _common_arguments(options):
        """Collect the option-derived arguments shared by both generators."""
        return {
            'languages': options.languages,
            'engine_mode': options.tesseract_oem,
            'tessconfig': options.tesseract_config,
            'timeout': options.tesseract_timeout,
            'pagesegmode': options.tesseract_pagesegmode,
            'user_words': options.user_words,
            'user_patterns': options.user_patterns,
        }

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run ``tesseract.generate_hocr`` with settings taken from ``options``."""
        tesseract.generate_hocr(
            input_file=input_file,
            output_hocr=output_hocr,
            output_text=output_text,
            **TesseractOcrEngine._common_arguments(options),
        )

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run ``tesseract.generate_pdf`` with settings taken from ``options``."""
        tesseract.generate_pdf(
            input_file=input_file,
            output_pdf=output_pdf,
            output_text=output_text,
            **TesseractOcrEngine._common_arguments(options),
        )
@hookimpl
def get_ocr_engine():
    """Plugin hook: provide a new TesseractOcrEngine instance."""
    engine = TesseractOcrEngine()
    return engine

486
src/ocrmypdf/cli.py Normal file
View File

@@ -0,0 +1,486 @@
# © 2015-19 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
from typing import Optional, Type, TypeVar
from ocrmypdf._version import PROGRAM_NAME as _PROGRAM_NAME
from ocrmypdf._version import __version__ as _VERSION
T = TypeVar('T')


def numeric(basetype: Type[T], min_: Optional[T] = None, max_: Optional[T] = None):
    """Create an argparse ``type=`` callable for a bounded number.

    The returned callable converts its string argument with ``basetype`` and
    raises ``argparse.ArgumentTypeError`` when the value lies outside the
    inclusive range [min_, max_]. A bound of None is not checked. The callable
    takes the name of ``basetype``.
    """
    lower = None if min_ is None else basetype(min_)
    upper = None if max_ is None else basetype(max_)

    def _numeric(string):
        value = basetype(string)
        below = lower is not None and value < lower
        if below or (upper is not None and value > upper):
            raise argparse.ArgumentTypeError(
                "%r not in valid range %r" % (string, (lower, upper))
            )
        return value

    _numeric.__name__ = basetype.__name__
    return _numeric
class ArgumentParser(argparse.ArgumentParser):
    """argparse parser whose error handling can raise instead of exiting.

    With ``_api_mode`` False (the default) errors are handled by argparse as
    usual; with ``_api_mode`` True, ``error()`` raises ValueError instead.
    https://stackoverflow.com/questions/5943249/python-argparse-and-controlling-overriding-the-exit-status-code
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._api_mode = False

    def error(self, message):
        if self._api_mode:
            raise ValueError(message)
        super().error(message)
class LanguageSetAction(argparse.Action):
    """argparse action that collects language codes into a set.

    Each occurrence of the option adds its value to the destination set; a
    value containing '+' (e.g. ``eng+deu``) is split into several languages.
    The default is an empty set.
    """

    def __init__(self, option_strings, dest, default=None, **kwargs):
        if default is None:
            default = set()
        super().__init__(option_strings, dest, default=default, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # argparse stores the *same* default object in every namespace it
        # creates, so update a copy; mutating in place would leak languages
        # from one parse_args() call into the next.
        languages = set(getattr(namespace, self.dest, None) or ())
        # split() returns [values] when there is no '+', so one code path
        # covers both the single and the combined form.
        languages.update(values.split('+'))
        setattr(namespace, self.dest, languages)
def get_parser():
    """Build and return the command line argument parser.

    Every call constructs a fresh ArgumentParser with the positional
    input/output arguments and the option groups: job control, metadata,
    image preprocessing, OCR, optimization, advanced and debugging.

    :return: the configured ArgumentParser
    """
    parser = ArgumentParser(
        prog=_PROGRAM_NAME,
        allow_abbrev=True,
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""\
Generates a searchable PDF or PDF/A from a regular PDF.
OCRmyPDF rasterizes each page of the input PDF, optionally corrects page
rotation and performs image processing, runs the Tesseract OCR engine on the
image, and then creates a PDF from the OCR information.
""",
        epilog="""\
OCRmyPDF attempts to keep the output file at about the same size. If a file
contains losslessly compressed images, the images in the output file will be
losslessly compressed as well.
PDF is a page description file that attempts to preserve a layout exactly.
A PDF can contain vector objects (such as text or lines) and raster objects
(images). A page might have multiple images. OCRmyPDF is prepared to deal
with the wide variety of PDFs that exist in the wild.
When a PDF page contains text, OCRmyPDF assumes that the page has already
been OCRed or is a "born digital" page that should not be OCRed. The default
behavior is to exit in this case without producing a file. You can use the
option --skip-text to ignore pages with text, or --force-ocr to rasterize
all objects on the page and produce an image-only PDF as output.
ocrmypdf --skip-text file_with_some_text_pages.pdf output.pdf
ocrmypdf --force-ocr word_document.pdf output.pdf
If you are concerned about long-term archiving of PDFs, use the default option
--output-type pdfa which converts the PDF to a standardized PDF/A-2b. This
removes some features from the PDF such as Javascript or forms. If you want to
minimize the number of changes made to your PDF, use --output-type pdf.
If OCRmyPDF is given an image file as input, it will attempt to convert the
image to a PDF before processing. For more control over the conversion of
images to PDF, use the Python package img2pdf or other image to PDF software.
For example, this command uses img2pdf to convert all .png files beginning
with the 'page' prefix to a PDF, fitting each image on A4-sized paper, and
sending the result to OCRmyPDF through a pipe.
img2pdf --pagesize A4 page*.png | ocrmypdf - myfile.pdf
Online documentation is located at:
https://ocrmypdf.readthedocs.io/en/latest/introduction.html
""",
    )
    parser.add_argument(
        'input_file',
        metavar="input_pdf_or_image",
        help="PDF file containing the images to be OCRed (or '-' to read from "
        "standard input)",
    )
    parser.add_argument(
        'output_file',
        metavar="output_pdf",
        help="Output searchable PDF file (or '-' to write to standard output). "
        "Existing files will be overwritten. If same as input file, the "
        "input file will be updated only if processing is successful.",
    )
    parser.add_argument(
        '-l',
        '--language',
        dest='languages',
        action=LanguageSetAction,
        help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
        "all language packs installed in your system). Use -l eng+deu for "
        "multiple languages.",
    )
    parser.add_argument(
        '--image-dpi',
        metavar='DPI',
        type=int,
        help="For input image instead of PDF, use this DPI instead of file's.",
    )
    parser.add_argument(
        '--output-type',
        choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
        default='pdfa',
        # The names quoted in the help text must match `choices` exactly.
        help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
        "long term archiving (default, recommended) but may not be suitable "
        "for users who want their file altered as little as possible. 'pdfa' "
        "also has problems with full Unicode text. 'pdf' attempts to "
        "preserve file contents as much as possible. 'pdfa-1' creates a "
        "PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' creates a "
        "PDF/A-3b file.",
    )
    # Use null string '\0' as sentinel to indicate the user supplied no argument,
    # since that is the only invalid character for filepaths on all platforms
    # bool('\0') is True in Python
    parser.add_argument(
        '--sidecar',
        nargs='?',
        const='\0',
        default=None,
        metavar='FILE',
        help="Generate sidecar text files that contain the same text recognized "
        "by Tesseract. This may be useful for building an OCR text database. "
        "If FILE is omitted, the sidecar file will be named {output_file}.txt; "
        "the next argument must NOT be the name of the input PDF. "
        "If FILE is set to '-', the sidecar is written to stdout (a "
        "convenient way to preview OCR quality). The output file and sidecar "
        "may not both use stdout at the same time.",
    )
    parser.add_argument(
        '--version',
        action='version',
        version=_VERSION,
        help="Print program version and exit",
    )

    jobcontrol = parser.add_argument_group("Job control options")
    jobcontrol.add_argument(
        '-j',
        '--jobs',
        metavar='N',
        type=numeric(int, 0, 256),
        help="Use up to N CPU cores simultaneously (default: use all).",
    )
    jobcontrol.add_argument(
        '-q', '--quiet', action='store_true', help="Suppress INFO messages"
    )
    jobcontrol.add_argument(
        '-v',
        '--verbose',
        type=numeric(int, 0, 2),
        default=0,
        const=1,
        nargs='?',
        help="Print more verbose messages for each additional verbose level. Use "
        "`-v 1` typically for much more detailed logging. Higher numbers "
        "are probably only useful in debugging.",
    )
    jobcontrol.add_argument(
        '--no-progress-bar',
        action='store_false',
        dest='progress_bar',
        help=argparse.SUPPRESS,
    )
    jobcontrol.add_argument(
        '--use-threads', action='store_true', help=argparse.SUPPRESS
    )

    metadata = parser.add_argument_group(
        "Metadata options",
        "Set output PDF/A metadata (default: copy input document's metadata)",
    )
    metadata.add_argument(
        '--title', type=str, help="Set document title (place multiple words in quotes)"
    )
    metadata.add_argument('--author', type=str, help="Set document author")
    metadata.add_argument(
        '--subject', type=str, help="Set document subject description"
    )
    metadata.add_argument('--keywords', type=str, help="Set document keywords")

    preprocessing = parser.add_argument_group(
        "Image preprocessing options",
        "Options to improve the quality of the final PDF and OCR",
    )
    preprocessing.add_argument(
        '-r',
        '--rotate-pages',
        action='store_true',
        help="Automatically rotate pages based on detected text orientation",
    )
    preprocessing.add_argument(
        '--remove-background',
        action='store_true',
        help="Attempt to remove background from gray or color pages, setting it "
        "to white ",
    )
    preprocessing.add_argument(
        '-d',
        '--deskew',
        action='store_true',
        help="Deskew each page before performing OCR",
    )
    preprocessing.add_argument(
        '-c',
        '--clean',
        action='store_true',
        help="Clean pages from scanning artifacts before performing OCR, and send "
        "the cleaned page to OCR, but do not include the cleaned page in "
        "the output",
    )
    preprocessing.add_argument(
        '-i',
        '--clean-final',
        action='store_true',
        help="Clean page as above, and incorporate the cleaned image in the final "
        "PDF. Might remove desired content.",
    )
    preprocessing.add_argument(
        '--unpaper-args',
        type=str,
        default=None,
        help="A quoted string of arguments to pass to unpaper. Requires --clean. "
        "Example: --unpaper-args '--layout double'.",
    )
    preprocessing.add_argument(
        '--oversample',
        metavar='DPI',
        type=numeric(int, 0, 5000),
        default=0,
        help="Oversample images to at least the specified DPI, to improve OCR "
        "results slightly",
    )
    preprocessing.add_argument(
        '--remove-vectors',
        action='store_true',
        help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they "
        "will not be included in OCR. This can eliminate false characters.",
    )
    preprocessing.add_argument(
        '--threshold',
        action='store_true',
        help=(
            "EXPERIMENTAL. Threshold image to 1bpp before sending it to Tesseract "
            "for OCR. Can improve OCR quality compared to Tesseract's thresholder."
        ),
    )

    ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied")
    ocrsettings.add_argument(
        '-f',
        '--force-ocr',
        action='store_true',
        help="Rasterize any text or vector objects on each page, apply OCR, and "
        "save the rastered output (this rewrites the PDF)",
    )
    ocrsettings.add_argument(
        '-s',
        '--skip-text',
        action='store_true',
        help="Skip OCR on any pages that already contain text, but include the "
        "page in final output; useful for PDFs that contain a mix of "
        "images, text pages, and/or previously OCRed pages",
    )
    ocrsettings.add_argument(
        '--redo-ocr',
        action='store_true',
        help="Attempt to detect and remove the hidden OCR layer from files that "
        "were previously OCRed with OCRmyPDF or another program. Apply OCR "
        "to text found in raster images. Existing visible text objects will "
        "not be changed. If there is no existing OCR, OCR will be added.",
    )
    ocrsettings.add_argument(
        '--skip-big',
        type=numeric(float, 0, 5000),
        metavar='MPixels',
        help="Skip OCR on pages larger than the specified amount of megapixels, "
        "but include skipped pages in final output",
    )

    optimizing = parser.add_argument_group(
        "Optimization options", "Control how the PDF is optimized after OCR"
    )
    optimizing.add_argument(
        '-O',
        '--optimize',
        type=int,
        choices=range(0, 4),
        default=1,
        help=(
            "Control how PDF is optimized after processing: "
            "0 - do not optimize; "
            "1 - do safe, lossless optimizations (default); "
            "2 - do some lossy optimizations; "
            "3 - do aggressive lossy optimizations (including lossy JBIG2)"
        ),
    )
    optimizing.add_argument(
        '--jpeg-quality',
        type=numeric(int, 0, 100),
        default=0,
        metavar='Q',
        help=(
            "Adjust JPEG quality level for JPEG optimization. "
            "100 is best quality and largest output size; "
            "1 is lowest quality and smallest output; "
            "0 uses the default."
        ),
    )
    optimizing.add_argument(
        '--jpg-quality',
        type=numeric(int, 0, 100),
        default=0,
        metavar='Q',
        dest='jpeg_quality',
        help=argparse.SUPPRESS,  # Alias for --jpeg-quality
    )
    optimizing.add_argument(
        '--png-quality',
        type=numeric(int, 0, 100),
        default=0,
        metavar='Q',
        help=(
            "Adjust PNG quality level to use when quantizing PNGs. "
            "Values have same meaning as with --jpeg-quality"
        ),
    )
    optimizing.add_argument(
        '--jbig2-lossy',
        action='store_true',
        help=(
            "Enable JBIG2 lossy mode (better compression, not suitable for some "
            "use cases - see documentation)."
        ),
    )
    optimizing.add_argument(
        '--jbig2-page-group-size',
        type=numeric(int, 1, 10000),
        default=0,
        metavar='N',
        # Adjust number of pages to consider at once for JBIG2 compression
        help=argparse.SUPPRESS,
    )

    advanced = parser.add_argument_group(
        "Advanced", "Advanced options to control OCRmyPDF"
    )
    advanced.add_argument(
        '--pages',
        type=str,
        help=(
            "Limit OCR to the specified pages (ranges or comma separated), "
            "skipping others"
        ),
    )
    advanced.add_argument(
        '--max-image-mpixels',
        action='store',
        type=numeric(float, 0),
        metavar='MPixels',
        help="Set maximum number of pixels to unpack before treating an image as a "
        "decompression bomb",
        default=128.0,
    )
    advanced.add_argument(
        '--pdf-renderer',
        choices=['auto', 'hocr', 'sandwich', 'hocrdebug'],
        default='auto',
        help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
        "choose. See documentation for discussion.",
    )
    advanced.add_argument(
        '--rotate-pages-threshold',
        default=14.0,
        type=numeric(float, 0, 1000),
        metavar='CONFIDENCE',
        help="Only rotate pages when confidence is above this value (arbitrary "
        "units reported by tesseract)",
    )
    advanced.add_argument(
        '--pdfa-image-compression',
        choices=['auto', 'jpeg', 'lossless'],
        default='auto',
        help="Specify how to compress images in the output PDF/A. 'auto' lets "
        "OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
        "JPEG compression. 'lossless' uses PNG-style lossless compression "
        "for all images. Monochrome images are always compressed using a "
        "lossless codec. Compression settings "
        "are applied to all pages, including those for which OCR was "
        "skipped. Not supported for --output-type=pdf ; that setting "
        "preserves the original compression of all images.",
    )
    advanced.add_argument(
        '--fast-web-view',
        type=numeric(float, 0),
        default=1.0,
        metavar="MEGABYTES",
        help="If the size of file is more than this threshold (in MB), then "
        "linearize the PDF for fast web viewing. This allows the PDF to be "
        "displayed before it is fully downloaded in web browsers, but increases "
        "the space required slightly. By default we skip this for small files "
        "which do not benefit. If the threshold is 0 it will be applied to all "
        "files. Set the threshold very high to disable.",
    )
    advanced.add_argument(
        '--plugin',
        dest='plugins',
        action='append',
        default=[],
        help="Name of plugin to import. Argument may be issued multiple times to "
        "import multiple plugins. Plugins may be specified as module names in "
        "Python syntax, provided they are installed in the same Python (virtual) "
        "environment as ocrmypdf; or you may give the path to the Python file that "
        "contains the plugin. Plugins must conform to the specification in the "
        "OCRmyPDF documentation.",
    )

    debugging = parser.add_argument_group(
        "Debugging", "Arguments to help with troubleshooting and debugging"
    )
    debugging.add_argument(
        '-k',
        '--keep-temporary-files',
        action='store_true',
        help="Keep temporary files (helpful for debugging)",
    )
    return parser
# Reduced parser that defines only the --plugin option. It has no -h/--help
# handling and does not accept abbreviated option names.
plugins_only_parser = ArgumentParser(
    prog=_PROGRAM_NAME,
    fromfile_prefix_chars='@',
    add_help=False,
    allow_abbrev=False,
)
plugins_only_parser.add_argument(
    '--plugin', dest='plugins', action='append', default=[], help="Name of plugin to import."
)

View File

@@ -1,19 +1,8 @@
# © 2016 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from enum import IntEnum

View File

@@ -1,173 +0,0 @@
# © 2016 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
"""Wrappers to manage subprocess calls"""
import os
import re
import sys
from subprocess import run, STDOUT, PIPE, CalledProcessError
from ..exceptions import MissingDependencyError, ExitCode
from collections.abc import Mapping
def get_version(program, *, version_arg='--version', regex=r'(\d+(\.\d+)*)'):
    """Run ``program version_arg`` and extract a version string from its output.

    :param program: name or path of the executable to run
    :param version_arg: argument that makes the program print its version
    :param regex: pattern applied with ``re.match`` to the stripped, combined
        stdout/stderr output; group 1 is the version
    :return: the matched version, as a string
    :raises MissingDependencyError: if the program cannot be found, exits with
        a non-zero status, or prints output the pattern does not match
    """
    args_prog = [program, version_arg]
    try:
        proc = run(
            args_prog,
            close_fds=True,
            universal_newlines=True,
            stdout=PIPE,
            stderr=STDOUT,
            check=True,
        )
    except FileNotFoundError as e:
        raise MissingDependencyError(
            f"Could not find program '{program}' on the PATH"
        ) from e
    except CalledProcessError as e:
        # check=True only raises for a non-zero exit status, so no further
        # test of e.returncode is needed here.
        raise MissingDependencyError(
            f"Ran program '{program}' but it exited with an error:\n{e.output}"
        ) from e

    output = proc.stdout
    # Test the match explicitly rather than catching AttributeError, which
    # could also hide unrelated programming errors.
    match = re.match(regex, output.strip())
    if match is None:
        raise MissingDependencyError(
            f"The program '{program}' did not report its version. "
            f"Message was:\n{output}"
        )
    return match.group(1)
# Message templates for dependency problems. Each is filled in with
# str.format(); the placeholders used are noted above each template.

# {program}
missing_program = '''
The program '{program}' could not be executed or was not found on your
system PATH.
'''
# {program}, {required_for}
missing_optional_program = '''
The program '{program}' could not be executed or was not found on your
system PATH. This program is required when you use the
{required_for} arguments. You could try omitting these arguments, or install
the package.
'''
# {program}, {required_for}
missing_recommend_program = '''
The program '{program}' could not be executed or was not found on your
system PATH. This program is recommended when using the {required_for} arguments,
but not required, so we will proceed. For best results, install the program.
'''
# {program}, {need_version}, {found_version}
old_version = '''
OCRmyPDF requires '{program}' {need_version} or higher. Your system appears
to have {found_version}. Please update this program.
'''
# {program}, {need_version}, {required_for}
old_version_required_for = '''
OCRmyPDF requires '{program}' {need_version} or higher when run with the
{required_for} arguments. If you omit these arguments, OCRmyPDF may be able to
proceed. For best results, install the program.
'''
# {package}
osx_install_advice = '''
If you have homebrew installed, try this command to install the missing
package:
brew install {package}
'''
# {package}, {program}
linux_install_advice = '''
On systems with the APT package manager (Debian, Ubuntu), try these
commands:
sudo apt-get update
sudo apt-get install {package}
On RPM-based systems (Red Hat, Fedora), search for instructions on
installing the RPM for {program}.
'''
def _get_platform():
if sys.platform.startswith('freebsd'):
return 'freebsd'
elif sys.platform.startswith('linux'):
return 'linux'
return sys.platform
def _error_trailer(log, program, package, **kwargs):
    """Log platform-specific installation advice for a missing or old program.

    :param log: logger; the advice is emitted with ``log.info``
    :param program: name of the executable
    :param package: package name, or a mapping from platform name (as returned
        by ``_get_platform``) to package name
    :param kwargs: accepted and ignored, so callers can pass extra context
    """
    platform = _get_platform()
    if isinstance(package, Mapping):
        # A mapping may not list every platform (e.g. only 'linux'); fall back
        # to the program name instead of raising KeyError while reporting
        # another error.
        package = package.get(platform, program)

    if platform == 'darwin':
        log.info(osx_install_advice.format(package=package))
    elif platform == 'linux':
        log.info(linux_install_advice.format(package=package, program=program))
def _error_missing_program(log, program, package, required_for, recommended):
    """Log that ``program`` is missing, then log installation advice.

    The message is an error when the program is required (unconditionally or
    for the ``required_for`` arguments) and informational when it is only
    recommended.
    """
    if required_for:
        emit, template = log.error, missing_optional_program
    elif recommended:
        emit, template = log.info, missing_recommend_program
    else:
        emit, template = log.error, missing_program
    emit(template.format(program=program, required_for=required_for))

    _error_trailer(
        log=log,
        program=program,
        package=package,
        required_for=required_for,
        recommended=recommended,
    )
def _error_old_version(
    log, program, package, need_version, found_version, required_for
):
    """Log an error that ``program`` is older than ``need_version``, then log
    installation advice."""
    template = old_version_required_for if required_for else old_version
    log.error(
        template.format(
            program=program,
            need_version=need_version,
            found_version=found_version,
            required_for=required_for,
        )
    )

    _error_trailer(
        log=log,
        program=program,
        package=package,
        need_version=need_version,
        found_version=found_version,
        required_for=required_for,
    )
def _version_key(version):
    """Convert a version string into a list that orders numerically.

    Runs of digits compare as integers and runs of letters compare as text,
    with letters sorting before numbers at the same position.
    """
    key = []
    for token in re.findall(r'\d+|[A-Za-z]+', version):
        if token.isdigit():
            key.append((1, int(token), ''))
        else:
            key.append((0, 0, token))
    return key


def check_external_program(
    *,
    log,
    program,
    package,
    version_checker,
    need_version,
    required_for=None,
    recommended=False,
):
    """Check that an external program is present and recent enough.

    :param log: logger used for all messages
    :param program: name of the executable
    :param package: package name, or mapping of platform name to package name,
        used in the installation advice
    :param version_checker: zero-argument callable returning the installed
        version
    :param need_version: minimum acceptable version
    :param required_for: description of the arguments that need this program,
        used in the messages
    :param recommended: if True, problems are reported but the process is not
        terminated

    Unless ``recommended`` is true, a missing or too-old program ends the
    process with ``ExitCode.missing_dependency``.
    """
    try:
        found_version = version_checker()
    except (CalledProcessError, FileNotFoundError, MissingDependencyError):
        _error_missing_program(log, program, package, required_for, recommended)
        if not recommended:
            sys.exit(ExitCode.missing_dependency)
        return

    # Compare version strings component-wise: as plain strings '10.0' would
    # sort below '9.5'. Non-string versions keep their own comparison.
    if isinstance(found_version, str) and isinstance(need_version, str):
        too_old = _version_key(found_version) < _version_key(need_version)
    else:
        too_old = found_version < need_version

    if too_old:
        _error_old_version(
            log, program, package, need_version, found_version, required_for
        )
        if not recommended:
            sys.exit(ExitCode.missing_dependency)

    log.debug(f'Found {program} {found_version}')

View File

@@ -1,291 +0,0 @@
# © 2017 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import re
from functools import lru_cache
from os import fspath
from shutil import copy
from subprocess import PIPE, STDOUT, run
from tempfile import NamedTemporaryFile
from PIL import Image
from . import get_version
from ..exceptions import SubprocessOutputError
@lru_cache(maxsize=1)
def version():
    """Return the version reported by the 'gs' executable (result is cached)."""
    gs_version = get_version('gs')
    return gs_version
def jpeg_passthrough_available():
    """Returns True if the installed version of Ghostscript supports JPEG passthru

    Prior to 9.23, Ghostscript decoded and re-encoded JPEGs internally. In 9.23
    it gained the ability to keep JPEGs unmodified. However, the 9.23
    implementation was buggy and would delete the last two bytes of images in
    some cases, as reported here.
    https://bugs.ghostscript.com/show_bug.cgi?id=699216
    The issue was fixed for 9.24, hence that is the first version we consider
    the feature available. (However, we don't use 9.24 at all, so the first
    version that allows JPEG passthrough is 9.25.)
    """
    # Compare numerically: as plain strings '10.0' sorts below '9.24', which
    # would wrongly report the feature as unavailable on Ghostscript 10+.
    gs_version = tuple(int(part) for part in re.findall(r'\d+', version()))
    return gs_version >= (9, 24)
def _gs_error_reported(stream):
return re.search(r'error', stream, flags=re.IGNORECASE)
def extract_text(input_file, pageno=1):
    """Run Ghostscript's txtwrite device to obtain text layout information.

    For details on options of -dTextFormat see
    https://www.ghostscript.com/doc/current/VectorDevices.htm#TXT

    Format is like
    <page>
    <line>
    <span bbox="left top right bottom" font="..." size="...">
    <char bbox="...." c="X"/>

    :param input_file: path of the file handed to Ghostscript
    :param pageno: number of page to extract, or all pages if None
    :return: XML-ish text representation in bytes
    :raises SubprocessOutputError: if Ghostscript exits with a non-zero status
    """
    args_gs = [
        'gs',
        '-dQUIET',
        '-dSAFER',
        '-dBATCH',
        '-dNOPAUSE',
        '-sDEVICE=txtwrite',
        '-dTextFormat=0',
    ]
    if pageno is not None:
        # Restrict Ghostscript to the single requested page
        args_gs += ['-dFirstPage=%i' % pageno, '-dLastPage=%i' % pageno]
    # '-o -' sends the device output to stdout
    args_gs += ['-o', '-', fspath(input_file)]

    proc = run(args_gs, stdout=PIPE, stderr=PIPE)
    if proc.returncode == 0:
        return proc.stdout

    raise SubprocessOutputError(
        'Ghostscript text extraction failed\n%s\n%s\n%s'
        % (input_file, proc.stdout.decode(), proc.stderr.decode())
    )
def rasterize_pdf(
    input_file,
    output_file,
    xres,
    yres,
    raster_device,
    log,
    pageno=1,
    page_dpi=None,
    rotation=None,
    filter_vector=False,
):
    """Rasterize one page of a PDF at resolution (xres, yres) in canvas units.

    The image is sized to match the integer pixels dimensions implied by
    (xres, yres) even if those numbers are noninteger. The image's DPI will
    be overridden with the values in page_dpi.

    Ghostscript renders into a temporary file; the result is then opened with
    PIL, optionally rotated, and saved to output_file.

    :param input_file: pathlike
    :param output_file: pathlike
    :param xres: resolution at which to rasterize page
    :param yres:
    :param raster_device: name passed to Ghostscript's -sDEVICE option
    :param log: logger for the Ghostscript command line and its output
    :param pageno: page number to rasterize (beginning at page 1)
    :param page_dpi: resolution tuple (x, y) overriding output image DPI;
        defaults to the rounded (xres, yres)
    :param rotation: 0, 90, 180, 270: clockwise angle to rotate page
    :param filter_vector: if True, remove vector graphics objects
    :return: None
    :raises SubprocessOutputError: if Ghostscript exits with a non-zero status
    """
    # Round to 6 decimal places; the values are formatted with ':f' below
    res = round(xres, 6), round(yres, 6)
    if not page_dpi:
        page_dpi = res
    with NamedTemporaryFile(delete=True) as tmp:
        args_gs = (
            [
                'gs',
                '-dQUIET',
                '-dSAFER',
                '-dBATCH',
                '-dNOPAUSE',
                f'-sDEVICE={raster_device}',
                f'-dFirstPage={pageno}',
                f'-dLastPage={pageno}',
                f'-r{res[0]:f}x{res[1]:f}',
            ]
            + (['-dFILTERVECTOR'] if filter_vector else [])
            + [
                '-o',
                tmp.name,
                '-dAutoRotatePages=/None',  # Probably has no effect on raster
                '-f',
                fspath(input_file),
            ]
        )
        log.debug(args_gs)
        p = run(args_gs, stdout=PIPE, stderr=STDOUT, universal_newlines=True)
        # Output that mentions "error" is logged as an error even when the
        # exit status is zero
        if _gs_error_reported(p.stdout):
            log.error(p.stdout)
        else:
            log.debug(p.stdout)

        if p.returncode != 0:
            log.error('Ghostscript rasterizing failed')
            raise SubprocessOutputError()

        # Ghostscript wrote to the file by name; rewind our own handle before
        # reading it back
        tmp.seek(0)
        with Image.open(tmp) as im:
            if rotation is not None:
                log.debug("Rotating output by %i", rotation)
                # rotation is a clockwise angle and Image.ROTATE_* is
                # counterclockwise so this cancels out the rotation
                if rotation == 90:
                    im = im.transpose(Image.ROTATE_90)
                elif rotation == 180:
                    im = im.transpose(Image.ROTATE_180)
                elif rotation == 270:
                    im = im.transpose(Image.ROTATE_270)
                # A quarter turn swaps the axes, so swap the DPI pair as well
                if rotation % 180 == 90:
                    page_dpi = page_dpi[1], page_dpi[0]
            im.save(fspath(output_file), dpi=page_dpi)
def generate_pdfa(
    pdf_pages,
    output_file,
    compression,
    log,
    threads=1,
    pdf_version='1.5',
    pdfa_part='2',
):
    """Generate a PDF/A.

    The pdf_pages, a list of files, will be merged into output_file. One or
    more PDF files may be merged. One of the files in this list must be a
    pdfmark file that provides Ghostscript with details on how to perform the
    PDF/A conversion. By default we pick PDF/A-2b, but this works for 1 or 3.

    compression can be 'jpeg', 'lossless', or an empty string. In 'jpeg',
    Ghostscript is instructed to convert color and grayscale images to DCT
    (JPEG encoding). In 'lossless' Ghostscript is told to convert images to
    Flate (lossless/PNG). If the parameter is omitted Ghostscript is left to
    make its own decisions about how to encode images; it appears to use a
    heuristic to decide how to encode images. As of Ghostscript 9.25, we
    support passthrough JPEG which allows Ghostscript to avoid transcoding
    images entirely. (The feature was added in 9.23 but broken, and the 9.24
    release of Ghostscript had regressions, so we don't support it until 9.25.)

    :param pdf_pages: iterable of pathlike input files, passed to Ghostscript
        in order
    :param output_file: pathlike destination for the generated file
    :param compression: 'jpeg', 'lossless', or anything else for automatic
    :param log: logger for the Ghostscript command line and its output
    :param threads: not used by this function
    :param pdf_version: value for -dCompatibilityLevel
    :param pdfa_part: value for -dPDFA, as a string
    :raises SubprocessOutputError: if Ghostscript exits with a non-zero status
    """
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    # Compare versions numerically: as plain strings '10.0' sorts below
    # '9.19', which would select the wrong options on Ghostscript 10+.
    gs_version = tuple(int(part) for part in re.findall(r'\d+', version()))

    # Older versions of Ghostscript expect a leading slash in
    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
    # git commit fe1c025d.
    strategy = 'RGB' if gs_version >= (9, 19) else '/RGB'

    if gs_version[:2] == (9, 23):
        # 9.23: new feature JPEG passthrough is broken in some cases, best to
        # disable it always
        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
        compression_args.append('-dPassThroughJPEGImages=false')

    with NamedTemporaryFile(delete=True) as gs_pdf:
        # nb no need to specify ProcessColorModel when ColorConversionStrategy
        # is set; see:
        # https://bugs.ghostscript.com/show_bug.cgi?id=699392
        args_gs = (
            [
                "gs",
                "-dQUIET",
                "-dBATCH",
                "-dNOPAUSE",
                "-dCompatibilityLevel=" + str(pdf_version),
                "-sDEVICE=pdfwrite",
                "-dAutoRotatePages=/None",
                "-sColorConversionStrategy=" + strategy,
            ]
            + compression_args
            + [
                "-dJPEGQ=95",
                "-dPDFA=" + pdfa_part,
                "-dPDFACompatibilityPolicy=1",
                "-sOutputFile=" + gs_pdf.name,
            ]
        )
        args_gs.extend(fspath(s) for s in pdf_pages)  # Stringify Path objs
        log.debug(args_gs)
        p = run(args_gs, stdout=PIPE, stderr=STDOUT, universal_newlines=True)

        if _gs_error_reported(p.stdout):
            log.error(p.stdout)
        elif 'overprint mode not set' in p.stdout:
            # Unless someone is going to print PDF/A documents on a
            # magical sRGB printer I can't see the removal of overprinting
            # being a problem....
            log.debug(
                "Ghostscript had to remove PDF 'overprinting' from the "
                "input file to complete PDF/A conversion. "
            )
        else:
            log.debug(p.stdout)

        if p.returncode == 0:
            # Ghostscript does not change return code when it fails to create
            # PDF/A - check PDF/A status elsewhere
            copy(gs_pdf.name, fspath(output_file))
        else:
            log.error('Ghostscript PDF/A rendering failed')
            raise SubprocessOutputError()

View File

@@ -1,70 +0,0 @@
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
from functools import lru_cache
from subprocess import run
from tempfile import NamedTemporaryFile
from PIL import Image
from . import get_version
from ..exceptions import MissingDependencyError
@lru_cache(maxsize=1)
def version():
    """Return the version reported by the 'pngquant' executable (result is cached)."""
    pngquant_version = get_version('pngquant', regex=r'(\d+(\.\d+)*).*')
    return pngquant_version
def available():
    """Return True if pngquant's version can be determined, False if the
    lookup raises MissingDependencyError."""
    try:
        version()
    except MissingDependencyError:
        return False
    else:
        return True
def quantize(input_file, output_file, quality_min, quality_max):
    """Quantize an image with pngquant, writing the result to output_file.

    An input whose name ends in '.jpg' is first converted to a temporary PNG,
    which is then given to pngquant.

    :param input_file: str or pathlike source image
    :param output_file: str or pathlike destination, passed to --output
    :param quality_min: lower bound passed to pngquant's --quality
    :param quality_max: upper bound passed to pngquant's --quality
    """
    # Imported locally because this module does not import fspath at top level
    from os import fspath

    def _run_pngquant(source):
        # Single place that builds the pngquant command line
        args = [
            'pngquant',
            '--force',
            '--skip-if-larger',
            '--output',
            fspath(output_file),
            '--quality',
            f'{quality_min}-{quality_max}',
            '--',
            fspath(source),
        ]
        run(args)

    # fspath() lets pathlib.Path inputs work; Path has no .endswith()
    if fspath(input_file).endswith('.jpg'):
        # Close the source image when done instead of leaking the handle
        with Image.open(input_file) as im, NamedTemporaryFile(suffix='.png') as tmp:
            im.save(tmp)
            # pngquant reads the file by name, so make sure the data is on disk
            tmp.flush()
            _run_pngquant(tmp.name)
    else:
        _run_pngquant(input_file)

View File

@@ -1,49 +0,0 @@
# © 2017 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
from functools import lru_cache
from os import fspath
from subprocess import PIPE, STDOUT, CalledProcessError, run
from . import get_version
@lru_cache(maxsize=1)
def version():
    """Return the qpdf version string obtained through ``get_version``.

    A successful result is memoized by ``lru_cache``.
    """
    # The regex captures everything after the literal "qpdf version ".
    return get_version('qpdf', regex=r'qpdf version (.+)')
def check(input_file, log=None):
    """Run ``qpdf --check`` on a file and log what it reports.

    Args:
        input_file: Path of the PDF to check.
        log: Logger-like object; the ``logging`` module itself is used when
            omitted.

    Returns:
        True when qpdf exits successfully, False when it exits with a
        non-zero status (the output is logged at a level chosen from the
        exit status).
    """
    command = ['qpdf', '--check', fspath(input_file)]
    if log is None:
        import logging as log

    try:
        run(command, stderr=STDOUT, stdout=PIPE, universal_newlines=True, check=True)
    except CalledProcessError as failure:
        status = failure.returncode
        report = failure.output
        if status == 2:
            log.error("%s: not a valid PDF, and could not repair it.", input_file)
            log.error("Details:")
            log.error(report)
        elif status == 3:
            log.info("qpdf --check returned warnings:")
            log.info(report)
        else:
            log.warning(report)
        return False
    else:
        return True

View File

@@ -1,361 +0,0 @@
# © 2017 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import os
import shutil
import sys
from collections import namedtuple
from contextlib import suppress
from functools import lru_cache
from os import fspath
from subprocess import (
PIPE,
STDOUT,
CalledProcessError,
TimeoutExpired,
check_output,
run,
)
from textwrap import dedent
from . import get_version
from ..exceptions import MissingDependencyError, TesseractConfigError
from ..helpers import page_number
OrientationConfidence = namedtuple('OrientationConfidence', ('angle', 'confidence'))
HOCR_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='tesseract 4.0.0' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "_blank.png"; bbox 0 0 {0} {1}; ppageno 0'>
</div>
</body>
</html>
"""
@lru_cache(maxsize=1)
def version():
    """Return the Tesseract version string obtained through ``get_version``.

    A successful result is memoized by ``lru_cache``.
    """
    # The regex captures everything after "tesseract" and one whitespace char.
    return get_version('tesseract', regex=r'tesseract\s(.+)')
def v4():
    """Is the installed Tesseract version 4 or newer?

    The major version is taken from the first run of digits in ``version()``
    and compared numerically, so a two-digit major version such as "10.0" is
    not mistaken for something older than "4" (which a plain string
    comparison would do).

    Returns:
        True when the major version is at least 4.
    """
    import re  # local import: this module does not import ``re`` at file level

    reported = version()
    major = re.search(r'\d+', reported)
    if major is None:
        # Nothing numeric to parse; fall back to the plain string comparison.
        return reported >= '4'
    return int(major.group()) >= 4
@lru_cache(maxsize=1)
def has_textonly_pdf():
    """Does Tesseract have textonly_pdf capability?

    Available in v4.00.00alpha since January 2017. The check looks for the
    parameter name in the output of ``tesseract --print-parameters pdf``.

    Raises:
        MissingDependencyError: if the tesseract command exits with an error.
    """
    command = ['tesseract', '--print-parameters', 'pdf']
    try:
        parameter_dump = check_output(command, universal_newlines=True, stderr=STDOUT)
    except CalledProcessError as e:
        print("Could not --print-parameters from tesseract", file=sys.stderr)
        raise MissingDependencyError from e
    return 'textonly_pdf' in parameter_dump
@lru_cache(maxsize=1)
def languages():
    """Return the set of language names reported by ``tesseract --list-langs``.

    Returns:
        A set of stripped lines following the header line of the output.

    Raises:
        MissingDependencyError: if tesseract exits with an error, prints
            nothing at all, or prints output whose first line is not the
            expected "List of available languages" header.
    """

    def lang_error(output):
        # Report the raw Tesseract output on stderr to help diagnosis.
        msg = dedent(
            """Tesseract failed to report available languages.
Output from Tesseract:
-----------
"""
        )
        msg += output
        print(msg, file=sys.stderr)

    args_tess = ['tesseract', '--list-langs']
    try:
        proc = run(
            args_tess, universal_newlines=True, stdout=PIPE, stderr=STDOUT, check=True
        )
        output = proc.stdout
    except CalledProcessError as e:
        lang_error(e.output)
        raise MissingDependencyError from e

    lines = output.splitlines()
    # Empty output has no header line; treat it like any other malformed
    # report instead of failing on tuple unpacking.
    if not lines or not lines[0].startswith('List of available languages'):
        lang_error(output)
        raise MissingDependencyError
    return set(lang.strip() for lang in lines[1:])
def tess_base_args(langs, engine_mode):
    """Build the leading part of a tesseract command line.

    Args:
        langs: Language names; joined with ``+`` for ``-l`` when non-empty.
        engine_mode: Value for ``--oem``; only added when it is not None and
            ``v4()`` is true.

    Returns:
        A new argument list starting with ``'tesseract'``.
    """
    language_option = ['-l', '+'.join(langs)] if langs else []
    command = ['tesseract'] + language_option
    if engine_mode is None:
        return command
    if v4():
        command += ['--oem', str(engine_mode)]
    return command
def get_orientation(input_file, engine_mode, timeout: float, log):
    """Ask Tesseract's orientation detection (``--psm 0``) about an image.

    Args:
        input_file: Path of the image to analyze.
        engine_mode: Passed through to ``tess_base_args``.
        timeout: Timeout handed to ``check_output``.
        log: Logger used when Tesseract exits with an error.

    Returns:
        An ``OrientationConfidence``. Angle 0 with zero confidence is returned
        on timeout and for the two tolerated Tesseract error messages.

    Raises:
        CalledProcessError: for any other Tesseract failure.
    """
    command = tess_base_args(['osd'], engine_mode) + [
        '--psm',
        '0',
        fspath(input_file),
        'stdout',
    ]

    try:
        raw_report = check_output(command, stderr=STDOUT, timeout=timeout)
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        tolerated = (b'Too few characters. Skipping this page', b'Image too large')
        if any(marker in e.output for marker in tolerated):
            return OrientationConfidence(0, 0)
        raise e from e

    # Collect "key: value" lines; lines that do not split into exactly two
    # parts are ignored.
    fields = {}
    for raw_line in raw_report.decode().splitlines():
        pieces = raw_line.strip().split(':', maxsplit=2)
        if len(pieces) != 2:
            continue
        key, value = pieces
        fields[key.strip()] = value.strip()

    return OrientationConfidence(
        angle=int(fields.get('Orientation in degrees', 0)),
        confidence=float(fields.get('Orientation confidence', 0)),
    )
def tesseract_log_output(log, stdout, input_file):
    """Relay Tesseract's console output to ``log``, one line at a time.

    Args:
        log: Logger receiving the messages.
        stdout: Raw bytes captured from Tesseract.
        input_file: Page file; its page number is used as the message prefix.

    Raises:
        TesseractConfigError: when a line reports a parameter that was not
            found.
    """
    prefix = f"{(page_number(input_file)):4d}: [tesseract] "

    try:
        text = stdout.decode()
    except UnicodeDecodeError:
        log.error(
            prefix
            + "command line output was not utf-8. "
            + "This usually means Tesseract's language packs do not match "
            "the installed version of Tesseract."
        )
        # Decode anyway, escaping the undecodable bytes, so the lines below
        # can still be relayed.
        text = stdout.decode('utf-8', 'backslashreplace')

    lines = text.splitlines()
    # The branches are tested in order, so the specific patterns must stay
    # ahead of the generic 'error' / 'warning' matches.
    for line in lines:
        if line.startswith("Tesseract Open Source"):
            continue
        elif line.startswith("Warning in pixReadMem"):
            continue
        elif 'diacritics' in line:
            log.warning(prefix + "lots of diacritics - possibly poor OCR")
        elif line.startswith('OSD: Weak margin'):
            log.warning(prefix + "unsure about page orientation")
        elif 'Error in pixScanForForeground' in line:
            pass  # Appears to be spurious/problem with nonwhite borders
        elif 'Error in boxClipToRectangle' in line:
            pass  # Always appears with pixScanForForeground message
        elif 'parameter not found: ' in line.lower():
            log.error(prefix + line.strip())
            # NOTE(review): the match above is case-insensitive but this split
            # is case-sensitive; a differently-cased line would raise
            # IndexError here - confirm against real Tesseract output.
            problem = line.split('found: ')[1]
            raise TesseractConfigError(problem)
        elif 'error' in line.lower() or 'exception' in line.lower():
            log.error(prefix + line.strip())
        elif 'warning' in line.lower():
            log.warning(prefix + line.strip())
        elif 'read_params_file' in line.lower():
            log.error(prefix + line.strip())
        else:
            log.info(prefix + line.strip())
def page_timedout(log, input_file, timeout):
    """Warn that OCR of a page was abandoned because it took too long.

    Nothing is logged when ``timeout`` equals 0.
    """
    if timeout != 0:
        page_label = f"{(page_number(input_file)):4d}: [tesseract] "
        log.warning(page_label + " took too long to OCR - skipping")
def _generate_null_hocr(output_hocr, output_sidecar, image):
    """Write an hOCR file with no text, sized like ``image``, plus a sidecar.

    The hOCR page is ``HOCR_TEMPLATE`` filled in with the image's width and
    height; the sidecar text file contains the marker ``[skipped page]``.
    """
    from PIL import Image

    width, height = Image.open(image).size

    with open(output_hocr, 'w', encoding="utf-8") as hocr_file:
        hocr_file.write(HOCR_TEMPLATE.format(width, height))
    with open(output_sidecar, 'w', encoding='utf-8') as sidecar_file:
        sidecar_file.write('[skipped page]')
def generate_hocr(
    input_file,
    output_files,
    language: list,
    engine_mode,
    tessconfig: list,
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
    log,
):
    """Run Tesseract to produce an hOCR file and a sidecar text file.

    Args:
        input_file: Image to OCR.
        output_files: Paths; the first ending in ``.hocr`` and the first
            ending in ``.txt`` are used as the outputs.
        language: Languages passed to ``tess_base_args``.
        engine_mode: Engine mode passed to ``tess_base_args``.
        tessconfig: Extra arguments appended to the command line.
        timeout: Timeout handed to ``check_output``.
        pagesegmode: Value for ``--psm``; omitted when None.
        user_words: Value for ``--user-words``; omitted when falsy.
        user_patterns: Value for ``--user-patterns``; omitted when falsy.
        log: Logger.

    Raises:
        CalledProcessError: when Tesseract fails for a reason other than
            "Image too large".
    """
    output_hocr = next(o for o in output_files if o.endswith('.hocr'))
    output_sidecar = next(o for o in output_files if o.endswith('.txt'))
    # Tesseract appends the suffixes itself, so it is given the bare prefix.
    prefix = os.path.splitext(output_hocr)[0]

    args_tesseract = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])

    if user_words:
        args_tesseract.extend(['--user-words', user_words])

    if user_patterns:
        args_tesseract.extend(['--user-patterns', user_patterns])

    # Reminder: test suite tesseract spoofers will break after any changes
    # to the number of order parameters here
    args_tesseract.extend([input_file, prefix, 'hocr', 'txt'] + tessconfig)

    try:
        log.debug(args_tesseract)
        stdout = check_output(args_tesseract, stderr=STDOUT, timeout=timeout)
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(log, input_file, timeout)
        _generate_null_hocr(output_hocr, output_sidecar, input_file)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        # An oversized image is tolerated: emit an empty page instead of failing.
        if b'Image too large' in e.output:
            _generate_null_hocr(output_hocr, output_sidecar, input_file)
            return

        raise e from e
    else:
        tesseract_log_output(log, stdout, input_file)
        # The sidecar text file will get the suffix .txt; rename it to
        # whatever caller wants it named
        if os.path.exists(prefix + '.txt'):
            shutil.move(prefix + '.txt', output_sidecar)
def use_skip_page(text_only, skip_pdf, output_pdf, output_text):
    """Produce placeholder outputs for a page that was not OCRed.

    The text output always receives the marker ``[skipped page]``. The PDF
    output becomes a symlink to ``skip_pdf`` when one is given and
    ``text_only`` is false; otherwise it is written as a zero-byte file.
    """
    with open(output_text, 'w') as sidecar:
        sidecar.write('[skipped page]')

    substitute = skip_pdf and not text_only
    if not substitute:
        # A 0 byte file at the output indicates a skip.
        with open(output_pdf, 'wb') as empty_pdf:
            empty_pdf.write(b'')
    else:
        # Substitute a "skipped page", replacing any partially created output.
        with suppress(FileNotFoundError):
            os.remove(output_pdf)
        os.symlink(skip_pdf, output_pdf)
def generate_pdf(
    *,
    input_image,
    skip_pdf=None,
    output_pdf,
    output_text,
    language: list,
    engine_mode,
    text_only: bool,
    tessconfig: list,
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
    log,
):
    '''Use Tesseract to render a PDF.

    input_image -- image to analyze
    skip_pdf -- if we time out, use this file as output
    output_pdf -- file to generate
    output_text -- OCR text file
    language -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    text_only -- enable tesseract text only mode?
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- value for --psm; omitted when None
    user_words -- value for --user-words; omitted when falsy
    user_patterns -- value for --user-patterns; omitted when falsy
    log -- logger object

    Raises CalledProcessError when Tesseract fails for a reason other than
    "Image too large".
    '''

    args_tesseract = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])

    # Text-only mode is requested only if this Tesseract reports the parameter.
    if text_only and has_textonly_pdf():
        args_tesseract.extend(['-c', 'textonly_pdf=1'])

    if user_words:
        args_tesseract.extend(['--user-words', user_words])

    if user_patterns:
        args_tesseract.extend(['--user-patterns', user_patterns])

    prefix = os.path.splitext(output_pdf)[0]  # Tesseract appends suffixes

    # Reminder: test suite tesseract spoofers might break after any changes
    # to the number of order parameters here

    args_tesseract.extend([input_image, prefix, 'pdf', 'txt'] + tessconfig)

    try:
        log.debug(args_tesseract)
        stdout = check_output(args_tesseract, stderr=STDOUT, timeout=timeout)
        # Tesseract names the text output <prefix>.txt; move it to the
        # requested location.
        if os.path.exists(prefix + '.txt'):
            shutil.move(prefix + '.txt', output_text)
    except TimeoutExpired:
        page_timedout(log, input_image, timeout)
        use_skip_page(text_only, skip_pdf, output_pdf, output_text)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_image)
        # An oversized image is tolerated: emit a skipped page instead of failing.
        if b'Image too large' in e.output:
            use_skip_page(text_only, skip_pdf, output_pdf, output_text)
            return
        raise e from e
    else:
        tesseract_log_output(log, stdout, input_image)

View File

@@ -1,129 +0,0 @@
# © 2015 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
# unpaper documentation:
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
import os
import shlex
import subprocess
import sys
from functools import lru_cache
from subprocess import PIPE, STDOUT, CalledProcessError
from tempfile import TemporaryDirectory
from . import get_version
from ..exceptions import MissingDependencyError, SubprocessOutputError
try:
from PIL import Image
except ImportError:
print("Could not find Python3 imaging library", file=sys.stderr)
raise
@lru_cache(maxsize=1)
def version():
    """Return the unpaper version obtained through ``get_version``.

    A successful result is memoized by ``lru_cache``.
    """
    return get_version('unpaper')
def run(input_file, output_file, dpi, log, mode_args):
    """Run unpaper on an image and save the cleaned result.

    The input is converted to a PNM-family file inside a temporary directory,
    unpaper is executed there, and its output is re-saved to ``output_file``
    with the requested DPI.

    Args:
        input_file: Image to clean.
        output_file: Where the cleaned image is saved.
        dpi: Resolution passed to unpaper and stamped on the saved output.
        log: Logger.
        mode_args: List of extra unpaper arguments.

    Raises:
        MissingDependencyError: if the image cannot be converted to a mode
            that has a known PNM suffix.
        CalledProcessError: if unpaper exits with an error.
        SubprocessOutputError: if unpaper's output file cannot be read back.
    """
    args_unpaper = ['unpaper', '-v', '--dpi', str(dpi)] + mode_args

    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    im = Image.open(input_file)
    if im.mode not in SUFFIXES:
        log.info("Converting image to other colorspace")
        try:
            if im.mode == 'P' and len(im.getcolors()) == 2:
                im = im.convert(mode='1')
            else:
                im = im.convert(mode='RGB')
        except IOError as e:
            log.error("Could not convert image with type " + im.mode)
            im.close()
            raise MissingDependencyError() from e

    try:
        suffix = SUFFIXES[im.mode]
    except KeyError as e:
        # Bind the KeyError so it can be chained as the cause; without the
        # binding, ``e`` is undefined at this point.
        log.error("Failed to convert image to a supported format.")
        im.close()
        raise MissingDependencyError() from e

    with TemporaryDirectory() as tmpdir:
        input_pnm = os.path.join(tmpdir, f'input{suffix}')
        output_pnm = os.path.join(tmpdir, f'output{suffix}')
        im.save(input_pnm, format='PPM')
        im.close()

        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        args_unpaper.extend([input_pnm, output_pnm])
        try:
            proc = subprocess.run(
                args_unpaper,
                check=True,
                close_fds=True,
                universal_newlines=True,
                stderr=STDOUT,
                cwd=tmpdir,
                stdout=PIPE,
            )
        except CalledProcessError as e:
            log.debug(e.output)
            # Re-raise the same exception without chaining it to itself.
            raise
        else:
            log.debug(proc.stdout)
            # unpaper sets dpi to 72; fix this
            try:
                # Context manager closes the handle before tmpdir is removed.
                with Image.open(output_pnm) as cleaned:
                    cleaned.save(output_file, dpi=(dpi, dpi))
            except (FileNotFoundError, OSError):
                raise SubprocessOutputError(
                    "unpaper: failed to produce the expected output file. Called with: "
                    + str(args_unpaper)
                ) from None
def validate_custom_args(args: str):
    """Split a user-supplied unpaper argument string and reject paths.

    Args:
        args: Shell-style argument string.

    Returns:
        The list of tokens produced by ``shlex.split``.

    Raises:
        ValueError: if any token contains ``/``.
    """
    tokens = shlex.split(args)
    for token in tokens:
        if '/' in token:
            raise ValueError('No filenames allowed in --unpaper-args')
    return tokens
def clean(input_file, output_file, dpi, log, unpaper_args=None):
    """Clean an image with unpaper, using conservative defaults.

    When ``unpaper_args`` is empty or omitted, a default argument set is used
    that disables layout detection, border alignment, mask centering, the
    gray and black filters, and deskewing.
    """
    effective_args = unpaper_args or [
        '--layout',
        'none',
        '--mask-scan-size',
        '100',  # don't blank out narrow columns
        '--no-border-align',  # don't align visible content to borders
        '--no-mask-center',  # don't center visible content within page
        '--no-grayfilter',  # don't remove light gray areas
        '--no-blackfilter',  # don't remove solid black areas
        '--no-deskew',  # don't deskew
    ]
    run(input_file, output_file, dpi, log, effective_args)

View File

@@ -0,0 +1,191 @@
# © 2021 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Semaphore-free alternate executor.
There are two popular environments that do not fully support the standard Python
multiprocessing module: AWS Lambda, and Termux (a terminal emulator for Android).
This alternate executor divvies up work among worker processes before processing,
rather than having each worker consume work from a shared queue when they finish
their task. This means workers have no need to coordinate with each other. Each
worker communicates only with the main process.
This is not without drawbacks. If the tasks are not "even" in size, which cannot
be guaranteed, some workers may end up with too much work while others are idle.
It is less efficient than the standard implementation, so it is not the default.
"""
import logging
import logging.handlers
import signal
from contextlib import suppress
from enum import Enum, auto
from itertools import islice, repeat, takewhile, zip_longest
from multiprocessing import Pipe, Process
from multiprocessing.connection import Connection, wait
from typing import Callable, Iterable, Iterator
from ocrmypdf import Executor, hookimpl
from ocrmypdf._concurrent import NullProgressBar
from ocrmypdf.exceptions import InputFileError
from ocrmypdf.helpers import remove_all_log_handlers
class MessageType(Enum):
    """Tags for the (type, payload) messages a worker sends to the parent."""

    exception = auto()  # payload is the exception raised by the task
    result = auto()  # payload is the value returned by the task
    complete = auto()  # payload is None; the worker has no more work
def split_every(n: int, iterable: Iterable) -> Iterator:
    """Split iterable into groups of n.

    The last group holds the remainder and may be shorter than ``n``.

    >>> list(split_every(4, range(10)))
    [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]

    https://stackoverflow.com/a/22919323
    """
    iterator = iter(iterable)
    # Keep slicing off lists of up to n items; the first empty list (falsy)
    # means the iterator is exhausted and ends the sequence.
    return takewhile(bool, (list(islice(iterator, n)) for _ in repeat(None)))
def process_sigbus(*args):
    """Signal handler that turns SIGBUS into an ``InputFileError``.

    The handler arguments are accepted and ignored.
    """
    raise InputFileError("A worker process lost access to an input file")
class ConnectionLogHandler(logging.handlers.QueueHandler):
    """Log handler that forwards records over a connection instead of a queue."""

    def __init__(self, conn: Connection) -> None:
        # No queue object is used; records are sent through ``conn``.
        super().__init__(None)
        self.conn = conn

    def enqueue(self, record):
        """Send ``record`` to the other end, tagged with ``'log'``."""
        message = ('log', record)
        self.conn.send(message)
def process_loop(
    conn: Connection, user_init: Callable[[], None], loglevel, task, task_args
):
    """Body of a worker process: run ``task`` over ``task_args`` and report.

    Each result is sent over ``conn`` as ``(MessageType.result, value)``. The
    first exception is sent as ``(MessageType.exception, exc)`` and stops the
    loop. A final ``(MessageType.complete, None)`` is always sent before the
    connection is closed.
    """
    # Install SIGBUS handler (so our parent process can abort somewhat gracefully)
    with suppress(AttributeError):  # Windows and Cygwin do not have SIGBUS
        signal.signal(signal.SIGBUS, process_sigbus)

    # Route every log message from this process to the parent via ``conn``.
    root_logger = logging.getLogger()
    remove_all_log_handlers(root_logger)
    root_logger.setLevel(loglevel)
    root_logger.addHandler(ConnectionLogHandler(conn))

    user_init()

    for args in task_args:
        try:
            outcome = (MessageType.result, task(args))
        except Exception as e:
            conn.send((MessageType.exception, e))
            break
        conn.send(outcome)

    conn.send((MessageType.complete, None))
    conn.close()
class LambdaExecutor(Executor):
    """Executor that pre-assigns tasks to worker processes connected by pipes."""

    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        tqdm_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):
        """Run ``task`` over ``task_arguments`` and report each result.

        Args:
            use_threads: With ``max_workers == 1``, selects in-process serial
                execution.
            max_workers: Number of task groups (and worker processes).
            tqdm_kwargs: Keyword arguments for the progress bar class.
            worker_initializer: Called once in each worker process.
            task: Callable taking one element of ``task_arguments``.
            task_arguments: The work items.
            task_finished: Called as ``task_finished(result, pbar)`` for each
                result; skipped when falsy.

        Raises:
            Exception: whatever exception a worker reports; all workers are
                terminated first.
        """
        if use_threads and max_workers == 1:
            with self.pbar_class(**tqdm_kwargs) as pbar:
                for args in task_arguments:
                    result = task(args)
                    # Same guard as the multi-process path below, so a falsy
                    # callback is tolerated in both modes.
                    if task_finished:
                        task_finished(result, pbar)
            return

        task_arguments = list(task_arguments)
        # Chunk into rows of max_workers, then transpose: each resulting
        # tuple is one worker's share, padded with None where rows run short.
        grouped_args = list(
            zip_longest(*list(split_every(max_workers, task_arguments)))
        )
        if not grouped_args:
            return

        processes = []
        connections = []
        for chunk in grouped_args:
            parent_conn, child_conn = Pipe()

            # Drop the None padding added by zip_longest.
            worker_args = [args for args in chunk if args is not None]
            process = Process(
                target=process_loop,
                args=(
                    child_conn,
                    worker_initializer,
                    logging.getLogger("").level,
                    task,
                    worker_args,
                ),
            )
            process.daemon = True
            processes.append(process)
            connections.append(parent_conn)

        for process in processes:
            process.start()

        with self.pbar_class(**tqdm_kwargs) as pbar:
            # Service messages until every worker has completed or closed.
            while connections:
                for r in wait(connections):
                    try:
                        msg_type, msg = r.recv()
                    except EOFError:
                        connections.remove(r)
                        continue

                    if msg_type == MessageType.result:
                        if task_finished:
                            task_finished(msg, pbar)
                    elif msg_type == 'log':
                        # Re-emit the worker's log record in this process.
                        record = msg
                        logger = logging.getLogger(record.name)
                        logger.handle(record)
                    elif msg_type == MessageType.complete:
                        connections.remove(r)
                    elif msg_type == MessageType.exception:
                        for process in processes:
                            process.terminate()
                        raise msg

        for process in processes:
            process.join()
@hookimpl
def get_executor(progressbar_class):
    """Hook implementation: supply a ``LambdaExecutor`` using the given progress bar class."""
    return LambdaExecutor(pbar_class=progressbar_class)
@hookimpl
def get_logging_console():
    """Hook implementation: supply a plain ``logging.StreamHandler`` as the console handler."""
    return logging.StreamHandler()
@hookimpl
def get_progressbar_class():
    """Hook implementation: supply ``NullProgressBar`` as the progress bar class."""
    return NullProgressBar

View File

@@ -1,46 +1,88 @@
# © 2016 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import logging
import multiprocessing
import os
import sys
import shutil
import warnings
from collections import namedtuple
from collections.abc import Iterable
from contextlib import suppress
from functools import partial, wraps
from functools import wraps
from io import StringIO
from math import isclose, isfinite
from pathlib import Path
from typing import Any, Sequence
import pikepdf
log = logging.getLogger(__name__)
def re_symlink(input_file, soft_link_name, log=None):
"""
Helper function: relinks soft symbolic link if necessary
class Resolution(namedtuple('Resolution', ('x', 'y'))):
    """The number of pixels per inch in each 2D direction."""

    __slots__ = ()

    def round(self, ndigits: int):
        """Return a new Resolution with both axes rounded to ``ndigits``."""
        return Resolution(round(self.x, ndigits), round(self.y, ndigits))

    def to_int(self):
        """Return a new Resolution with both axes rounded to integers."""
        return Resolution(int(round(self.x)), int(round(self.y)))

    @property
    def is_square(self) -> bool:
        """True when x and y agree within a relative tolerance of 1e-3."""
        return isclose(self.x, self.y, rel_tol=1e-3)

    @property
    def is_finite(self) -> bool:
        """True when neither axis is infinite or NaN."""
        return isfinite(self.x) and isfinite(self.y)

    def take_max(self, vals, yvals=None):
        """Return the per-axis maximum of this resolution and other values.

        Args:
            vals: When ``yvals`` is given, an iterable of x values; otherwise
                an iterable of ``(x, y)`` pairs.
            yvals: Optional iterable of y values.

        Either iterable may be empty, in which case that axis keeps its
        current value.
        """
        if yvals is not None:
            # Wrap in a tuple so max() still works when vals or yvals is
            # empty; max(value) with a lone number raises TypeError.
            return Resolution(max((self.x, *vals)), max((self.y, *yvals)))
        max_x, max_y = self.x, self.y
        for x, y in vals:
            max_x = max(x, max_x)
            max_y = max(y, max_y)
        return Resolution(max_x, max_y)

    def flip_axis(self):
        """Return a new Resolution with x and y exchanged."""
        return Resolution(self.y, self.x)

    def __str__(self):
        return f"{self.x:f}x{self.y:f}"

    def __repr__(self):  # pragma: no cover
        return f"Resolution({self.x}x{self.y} dpi)"
class NeverRaise(Exception):
    """An exception that is never raised"""

    # Serves as a placeholder exception type for an ``except`` clause that
    # should match nothing.
def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike):
"""Create a symbolic link at ``soft_link_name``, which references ``input_file``.
Think of this as copying ``input_file`` to ``soft_link_name`` with less overhead.
Use symlinks safely. Self-linking loops are prevented. On Windows, file copy is
used since symlinks may require administrator privileges. An existing link at the
destination is removed.
"""
input_file = os.fspath(input_file)
soft_link_name = os.fspath(soft_link_name)
if log is None:
prdebug = partial(print, file=sys.stderr)
else:
prdebug = log.debug
# Guard against soft linking to oneself
if input_file == soft_link_name:
prdebug(
"Warning: No symbolic link made. You are using "
+ "the original data directory as the working directory."
log.warning(
"No symbolic link created. You are using the original data directory "
"as the working directory."
)
return
@@ -48,90 +90,165 @@ def re_symlink(input_file, soft_link_name, log=None):
if os.path.lexists(soft_link_name):
# do not delete or overwrite real (non-soft link) file
if not os.path.islink(soft_link_name):
raise FileExistsError("%s exists and is not a link" % soft_link_name)
try:
os.unlink(soft_link_name)
except OSError:
prdebug("Can't unlink %s" % (soft_link_name))
raise FileExistsError(f"{soft_link_name} exists and is not a link")
os.unlink(soft_link_name)
if not os.path.exists(input_file):
raise FileNotFoundError("trying to create a broken symlink to %s" % input_file)
raise FileNotFoundError(f"trying to create a broken symlink to {input_file}")
prdebug("os.symlink(%s, %s)" % (input_file, soft_link_name))
if os.name == 'nt':
# Don't actually use symlinks on Windows due to permission issues
shutil.copyfile(input_file, soft_link_name)
return
log.debug("os.symlink(%s, %s)", input_file, soft_link_name)
# Create symbolic link using absolute path
os.symlink(os.path.abspath(input_file), soft_link_name)
def is_iterable_notstr(thing):
def samefile(f1: os.PathLike, f2: os.PathLike):
    """Tell whether two paths refer to the same file.

    On Windows (``os.name == 'nt'``) the two arguments are simply compared
    for equality; elsewhere ``os.path.samefile`` decides.
    """
    if os.name != 'nt':
        return os.path.samefile(f1, f2)
    return f1 == f2
def is_iterable_notstr(thing: Any) -> bool:
    """Is this an iterable type, other than a string?"""
    if isinstance(thing, str):
        return False
    return isinstance(thing, Iterable)
def page_number(input_file):
def monotonic(L: Sequence) -> bool:
    """Does this sequence increase strictly from each element to the next?"""
    for earlier, later in zip(L, L[1:]):
        if not later > earlier:
            return False
    return True
def page_number(input_file: os.PathLike) -> int:
    """Get one-based page number implied by filename (000002.pdf -> 2)"""
    filename = os.path.basename(os.fspath(input_file))
    # The page number is the first six characters of the file name.
    return int(filename[:6])
def available_cpu_count():
def available_cpu_count() -> int:
    """Returns number of CPUs in the system.

    ``multiprocessing.cpu_count()`` is tried first, then ``psutil`` if it is
    installed. When neither yields a count, a warning is issued and 1 is
    returned.
    """
    try:
        return multiprocessing.cpu_count()
    except NotImplementedError:
        pass

    try:
        import psutil

        count = psutil.cpu_count()
        # psutil returns None when the count is undetermined; fall through to
        # the warning rather than returning a non-integer.
        if count:
            return count
    except (ImportError, AttributeError):
        pass

    warnings.warn(
        "Could not get CPU count. Assuming one (1) CPU. Use -j N to set manually."
    )
    return 1
def is_file_writable(test_file):
def is_file_writable(test_file: os.PathLike) -> bool:
"""Intentionally racy test if target is writable.
We intend to write to the output file if and only if we succeed and
can replace it atomically. Before doing the OCR work, make sure
the location is writable.
"""
p = Path(test_file)
try:
p = Path(test_file)
if p.is_symlink():
p = p.resolve(strict=False)
if p.is_symlink():
p = p.resolve(strict=False)
# p.is_file() throws an exception in some cases
if p.exists() and p.is_file():
return os.access(
os.fspath(p),
os.W_OK,
effective_ids=(os.access in os.supports_effective_ids),
)
else:
try:
fp = p.open('wb')
except OSError:
return False
else:
fp.close()
with suppress(OSError):
p.unlink()
return True
except (EnvironmentError, RuntimeError) as e:
log.debug(e)
log.error(str(e))
return False
# p.is_file() throws an exception in some cases
if p.exists() and p.is_file():
return os.access(
os.fspath(p),
os.W_OK,
effective_ids=(os.access in os.supports_effective_ids),
)
def check_pdf(input_file: Path) -> bool:
"""Check if a PDF complies with the PDF specification.
Checks for proper formatting and proper linearization. Uses pikepdf (which in
turn, uses QPDF) to perform the checks.
"""
try:
pdf = pikepdf.open(input_file)
except pikepdf.PdfError as e:
log.error(e)
return False
else:
try:
fp = p.open('wb')
except OSError:
with pdf:
messages = pdf.check()
for msg in messages:
if 'error' in msg.lower():
log.error(msg)
else:
log.warning(msg)
sio = StringIO()
linearize_msgs = ''
try:
# If linearization is missing entirely, we do not complain. We do
# complain if linearization is present but incorrect.
pdf.check_linearization(sio)
except RuntimeError:
pass
except (
# Workaround for a problematic pikepdf version
# pragma: no cover
getattr(pikepdf, 'ForeignObjectError')
if pikepdf.__version__ == '2.1.0'
else NeverRaise
):
pass
else:
linearize_msgs = sio.getvalue()
if linearize_msgs:
log.warning(linearize_msgs)
if not messages and not linearize_msgs:
return True
return False
else:
fp.close()
with suppress(OSError):
p.unlink()
return True
def flatten_groups(groups):
    """Yield the items of ``groups``, expanding non-string iterables one level.

    Elements for which ``is_iterable_notstr`` is true contribute their own
    members; every other element is yielded as is.
    """
    for entry in groups:
        members = entry if is_iterable_notstr(entry) else (entry,)
        yield from members
def clamp(n, smallest, largest):  # mypy doesn't understand types for this
    """Clamps the value of ``n`` to between ``smallest`` and ``largest``."""
    capped = largest if largest < n else n
    return capped if capped > smallest else smallest
def remove_all_log_handlers(logger):
    """Detach and close every handler on ``logger``.

    Usually used in a child process.
    """
    # Iterate over a copy because removeHandler mutates logger.handlers.
    for attached in list(logger.handlers):
        logger.removeHandler(attached)
        attached.close()  # To ensure handlers with opened resources are released
def pikepdf_enable_mmap():
    """Placeholder for enabling pikepdf mmap access; currently does nothing.

    The enabling code is kept below, commented out, for the reason given in
    the comments.
    """
    # try:
    #     if pikepdf._qpdf.set_access_default_mmap(True):
    #         log.debug("pikepdf mmap enabled")
    # except AttributeError:
    #     log.debug("pikepdf mmap not available")
    # We found a race condition probably related to pybind issue #2252 that can
    # cause a crash. For now, disable pikepdf mmap to be on the safe side.
    # Fix is not in pybind11 2.6.0
    # log.debug("pikepdf mmap disabled")
    return
def deprecated(func):
"""Warn that function is deprecated"""
"""Warn that function is deprecated."""
@wraps(func)
def new_func(*args, **kwargs):

View File

@@ -29,15 +29,28 @@
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import argparse
import os
import re
from collections import namedtuple
from itertools import chain
from math import atan, cos, sin
from pathlib import Path
from typing import Any, NamedTuple, Optional, Tuple, Union
from xml.etree import ElementTree
from reportlab.lib.colors import black, cyan, magenta, red
from reportlab.lib.units import inch
from reportlab.pdfgen.canvas import Canvas
Rect = namedtuple('Rect', ['x1', 'y1', 'x2', 'y2'])
Element = ElementTree.Element
class Rect(NamedTuple):  # pylint: disable=inherit-non-class
    """A rectangle for managing PDF coordinates."""

    # Two corner points, (x1, y1) and (x2, y2).
    # NOTE(review): units appear to depend on the producer (pixels or
    # points) - confirm against callers.
    x1: Any
    y1: Any
    x2: Any
    y2: Any
class HocrTransformError(Exception):
@@ -64,9 +77,9 @@ class HocrTransform:
{'': 'ff', '': 'ffi', '': 'ffl', '': 'fi', '': 'fl'}
)
def __init__(self, hocrFileName, dpi):
def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
self.dpi = dpi
self.hocr = ElementTree.parse(hocrFileName)
self.hocr = ElementTree.parse(os.fspath(hocr_filename))
# if the hOCR file has a namespace, ElementTree requires its use to
# find elements
@@ -77,7 +90,7 @@ class HocrTransform:
# get dimension in pt (not pixel!!!!) of the OCRed image
self.width, self.height = None, None
for div in self.hocr.findall(".//%sdiv[@class='ocr_page']" % (self.xmlns)):
for div in self.hocr.findall(self._child_xpath('div', 'ocr_page')):
coords = self.element_coordinates(div)
pt_coords = self.pt_from_pixel(coords)
self.width = pt_coords.x2 - pt_coords.x1
@@ -88,38 +101,38 @@ class HocrTransform:
if self.width is None or self.height is None:
raise HocrTransformError("hocr file is missing page dimensions")
def __str__(self):
def __str__(self): # pragma: no cover
"""
Return the textual content of the HTML body
"""
if self.hocr is None:
return ''
body = self.hocr.find(".//%sbody" % (self.xmlns))
body = self.hocr.find(self._child_xpath('body'))
if body:
return self._get_element_text(body)
else:
return ''
def _get_element_text(self, element):
def _get_element_text(self, element: Element):
"""
Return the textual content of the element and its children
"""
text = ''
if element.text is not None:
text += element.text
for child in element.getchildren():
for child in element:
text += self._get_element_text(child)
if element.tail is not None:
text += element.tail
return text
@classmethod
def element_coordinates(cls, element):
def element_coordinates(cls, element: Element) -> Rect:
"""
Returns a tuple containing the coordinates of the bounding box around
an element
"""
out = (0, 0, 0, 0)
out = Rect._make(0 for _ in range(4))
if 'title' in element.attrib:
matches = cls.box_pattern.search(element.attrib['title'])
if matches:
@@ -128,7 +141,7 @@ class HocrTransform:
return out
@classmethod
def baseline(cls, element):
def baseline(cls, element: Element) -> Tuple[float, float]:
"""
Returns a tuple containing the baseline slope and intercept.
"""
@@ -136,32 +149,47 @@ class HocrTransform:
matches = cls.baseline_pattern.search(element.attrib['title'])
if matches:
return float(matches.group(1)), int(matches.group(2))
return (0, 0)
return (0.0, 0.0)
def pt_from_pixel(self, pxl):
def pt_from_pixel(self, pxl) -> Rect:
    """
    Convert quantities in pixels to PDF units (pt), using ``self.dpi``,
    and return them as a ``Rect``
    """
    points = [(value / self.dpi * inch) for value in pxl]
    return Rect._make(points)
def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str:
xpath = f".//{self.xmlns}{html_tag}"
if html_class:
xpath += f"[@class='{html_class}']"
return xpath
@classmethod
def replace_unsupported_chars(cls, s):
def replace_unsupported_chars(cls, s: str) -> str:
"""
Given an input string, returns the corresponding string that:
- is available in the helvetica facetype
- does not contain any ligature (to allow easy search in the PDF file)
* is available in the Helvetica facetype
* does not contain any ligature (to allow easy search in the PDF file)
"""
return s.translate(cls.ligatures)
def topdown_position(self, element):
    """
    Return the ``y2`` edge of the element's bounding box, converted to pt
    """
    # Coordinates here are still in the hocr coordinate system, so 0 on the y axis
    # is the top of the page and increasing values of y will move towards the
    # bottom of the page.
    box_in_points = self.pt_from_pixel(self.element_coordinates(element))
    return box_in_points.y2
def to_pdf(
self,
outFileName,
imageFileName=None,
showBoundingboxes=False,
fontname="Helvetica",
invisibleText=False,
interwordSpaces=False,
):
*,
out_filename: Path,
image_filename: Optional[Path] = None,
show_bounding_boxes: bool = False,
fontname: str = "Helvetica",
invisible_text: bool = False,
interword_spaces: bool = False,
) -> None:
"""
Creates a PDF file with an image superimposed on top of the text.
Text is positioned according to the bounding box of the lines in
@@ -169,19 +197,36 @@ class HocrTransform:
The image need not be identical to the image used to create the hOCR
file.
It can have a lower resolution, different color mode, etc.
Arguments:
out_filename: Path of PDF to write.
image_filename: Image to use for this file. If omitted, the OCR text
is shown.
show_bounding_boxes: Show bounding boxes around various text regions,
for debugging.
fontname: Name of font to use.
invisible_text: If True, text is rendered invisible so that it is
selectable but never drawn. If False, text is visible and may
be seen if the image is skipped or deleted in Acrobat.
interword_spaces: If True, insert spaces between words rather than
drawing each word without spaces. Generally this improves text
extraction.
"""
# create the PDF file
# page size in points (1/72 in.)
pdf = Canvas(outFileName, pagesize=(self.width, self.height), pageCompression=1)
pdf = Canvas(
os.fspath(out_filename),
pagesize=(self.width, self.height),
pageCompression=1,
)
# draw bounding box for each paragraph
# light blue for bounding box of paragraph
pdf.setStrokeColorRGB(0, 1, 1)
pdf.setStrokeColor(cyan)
# light blue for bounding box of paragraph
pdf.setFillColorRGB(0, 1, 1)
pdf.setFillColor(cyan)
pdf.setLineWidth(0) # no line for bounding box
for elem in self.hocr.findall(".//%sp[@class='%s']" % (self.xmlns, "ocr_par")):
for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
elemtxt = self._get_element_text(elem).rstrip()
if len(elemtxt) == 0:
continue
@@ -190,14 +235,19 @@ class HocrTransform:
pt = self.pt_from_pixel(pxl_coords)
# draw the bbox border
if showBoundingboxes:
if show_bounding_boxes: # pragma: no cover
pdf.rect(
pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1, fill=1
)
found_lines = False
for line in self.hocr.findall(
".//%sspan[@class='%s']" % (self.xmlns, "ocr_line")
for line in sorted(
chain(
self.hocr.iterfind(self._child_xpath('span', 'ocr_header')),
self.hocr.iterfind(self._child_xpath('span', 'ocr_line')),
self.hocr.iterfind(self._child_xpath('span', 'ocr_textfloat')),
),
key=self.topdown_position,
):
found_lines = True
self._do_line(
@@ -205,45 +255,49 @@ class HocrTransform:
line,
"ocrx_word",
fontname,
invisibleText,
interwordSpaces,
showBoundingboxes,
invisible_text,
interword_spaces,
show_bounding_boxes,
)
if not found_lines:
# Tesseract did not report any lines (just words)
root = self.hocr.find(".//%sdiv[@class='%s']" % (self.xmlns, "ocr_page"))
root = self.hocr.find(self._child_xpath('div', 'ocr_page'))
self._do_line(
pdf,
root,
"ocrx_word",
fontname,
invisibleText,
interwordSpaces,
showBoundingboxes,
invisible_text,
interword_spaces,
show_bounding_boxes,
)
# put the image on the page, scaled to fill the page
if imageFileName is not None:
pdf.drawImage(imageFileName, 0, 0, width=self.width, height=self.height)
if image_filename is not None:
pdf.drawImage(
os.fspath(image_filename), 0, 0, width=self.width, height=self.height
)
# finish up the page and save it
pdf.showPage()
pdf.save()
@classmethod
def polyval(cls, poly, x):
def polyval(cls, poly, x): # pragma: no cover
return x * poly[0] + poly[1]
def _do_line(
self,
pdf,
line,
elemclass,
fontname,
invisibleText,
interwordSpaces,
showBoundingboxes,
pdf: Canvas,
line: Optional[Element],
elemclass: str,
fontname: str,
invisible_text: bool,
interword_spaces: bool,
show_bounding_boxes: bool,
):
if not line:
return
pxl_line_coords = self.element_coordinates(line)
line_box = self.pt_from_pixel(pxl_line_coords)
line_height = line_box.y2 - line_box.y1
@@ -262,17 +316,17 @@ class HocrTransform:
# on a sloped baseline and the edge of the bounding box.
fontsize = (line_height - abs(intercept)) / cos_a
text.setFont(fontname, fontsize)
if invisibleText:
if invisible_text:
text.setTextRenderMode(3) # Invisible (indicates OCR text)
# Intercept is normally negative, so this places it above the bottom
# of the line box
baseline_y2 = self.height - (line_box.y2 + intercept)
if showBoundingboxes:
if show_bounding_boxes: # pragma: no cover
# draw the baseline in magenta, dashed
pdf.setDash()
pdf.setStrokeColorRGB(0.95, 0.65, 0.95)
pdf.setStrokeColor(magenta)
pdf.setLineWidth(0.5)
# negate slope because it is defined as a rise/run in pixel
# coordinates and page coordinates have the y axis flipped
@@ -284,12 +338,12 @@ class HocrTransform:
)
# dashed red outline for bounding box of word/line
pdf.setDash(6, 3)
pdf.setStrokeColorRGB(1, 0, 0)
pdf.setStrokeColor(red)
text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, line_box.x1, baseline_y2)
pdf.setFillColorRGB(0, 0, 0) # text in black
pdf.setFillColor(black) # text in black
elements = line.findall(".//%sspan[@class='%s']" % (self.xmlns, elemclass))
elements = line.findall(self._child_xpath('span', elemclass))
for elem in elements:
elemtxt = self._get_element_text(elem).strip()
elemtxt = self.replace_unsupported_chars(elemtxt)
@@ -298,7 +352,7 @@ class HocrTransform:
pxl_coords = self.element_coordinates(elem)
box = self.pt_from_pixel(pxl_coords)
if interwordSpaces:
if interword_spaces:
# if `--interword-spaces` is true, append a space
# to the end of each text element to allow simpler PDF viewers
# such as PDF.js to better recognize words in search and copy
@@ -318,7 +372,7 @@ class HocrTransform:
font_width = pdf.stringWidth(elemtxt, fontname, fontsize)
# draw the bbox border
if showBoundingboxes:
if show_bounding_boxes: # pragma: no cover
pdf.rect(
box.x1, self.height - line_box.y2, box_width, line_height, fill=0
)
@@ -380,10 +434,10 @@ if __name__ == "__main__":
parser.add_argument('outputfile', help='Path to the PDF file to be generated')
args = parser.parse_args()
hocr = HocrTransform(args.hocrfile, args.resolution)
hocr = HocrTransform(hocr_filename=args.hocrfile, dpi=args.resolution)
hocr.to_pdf(
args.outputfile,
args.image,
args.boundingboxes,
interwordSpaces=args.interword_spaces,
out_filename=args.outputfile,
image_filename=args.image,
show_bounding_boxes=args.boundingboxes,
interword_spaces=args.interword_spaces,
)

View File

@@ -3,20 +3,10 @@
#
# © 2013-16: jbarlow83 from Github (https://github.com/jbarlow83)
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Python FFI wrapper for Leptonica library
@@ -24,30 +14,83 @@ import argparse
import logging
import os
import sys
import warnings
import threading
from collections import deque
from collections.abc import Sequence
from contextlib import suppress
from ctypes.util import find_library
from functools import lru_cache
from io import BytesIO
from io import BytesIO, UnsupportedOperation
from os import fspath
from tempfile import TemporaryFile
from warnings import warn
from .lib._leptonica import ffi
from ocrmypdf.exceptions import MissingDependencyError
from ocrmypdf.lib._leptonica import ffi
# pylint: disable=protected-access
logger = logging.getLogger(__name__)
lept = ffi.dlopen(find_library('lept'))
lept.setMsgSeverity(lept.L_SEVERITY_WARNING)
if os.name == 'nt':
from ocrmypdf.subprocess._windows import shim_env_path
libname = 'liblept-5'
os.environ['PATH'] = shim_env_path()
else:
libname = 'lept'
_libpath = find_library(libname)
if not _libpath:
raise MissingDependencyError(
"""
---------------------------------------------------------------------
This error normally occurs when ocrmypdf can't find the Leptonica
library, which is usually installed with Tesseract OCR. It could be that
Tesseract is not installed properly, we can't find the installation
on your system PATH environment variable.
The library we are looking for is usually called:
liblept-5.dll (Windows)
liblept*.dylib (macOS)
liblept*.so (Linux/BSD)
Please review our installation procedures to find a solution:
https://ocrmypdf.readthedocs.io/en/latest/installation.html
---------------------------------------------------------------------
"""
)
if os.name == 'nt':
# On Windows, recent versions of libpng require zlib. We have to make sure
# the zlib version being loaded is the same one that libpng was built with.
# This tries to import zlib from Tesseract's installation folder, falling back
# to find_library() if liblept is being loaded from somewhere else.
# Loading zlib from other places could cause a version mismatch
_zlib_path = os.path.join(os.path.dirname(_libpath), 'zlib1.dll')
if not os.path.exists(_zlib_path):
_zlib_path = find_library('zlib')
try:
zlib = ffi.dlopen(_zlib_path)
except ffi.error as e:
raise MissingDependencyError(
"""
Could not load the zlib library. It could be that Tesseract is not installed properly,
we can't find the installation on your system PATH environment variable.
"""
) from e
try:
lept = ffi.dlopen(_libpath)
lept.setMsgSeverity(lept.L_SEVERITY_WARNING)
except ffi.error as e:
raise MissingDependencyError(
f"Leptonica library found at {_libpath}, but we could not access it"
) from e
class _LeptonicaErrorTrap:
class _LeptonicaErrorTrap_Redirect:
"""
Context manager to trap errors reported by Leptonica.
Context manager to trap errors reported by Leptonica < 1.79 or on Apple Silicon.
Leptonica's error return codes don't provide much informatino about what
Leptonica's error return codes don't provide much information about what
went wrong. Leptonica does, however, write more detailed errors to stderr
(provided this is not disabled at compile time). The Leptonica source
code is very consistent in its use of macros to generate errors.
@@ -58,20 +101,23 @@ class _LeptonicaErrorTrap:
"""
leptonica_lock = threading.Lock()
def __init__(self):
self.tmpfile = None
self.copy_of_stderr = -1
self.no_stderr = False
def __enter__(self):
from io import UnsupportedOperation
self.tmpfile = TemporaryFile()
# Save the old stderr, and redirect stderr to temporary file
with suppress(AttributeError):
sys.stderr.flush()
self.leptonica_lock.acquire()
try:
# It would make sense to do sys.stderr.flush() here, but that can deadlock
# due to https://bugs.python.org/issue6721. So don't flush. Pretend
# there's nothing important in sys.stderr. If the user cared they would
# be using Leptonica 1.79 or later anyway to avoid this mess.
self.copy_of_stderr = os.dup(sys.stderr.fileno())
os.dup2(self.tmpfile.fileno(), sys.stderr.fileno(), inheritable=False)
except AttributeError:
@@ -83,7 +129,10 @@ class _LeptonicaErrorTrap:
os.dup2(self.tmpfile.fileno(), 2, inheritable=False)
except UnsupportedOperation:
self.copy_of_stderr = None
return
except Exception:
self.leptonica_lock.release()
raise
return self
def __exit__(self, exc_type, exc_value, traceback):
# Restore old stderr
@@ -100,6 +149,8 @@ class _LeptonicaErrorTrap:
self.tmpfile.seek(0) # Cursor will be at end, so move back to beginning
leptonica_output = self.tmpfile.read().decode(errors='replace')
self.tmpfile.close()
self.leptonica_lock.release()
# If there are Python errors, record them
if exc_type:
logger.warning(leptonica_output)
@@ -117,6 +168,70 @@ class _LeptonicaErrorTrap:
return False
tls = threading.local()
tls.trap = None
class _LeptonicaErrorTrap_Queue:
def __init__(self):
self.queue = deque()
def __enter__(self):
self.queue.clear()
tls.trap = self.queue
def __exit__(self, exc_type, exc_value, traceback):
tls.trap = None
output = ''.join(self.queue)
self.queue.clear()
# If there are Python errors, record them
if exc_type:
logger.warning(output)
if 'Error' in output:
if 'image file not found' in output:
raise FileNotFoundError()
elif 'pixWrite: stream not opened' in output:
raise LeptonicaIOError()
elif 'index not valid' in output:
raise IndexError()
elif 'pixGetInvBackgroundMap: w and h must be >= 5' in output:
logger.warning(
"Leptonica attempted to remove background from a low resolution - "
"you may want to review in a PDF viewer"
)
else:
raise LeptonicaError(output)
return False
try:
@ffi.callback("void(char *)")
def _stderr_handler(cstr):
msg = ffi.string(cstr).decode(errors='replace')
if msg.startswith("Error"):
logger.error(msg)
elif msg.startswith("Warning"):
logger.warning(msg)
else:
logger.debug(msg)
if tls.trap is not None:
tls.trap.append(msg)
return
lept.leptSetStderrHandler(_stderr_handler)
except (ffi.error, MemoryError):
# Pre-1.79 Leptonica does not have leptSetStderrHandler
# And some platforms, notably Apple ARM 64, do not allow the write+execute
# memory needed to set up the callback function.
_LeptonicaErrorTrap = _LeptonicaErrorTrap_Redirect
else:
# Leptonica 1.79 and later provide leptSetStderrHandler
_LeptonicaErrorTrap = _LeptonicaErrorTrap_Queue
class LeptonicaError(Exception):
pass
@@ -282,7 +397,7 @@ class Pix(LeptonicaObject):
@classmethod
def read(cls, path):
warnings.warn('Use Pix.open() instead', DeprecationWarning)
warn('Use Pix.open() instead', DeprecationWarning)
return cls.open(path)
@classmethod
@@ -292,9 +407,11 @@ class Pix(LeptonicaObject):
Leptonica can load TIFF, PNM (PBM, PGM, PPM), PNG, and JPEG. If
loading fails then the object will wrap a C null pointer.
"""
filename = fspath(path)
with _LeptonicaErrorTrap():
return cls(lept.pixRead(os.fsencode(filename)))
with open(path, 'rb') as py_file:
data = py_file.read()
buffer = ffi.from_buffer(data)
with _LeptonicaErrorTrap():
return cls(lept.pixReadMem(buffer, len(buffer)))
def write_implied_format(self, path, jpeg_quality=0, jpeg_progressive=0):
"""Write pix to the filename, with the extension indicating format.
@@ -302,14 +419,22 @@ class Pix(LeptonicaObject):
jpeg_quality -- quality (iff JPEG; 1 - 100, 0 for default)
jpeg_progressive -- (iff JPEG; 0 for baseline seq., 1 for progressive)
"""
filename = fspath(path)
with _LeptonicaErrorTrap():
lept.pixWriteImpliedFormat(
os.fsencode(filename), self._cdata, jpeg_quality, jpeg_progressive
)
lept_format = lept.getImpliedFileFormat(os.fsencode(path))
with open(path, 'wb') as py_file:
data = ffi.new('l_uint8 **pdata')
size = ffi.new('size_t *psize')
with _LeptonicaErrorTrap():
if lept_format == lept.L_JPEG_ENCODE:
lept.pixWriteMemJpeg(
data, size, self._cdata, jpeg_quality, jpeg_progressive
)
else:
lept.pixWriteMem(data, size, self._cdata, lept_format)
buffer = ffi.buffer(data[0], size[0])
py_file.write(buffer)
@classmethod
def frompil(self, pillow_image):
def frompil(cls, pillow_image):
"""Create a Pix from a copy of a PIL.Image"""
bio = BytesIO()
pillow_image.save(bio, format='png', compress_level=1)
@@ -321,7 +446,7 @@ class Pix(LeptonicaObject):
def topil(self):
"""Returns a PIL.Image version of this Pix"""
from PIL import Image
from PIL import Image # pylint: disable=import-outside-toplevel
# Leptonica manages data in words, so it implicitly does an endian
# swap. Tell Pillow about this when it reads the data.
@@ -492,27 +617,15 @@ class Pix(LeptonicaObject):
)
return Pix(thresh_pix)
def crop_to_foreground(
self,
threshold=128,
mindist=70,
erasedist=30,
pagenum=0,
showmorph=0,
display=0,
pdfdir=ffi.NULL,
):
def crop_to_foreground(self, threshold=128, mindist=70, erasedist=30, showmorph=0):
if get_leptonica_version() < 'leptonica-1.76':
# Leptonica 1.76 changed the API for pixFindPageForeground; we don't
# support the old version
raise LeptonicaError("Not available in this version of Leptonica")
with _LeptonicaErrorTrap():
cropbox = Box(
lept.pixFindPageForeground(
self._cdata,
threshold,
mindist,
erasedist,
pagenum,
showmorph,
display,
pdfdir,
self._cdata, threshold, mindist, erasedist, showmorph, ffi.NULL
)
)
@@ -549,6 +662,9 @@ class Pix(LeptonicaObject):
bg_val=200,
smooth_kernel=(2, 1),
):
if self.width < tile_size[0] or self.height < tile_size[1]:
logger.info("Skipped pixMaskedThreshOnBackgroundNorm on small image")
return self
# Background norm doesn't work on color mapped Pix, so remove colormap
target_pix = self.remove_colormap(lept.REMOVE_CMAP_BASED_ON_SRC)
with _LeptonicaErrorTrap():
@@ -827,6 +943,8 @@ def get_leptonica_version():
Caveat: Leptonica expects the caller to free this memory. We don't,
since that would involve binding to libc to access libc.free(),
a pointless effort to reclaim 100 bytes of memory.
Reminder that this returns "leptonica-1.xx" or "leptonica-1.yy.0".
"""
return ffi.string(lept.getLeptonicaVersion()).decode()

View File

@@ -1,18 +1,8 @@
# © 2017 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""Bindings to external libraries"""

View File

File diff suppressed because one or more lines are too long

View File

@@ -1,20 +1,12 @@
#!/usr/bin/env python3
# © 2017 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
from pathlib import Path
from cffi import FFI
@@ -74,6 +66,17 @@ struct Pixa
};
typedef struct Pixa PIXA;
/*! Array of compressed pix */
struct PixaComp
{
l_int32 n; /*!< number of PixComp in ptr array */
l_int32 nalloc; /*!< number of PixComp ptrs allocated */
l_int32 offset; /*!< indexing offset into ptr array */
struct PixComp **pixc; /*!< the array of ptrs to PixComp */
struct Boxa *boxa; /*!< array of boxes */
};
typedef struct PixaComp PIXAC;
struct Box
{
l_int32 x;
@@ -210,9 +213,15 @@ ffibuilder.cdef(
"""
PIX * pixRead ( const char *filename );
PIX * pixReadMem ( const l_uint8 *data, size_t size );
PIX * pixReadStream ( FILE *fp, l_int32 hint );
PIX * pixScale ( PIX *pixs, l_float32 scalex, l_float32 scaley );
l_int32 pixFindSkew ( PIX *pixs, l_float32 *pangle, l_float32 *pconf );
l_int32 pixWriteImpliedFormat ( const char *filename, PIX *pix, l_int32 quality, l_int32 progressive );
l_int32 getImpliedFileFormat ( const char *filename );
l_ok pixWriteStream ( FILE *fp, PIX *pix, l_int32 format );
l_ok pixWriteStreamJpeg ( FILE *fp, PIX *pixs, l_int32 quality, l_int32 progressive );
l_ok pixWriteMem ( l_uint8 **pdata, size_t *psize, PIX *pix, l_int32 format );
l_ok pixWriteMemJpeg ( l_uint8 **pdata, size_t *psize, PIX *pix, l_int32 quality, l_int32 progressive );
l_int32
pixWriteMemPng(l_uint8 **pdata,
size_t *psize,
@@ -294,14 +303,12 @@ pixCleanBackgroundToWhite(PIX *pixs,
l_int32 whiteval);
BOX *
pixFindPageForeground(PIX *pixs,
l_int32 threshold,
l_int32 mindist,
l_int32 erasedist,
l_int32 pagenum,
l_int32 showmorph,
l_int32 display,
const char *pdfdir);
pixFindPageForeground ( PIX *pixs,
l_int32 threshold,
l_int32 mindist,
l_int32 erasedist,
l_int32 showmorph,
PIXAC *pixac );
PIX *
pixClipRectangle(PIX *pixs,
@@ -414,7 +421,10 @@ pixExtractBarcodes(PIX *pixs,
l_int32 debugflag);
BOXA *
pixLocateBarcodes ( PIX *pixs, l_int32 thresh, PIX **ppixb, PIX **ppixm );
pixLocateBarcodes ( PIX *pixs,
l_int32 thresh,
PIX **ppixb,
PIX **ppixm );
SARRAY *
pixReadBarcodes(PIXA *pixa,
@@ -423,6 +433,12 @@ pixReadBarcodes(PIXA *pixa,
SARRAY **psaw,
l_int32 debugflag);
PIX *
pixGenHalftoneMask(PIX *pixs,
PIX **ppixtext,
l_int32 *phtfound,
PIXA *pixadb);
l_int32
l_generateCIDataForPdf(const char *fname,
PIX *pix,
@@ -483,6 +499,8 @@ void selDestroy ( SEL **psel );
l_int32
setMsgSeverity(l_int32 newsev);
void
leptSetStderrHandler(void (*handler)(const char *));
"""
)
@@ -491,3 +509,8 @@ ffibuilder.set_source("ocrmypdf.lib._leptonica", None)
if __name__ == '__main__':
ffibuilder.compile(verbose=True)
if Path('ocrmypdf/lib/_leptonica.py').exists() and Path('src/ocrmypdf').exists():
output = Path('ocrmypdf/lib/_leptonica.py')
output.rename('src/ocrmypdf/lib/_leptonica.py')
Path('ocrmypdf/lib').rmdir()
Path('ocrmypdf').rmdir()

Some files were not shown because too many files have changed in this diff Show More