mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-02-09 05:43:10 -05:00
Merge branch 'jbarlow83:master' into master
This commit is contained in:
16
.coveragerc
16
.coveragerc
@@ -1,16 +0,0 @@
|
||||
# Coverage isn't really compatible with subprocesses so results are unreliable
|
||||
|
||||
[run]
|
||||
branch = True
|
||||
#concurrency = multiprocessing
|
||||
source = ocrmypdf/
|
||||
|
||||
[report]
|
||||
exclude_lines =
|
||||
pragma: no cover
|
||||
def __repr__
|
||||
raise AssertionError
|
||||
raise NotImplementedError
|
||||
if 0:
|
||||
if False:
|
||||
if __name__ == .__main__.:
|
||||
@@ -1,16 +1,62 @@
|
||||
# OCRmyPDF
|
||||
#
|
||||
FROM ubuntu:18.04
|
||||
FROM ubuntu:20.04 as base
|
||||
|
||||
FROM base as builder
|
||||
|
||||
ENV LANG=C.UTF-8
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential autoconf automake libtool \
|
||||
libleptonica-dev \
|
||||
zlib1g-dev \
|
||||
libexempi3 \
|
||||
ocrmypdf \
|
||||
python3-dev \
|
||||
python3-distutils \
|
||||
libffi-dev \
|
||||
libqpdf-dev \
|
||||
ca-certificates \
|
||||
curl \
|
||||
git
|
||||
|
||||
# Get the latest pip (Ubuntu version doesn't support manylinux2010)
|
||||
RUN \
|
||||
curl https://bootstrap.pypa.io/get-pip.py | python3
|
||||
|
||||
# Compile and install jbig2
|
||||
# Needs libleptonica-dev, zlib1g-dev
|
||||
RUN \
|
||||
mkdir jbig2 \
|
||||
&& curl -L https://github.com/agl/jbig2enc/archive/ea6a40a.tar.gz | \
|
||||
tar xz -C jbig2 --strip-components=1 \
|
||||
&& cd jbig2 \
|
||||
&& ./autogen.sh && ./configure && make && make install \
|
||||
&& cd .. \
|
||||
&& rm -rf jbig2
|
||||
|
||||
COPY . /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN pip3 install --no-cache-dir \
|
||||
-r requirements/main.txt \
|
||||
-r requirements/webservice.txt \
|
||||
-r requirements/test.txt \
|
||||
-r requirements/watcher.txt \
|
||||
.
|
||||
|
||||
FROM base
|
||||
|
||||
ENV LANG=C.UTF-8
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ghostscript \
|
||||
img2pdf \
|
||||
liblept5 \
|
||||
libsm6 libxext6 libxrender-dev \
|
||||
zlib1g \
|
||||
pngquant \
|
||||
python3-pip \
|
||||
python3-venv \
|
||||
python3 \
|
||||
qpdf \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-chi-sim \
|
||||
tesseract-ocr-deu \
|
||||
@@ -19,54 +65,19 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
tesseract-ocr-por \
|
||||
tesseract-ocr-spa \
|
||||
unpaper \
|
||||
wget
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENV LANG=C.UTF-8
|
||||
COPY --from=builder /usr/local/lib/ /usr/local/lib/
|
||||
COPY --from=builder /usr/local/bin/ /usr/local/bin/
|
||||
|
||||
# Compile and install jbig2
|
||||
# Needs libleptonica-dev, zlib1g-dev
|
||||
RUN \
|
||||
mkdir jbig2 \
|
||||
&& wget -q https://github.com/agl/jbig2enc/archive/0.29.tar.gz -O - | \
|
||||
tar xz -C jbig2 --strip-components=1 \
|
||||
&& cd jbig2 \
|
||||
&& ./autogen.sh && ./configure && make && make install \
|
||||
&& cd .. \
|
||||
&& rm -rf jbig2
|
||||
COPY --from=builder /app/misc/webservice.py /app/
|
||||
COPY --from=builder /app/misc/watcher.py /app/
|
||||
|
||||
RUN apt-get remove -y autoconf automake libtool
|
||||
# Copy minimal project files to get the test suite.
|
||||
COPY --from=builder /app/setup.cfg /app/setup.py /app/README.md /app/
|
||||
COPY --from=builder /app/requirements /app/requirements
|
||||
COPY --from=builder /app/tests /app/tests
|
||||
|
||||
RUN python3 -m venv --system-site-packages /appenv
|
||||
|
||||
# This installs the latest binary wheel instead of the code in the current
|
||||
# folder. Installing from source will fail, apparently because cffi needs
|
||||
# build-essentials (gcc) to do a source installation
|
||||
# (i.e. "pip install ."). It's unclear to me why this is the case.
|
||||
RUN . /appenv/bin/activate; \
|
||||
pip install --upgrade pip \
|
||||
&& pip install --upgrade ocrmypdf
|
||||
|
||||
# Now copy the application in, mainly to get the test suite.
|
||||
# Do this now to make the best use of Docker cache.
|
||||
COPY . /application
|
||||
RUN . /appenv/bin/activate; \
|
||||
pip install -r /application/requirements/test.txt
|
||||
|
||||
# Remove the junk, including the source version of application since it was
|
||||
# already installed
|
||||
RUN rm -rf /tmp/* /var/tmp/* /root/* /application/ocrmypdf \
|
||||
&& apt-get remove -y build-essential \
|
||||
&& apt-get autoremove -y \
|
||||
&& apt-get autoclean -y
|
||||
|
||||
RUN useradd docker \
|
||||
&& mkdir /home/docker \
|
||||
&& chown docker:docker /home/docker
|
||||
|
||||
USER docker
|
||||
WORKDIR /home/docker
|
||||
|
||||
# Must use array form of ENTRYPOINT
|
||||
# Non-array form does not append other arguments, because that is "intuitive"
|
||||
ENTRYPOINT ["/application/.docker/docker-wrapper.sh"]
|
||||
ENTRYPOINT ["/usr/local/bin/ocrmypdf"]
|
||||
|
||||
@@ -1,84 +0,0 @@
|
||||
FROM alpine:3.9 as base
|
||||
|
||||
FROM base as builder
|
||||
|
||||
ENV LANG=C.UTF-8
|
||||
|
||||
RUN \
|
||||
echo '@testing http://nl.alpinelinux.org/alpine/edge/testing' >> /etc/apk/repositories \
|
||||
# Add runtime dependencies
|
||||
&& apk add --update \
|
||||
python3-dev \
|
||||
py3-setuptools \
|
||||
jbig2enc@testing \
|
||||
ghostscript \
|
||||
qpdf \
|
||||
tesseract-ocr \
|
||||
unpaper \
|
||||
pngquant \
|
||||
libxml2-dev \
|
||||
libxslt-dev \
|
||||
zlib-dev \
|
||||
qpdf-dev \
|
||||
libffi-dev \
|
||||
leptonica-dev \
|
||||
binutils \
|
||||
# Install pybind11 for pikepdf
|
||||
&& pip3 install pybind11 \
|
||||
# Install flask for the webservice
|
||||
&& pip3 install flask \
|
||||
# Add build dependencies
|
||||
&& apk add --virtual build-dependencies \
|
||||
build-base \
|
||||
git
|
||||
|
||||
COPY . /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN pip3 install .
|
||||
|
||||
FROM base
|
||||
|
||||
ENV LANG=C.UTF-8
|
||||
|
||||
RUN \
|
||||
echo '@testing http://nl.alpinelinux.org/alpine/edge/testing' >> /etc/apk/repositories \
|
||||
# Add runtime dependencies
|
||||
&& apk add --update \
|
||||
python3 \
|
||||
jbig2enc@testing \
|
||||
ghostscript \
|
||||
qpdf \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-data-deu \
|
||||
tesseract-ocr-data-chi_sim \
|
||||
unpaper \
|
||||
pngquant \
|
||||
libxml2 \
|
||||
libxslt \
|
||||
zlib \
|
||||
qpdf \
|
||||
libffi \
|
||||
leptonica-dev \
|
||||
binutils \
|
||||
&& mkdir /app
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Copy build artifacts (python site-packages)
|
||||
COPY --from=builder /usr/lib/python3.6/site-packages /usr/lib/python3.6/site-packages
|
||||
COPY --from=builder /usr/bin/ocrmypdf /usr/bin/dumppdf.py /usr/bin/latin2ascii.py /usr/bin/pdf2txt.py /usr/bin/img2pdf /usr/bin/chardetect /usr/bin/
|
||||
|
||||
# Copy
|
||||
COPY --from=builder /app/.docker/webservice.py /app/
|
||||
|
||||
# Copy minimal project files to get the test suite.
|
||||
COPY --from=builder /app/setup.cfg /app/setup.py /app/README.md /app/
|
||||
COPY --from=builder /app/requirements /app/requirements
|
||||
COPY --from=builder /app/tests /app/tests
|
||||
COPY --from=builder /app/src /app/src
|
||||
# Copy PKG-INFO from build artifact in app dir to make setuptools-scm happy
|
||||
RUN cp /usr/lib/python3.6/site-packages/ocrmypdf-*.egg-info/PKG-INFO /app
|
||||
|
||||
ENTRYPOINT ["/usr/bin/ocrmypdf"]
|
||||
@@ -1,5 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
. /appenv/bin/activate
|
||||
cd /home/docker
|
||||
exec ocrmypdf "$@"
|
||||
@@ -1,17 +0,0 @@
|
||||
# OCRmyPDF polyglot
|
||||
#
|
||||
FROM jbarlow83/ocrmypdf:latest
|
||||
|
||||
USER root
|
||||
|
||||
# Update system and install our dependencies
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
tesseract-ocr-all
|
||||
|
||||
RUN apt-get autoremove -y && apt-get clean -y
|
||||
|
||||
USER docker
|
||||
|
||||
# Must use array form of ENTRYPOINT
|
||||
# Non-array form does not append other arguments, because that is "intuitive"
|
||||
ENTRYPOINT ["/application/.docker/docker-wrapper.sh"]
|
||||
@@ -1,24 +0,0 @@
|
||||
# OCRmyPDF webservice
|
||||
#
|
||||
FROM jbarlow83/ocrmypdf-polyglot:latest
|
||||
|
||||
USER root
|
||||
|
||||
# Update system and install our dependencies
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3-flask
|
||||
|
||||
RUN apt-get autoremove -y && apt-get clean -y
|
||||
|
||||
EXPOSE 5000
|
||||
|
||||
COPY .docker/webservice.py /application
|
||||
|
||||
USER docker
|
||||
|
||||
VOLUME ["/config"]
|
||||
|
||||
# This config file is optional
|
||||
ENV OCRMYPDF_WEBSERVICE_SETTINGS "/config/config.py"
|
||||
|
||||
ENTRYPOINT ["python3", "/application/webservice.py"]
|
||||
@@ -1,26 +1,44 @@
|
||||
# dotfiles
|
||||
.*
|
||||
!.coveragerc
|
||||
!.dockerignore
|
||||
!.git_archival.txt
|
||||
!.gitattributes
|
||||
!.gitignore
|
||||
!.pre-commit-config.yaml
|
||||
!.readthedocs.yml
|
||||
|
||||
# Dev scratch
|
||||
*.ipynb
|
||||
*.pdf
|
||||
*.pyc
|
||||
*.rst
|
||||
*.sublime*
|
||||
**/*.pyc
|
||||
.*/
|
||||
!.git/
|
||||
!.docker/
|
||||
.ruffus_history.sqlite
|
||||
bin/
|
||||
build/
|
||||
docs/
|
||||
dist/
|
||||
htmlcov/
|
||||
include/
|
||||
lib/
|
||||
MANIFEST.in
|
||||
ocrmypdf.egg-info/
|
||||
staging/
|
||||
tests/cache/
|
||||
tests/output/
|
||||
/*.pdf
|
||||
/*.qdf
|
||||
/*.png
|
||||
/scratch.py
|
||||
IDEAS
|
||||
log/
|
||||
tests/resources/private/
|
||||
tmp/
|
||||
venv*/
|
||||
/debug_tests.py
|
||||
*.traineddata
|
||||
/private
|
||||
|
||||
# Package building
|
||||
*.egg-info/
|
||||
build/
|
||||
dist/
|
||||
wheelhouse/
|
||||
pip-wheel-metadata/
|
||||
|
||||
# Code coverage
|
||||
htmlcov/
|
||||
|
||||
# Docker specific
|
||||
bin/
|
||||
docs/
|
||||
include/
|
||||
lib/
|
||||
|
||||
# Docker include .git/
|
||||
!.git/
|
||||
|
||||
1
.gitattributes
vendored
1
.gitattributes
vendored
@@ -9,5 +9,6 @@
|
||||
*.png binary
|
||||
*.jpg binary
|
||||
*.bin binary
|
||||
*.afdesign binary
|
||||
|
||||
.git_archival.txt export-subst
|
||||
|
||||
12
.github/FUNDING.yml
vendored
Normal file
12
.github/FUNDING.yml
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
# These are supported funding model platforms
|
||||
|
||||
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
|
||||
patreon: # Replace with a single Patreon username
|
||||
open_collective: james-barlow
|
||||
ko_fi: # Replace with a single Ko-fi username
|
||||
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
|
||||
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
|
||||
liberapay: # Replace with a single Liberapay username
|
||||
issuehunt: # Replace with a single IssueHunt username
|
||||
otechie: # Replace with a single Otechie username
|
||||
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
|
||||
32
.github/ISSUE_TEMPLATE/1-general-issues.md
vendored
Normal file
32
.github/ISSUE_TEMPLATE/1-general-issues.md
vendored
Normal file
@@ -0,0 +1,32 @@
|
||||
---
|
||||
name: General issues
|
||||
about: Installation, packages, dependencies, "nothing works", test suite failures...
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Describe the bug**
|
||||
What's the problem?
|
||||
|
||||
**To Reproduce**
|
||||
Steps to reproduce the behavior.
|
||||
|
||||
**Expected behavior**
|
||||
What did you expect to happen?
|
||||
|
||||
**Screenshots**
|
||||
If applicable, add screenshots to help explain your problem.
|
||||
|
||||
**System (please complete the following information):**
|
||||
- OS:
|
||||
- Python version:
|
||||
- OCRmyPDF version:
|
||||
|
||||
**Installation**
|
||||
How did you install OCRmyPDF? Did you install it from your operating system's
|
||||
package manager, or using pip?
|
||||
|
||||
**Additional context**
|
||||
Add any other context about the problem here.
|
||||
40
.github/ISSUE_TEMPLATE/2-problem-with-a-specific-input-file.md
vendored
Normal file
40
.github/ISSUE_TEMPLATE/2-problem-with-a-specific-input-file.md
vendored
Normal file
@@ -0,0 +1,40 @@
|
||||
---
|
||||
name: Problem with a specific input file
|
||||
about: Something went wrong while trying to OCR a specific file
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Describe the bug**
|
||||
A clear and concise description of what the bug is.
|
||||
|
||||
**To Reproduce**
|
||||
What command line or API call were you trying to run?
|
||||
|
||||
```bash
|
||||
ocrmypdf ...arguments... input.pdf output.pdf
|
||||
```
|
||||
|
||||
Run with verbosity `-v1` or higher to see more detailed logging. This information may be helpful.
|
||||
|
||||
**Example file**
|
||||
If your issue is a problem that affects only certain files, we will require an input file (PDF or image) that demonstrates your issue.
|
||||
|
||||
Please provide an input file with no personal or confidential information. At your option you may [GPG-encrypt the file](https://github.com/jbarlow83/OCRmyPDF/wiki) for OCRmyPDF's author only.
|
||||
|
||||
Links to files hosted elsewhere are perfectly acceptable. You could also look in ``tests/resources`` and see if any of those files reproduce your issue.
|
||||
|
||||
*(Issues without example files usually cannot be resolved. It's like reporting an issue against a web browser without providing a URL.)*
|
||||
|
||||
**Expected behavior**
|
||||
A clear and concise description of what you expected to happen.
|
||||
|
||||
**Screenshots**
|
||||
If applicable, add screenshots to help explain your problem.
|
||||
|
||||
**System**
|
||||
- OS: [e.g. Linux, Windows, macOS]
|
||||
- OCRmyPDF Version: ``ocrmypdf --version``
|
||||
- How did you install ocrmypdf? Did you use a system package manager, `pip`, or a Docker image?
|
||||
20
.github/ISSUE_TEMPLATE/3-feature_request.md
vendored
Normal file
20
.github/ISSUE_TEMPLATE/3-feature_request.md
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
---
|
||||
name: Feature request
|
||||
about: Suggest an idea for this project
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Is your feature request related to a problem? Please describe.**
|
||||
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
|
||||
|
||||
**Describe the solution you'd like**
|
||||
A clear and concise description of what you want to happen.
|
||||
|
||||
**Describe alternatives you've considered**
|
||||
A clear and concise description of any alternative solutions or features you've considered.
|
||||
|
||||
**Additional context**
|
||||
Add any other context or screenshots about the feature request here.
|
||||
33
.github/issue_template.md
vendored
33
.github/issue_template.md
vendored
@@ -1,33 +0,0 @@
|
||||
**Describe the issue**
|
||||
A clear and concise description of what the issue is.
|
||||
|
||||
**To Reproduce**
|
||||
What command line were you trying to run?
|
||||
|
||||
```bash
|
||||
ocrmypdf ...arguments... input.pdf output.pdf
|
||||
```
|
||||
|
||||
**Example file**
|
||||
Please include an example *input* PDF (or image). The input file is more helpful.
|
||||
|
||||
Please check any or all that apply about the test file:
|
||||
|
||||
- [ ] This is the input file
|
||||
- [ ] The file contains no personal or confidential information
|
||||
- [ ] I am the copyright holder for this file
|
||||
- [ ] I permit this file to be included in the OCRmyPDF test suite under the CC-BY-SA 4.0 license
|
||||
- [ ] I am not the copyright holder, but this file is available under a free software license
|
||||
|
||||
Files that are not free for inclusion in this project are quite welcome, but we like to collect free files for our test suite when possible. Please do *not* submit files with confidential information. At your option you may encrypt files for OCRmyPDF's author only.
|
||||
|
||||
**Expected behavior**
|
||||
A clear and concise description of what you expected to happen. Include screenshots if applicable.
|
||||
|
||||
**System:**
|
||||
|
||||
- OS: [e.g. Linux, macOS]
|
||||
- OCRmyPDF Version: [e.g. v7.4.0]
|
||||
|
||||
**Additional context**
|
||||
Add any other context about the problem here.
|
||||
275
.github/workflows/build.yml
vendored
Normal file
275
.github/workflows/build.yml
vendored
Normal file
@@ -0,0 +1,275 @@
|
||||
name: Test and deploy
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- ci
|
||||
- release/*
|
||||
tags:
|
||||
- v*
|
||||
paths-ignore:
|
||||
- README*
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
test_linux:
|
||||
name: Test ${{ matrix.os }} with Python ${{ matrix.python }}
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-18.04] #, ubuntu-20.04]
|
||||
python: ["3.6"] #, "3.7", "3.8", "3.9"]
|
||||
|
||||
env:
|
||||
OS: ${{ matrix.os }}
|
||||
PYTHON: ${{ matrix.python }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
|
||||
|
||||
- uses: actions/setup-python@v2
|
||||
name: Install Python
|
||||
with:
|
||||
python-version: ${{ matrix.python }}
|
||||
|
||||
- name: Install common packages
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
curl \
|
||||
ghostscript \
|
||||
img2pdf \
|
||||
libffi-dev \
|
||||
liblept5 \
|
||||
libsm6 libxext6 libxrender-dev \
|
||||
pngquant \
|
||||
poppler-utils \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-deu \
|
||||
tesseract-ocr-eng \
|
||||
unpaper \
|
||||
zlib1g
|
||||
|
||||
- name: Install Ubuntu 18.04 packages
|
||||
if: matrix.os == 'ubuntu-18.04'
|
||||
run: |
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
libexempi3
|
||||
|
||||
- name: Install Ubuntu 20.04 packages
|
||||
if: matrix.os == 'ubuntu-20.04'
|
||||
run: |
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
libexempi8
|
||||
|
||||
- name: Install Python packages
|
||||
run: |
|
||||
python -m pip install -r requirements/main.txt -r requirements/test.txt .
|
||||
|
||||
- name: Report versions
|
||||
run: |
|
||||
tesseract --version
|
||||
gs --version
|
||||
pngquant --version
|
||||
unpaper --version
|
||||
img2pdf --version
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
python -m pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v1
|
||||
with:
|
||||
files: ./coverage.xml
|
||||
env_vars: OS,PYTHON
|
||||
|
||||
test_macos:
|
||||
name: Test macOS
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [macos-latest]
|
||||
python: ["3.9"]
|
||||
|
||||
env:
|
||||
OS: ${{ matrix.os }}
|
||||
PYTHON: ${{ matrix.python }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
|
||||
|
||||
- uses: actions/setup-python@v2
|
||||
name: Install Python
|
||||
with:
|
||||
python-version: ${{ matrix.python }}
|
||||
|
||||
- name: Install Homebrew deps
|
||||
run: |
|
||||
brew update
|
||||
brew install \
|
||||
exempi \
|
||||
ghostscript \
|
||||
jbig2enc \
|
||||
leptonica \
|
||||
openjpeg \
|
||||
pngquant \
|
||||
tesseract
|
||||
|
||||
- name: Install Python packages
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install -r requirements/main.txt -r requirements/test.txt .
|
||||
|
||||
- name: Report versions
|
||||
run: |
|
||||
tesseract --version
|
||||
gs --version
|
||||
pngquant --version
|
||||
img2pdf --version
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
python -m pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v1
|
||||
with:
|
||||
files: ./coverage.xml
|
||||
env_vars: OS,PYTHON
|
||||
|
||||
test_windows:
|
||||
name: Test Windows
|
||||
runs-on: ${{ matrix.os }}
|
||||
strategy:
|
||||
matrix:
|
||||
os: [windows-latest]
|
||||
python: ["3.9"]
|
||||
|
||||
env:
|
||||
OS: ${{ matrix.os }}
|
||||
PYTHON: ${{ matrix.python }}
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
|
||||
|
||||
- uses: actions/setup-python@v2
|
||||
name: Install Python
|
||||
with:
|
||||
python-version: ${{ matrix.python }}
|
||||
|
||||
- name: Install system packages
|
||||
run: |
|
||||
choco install --yes --no-progress --pre tesseract
|
||||
choco install --yes --no-progress ghostscript
|
||||
choco install --yes --no-progress pngquant
|
||||
|
||||
- name: Install Python packages
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install -r requirements/main.txt -r requirements/test.txt .
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
python -m pytest --cov-report xml --cov=ocrmypdf --cov=tests/ -n0 tests/
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v1
|
||||
with:
|
||||
files: ./coverage.xml
|
||||
env_vars: OS,PYTHON
|
||||
|
||||
wheel_sdist_linux:
|
||||
name: Build sdist and wheels
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
|
||||
|
||||
- uses: actions/setup-python@v2
|
||||
name: Install Python
|
||||
with:
|
||||
python-version: "3.6"
|
||||
|
||||
- name: Make wheels and sdist
|
||||
run: |
|
||||
python -m pip install --upgrade pip wheel
|
||||
python setup.py sdist
|
||||
python setup.py bdist_wheel
|
||||
|
||||
- uses: actions/upload-artifact@v2
|
||||
with:
|
||||
path: |
|
||||
./dist/*.whl
|
||||
./dist/*.tar.gz
|
||||
|
||||
upload_pypi:
|
||||
name: Deploy artifacts to PyPI
|
||||
needs: [wheel_sdist_linux, test_linux, test_macos, test_windows]
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v')
|
||||
steps:
|
||||
- uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: artifact
|
||||
path: dist
|
||||
|
||||
- uses: pypa/gh-action-pypi-publish@master
|
||||
with:
|
||||
user: __token__
|
||||
password: ${{ secrets.TOKEN_PYPI }}
|
||||
# repository_url: https://test.pypi.org/legacy/
|
||||
|
||||
docker:
|
||||
name: Build Docker images
|
||||
needs: [wheel_sdist_linux, test_linux, test_macos, test_windows]
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Set image tag to release or branch
|
||||
run: echo "DOCKER_IMAGE_TAG=${GITHUB_REF##*/}" >> $GITHUB_ENV
|
||||
|
||||
- name: If master, set to latest
|
||||
run: echo 'DOCKER_IMAGE_TAG=latest' >> $GITHUB_ENV
|
||||
if: env.DOCKER_IMAGE_TAG == 'master'
|
||||
|
||||
- name: Set Docker Hub repository to username
|
||||
run: echo "DOCKER_REPOSITORY=jbarlow83" >> $GITHUB_ENV
|
||||
|
||||
- name: Set image name
|
||||
run: echo "DOCKER_IMAGE_NAME=ocrmypdf" >> $GITHUB_ENV
|
||||
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: "0" # 0=all, needed for setuptools-scm to resolve version tags
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
username: jbarlow83
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v1
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
id: buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
|
||||
- name: Print image tag
|
||||
run: echo "Building image ${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}"
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
docker buildx build \
|
||||
--push \
|
||||
--platform linux/arm64/v8,linux/amd64 \
|
||||
--tag "${DOCKER_REPOSITORY}/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_TAG}" \
|
||||
--file .docker/Dockerfile .
|
||||
60
.gitignore
vendored
60
.gitignore
vendored
@@ -1,44 +1,44 @@
|
||||
# Development environment
|
||||
.bash_history
|
||||
.pylintrc
|
||||
.pytest_cache/
|
||||
.ruffus_history.sqlite
|
||||
.venv/
|
||||
*.pyc
|
||||
*.sublime-*
|
||||
# dotfiles
|
||||
.*
|
||||
!.coveragerc
|
||||
!.dockerignore
|
||||
!.git_archival.txt
|
||||
!.gitattributes
|
||||
!.gitignore
|
||||
!.pre-commit-config.yaml
|
||||
!.readthedocs.yml
|
||||
!.github/
|
||||
|
||||
# Dev scratch
|
||||
*.ipynb
|
||||
**/*.pyc
|
||||
/*.pdf
|
||||
/*.qdf
|
||||
/*.png
|
||||
/scratch.py
|
||||
IDEAS
|
||||
log/
|
||||
tests/resources/private/
|
||||
tmp/
|
||||
venv*/
|
||||
/debug_tests.py
|
||||
*.traineddata
|
||||
/private
|
||||
/coverage.xml
|
||||
|
||||
# Package building
|
||||
.eggs/
|
||||
*.egg-info/
|
||||
build/
|
||||
dist/
|
||||
wheelhouse/
|
||||
pip-wheel-metadata/
|
||||
|
||||
# Code coverage
|
||||
htmlcov/
|
||||
|
||||
# Automatically generated files
|
||||
docs/_build/
|
||||
docs/_static/
|
||||
docs/_templates/
|
||||
docs/Makefile
|
||||
ocrmypdf/lib/_*.py
|
||||
|
||||
# Code coverage
|
||||
.coverage*
|
||||
htmlcov/
|
||||
|
||||
# Testing
|
||||
.ipynb_checkpoints/
|
||||
.vscode/
|
||||
*.ipynb
|
||||
*.profile
|
||||
/*.pdf
|
||||
/*.qdf
|
||||
/*.png
|
||||
/scratch.py
|
||||
IDEAS
|
||||
log/
|
||||
tests/output/
|
||||
tests/resources/private/
|
||||
tmp/
|
||||
/debug_tests.py
|
||||
*.traineddata
|
||||
|
||||
@@ -1,6 +1,23 @@
|
||||
repos:
|
||||
- repo: https://github.com/ambv/black
|
||||
rev: stable
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v3.4.0
|
||||
hooks:
|
||||
- id: black
|
||||
language_version: python3.7
|
||||
- id: check-case-conflict
|
||||
- id: check-merge-conflict
|
||||
- id: check-toml
|
||||
- id: check-yaml
|
||||
- id: debug-statements
|
||||
- repo: https://github.com/asottile/seed-isort-config
|
||||
rev: v2.2.0
|
||||
hooks:
|
||||
- id: seed-isort-config
|
||||
- repo: https://github.com/pre-commit/mirrors-isort
|
||||
rev: v5.7.0 # pick the isort version you'd like to use from https://github.com/pre-commit/mirrors-isort/releases
|
||||
hooks:
|
||||
- id: isort
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 20.8b1
|
||||
hooks:
|
||||
- id: black
|
||||
language_version: python
|
||||
exclude: ^src/ocrmypdf/lib/_leptonica.py
|
||||
|
||||
146
.travis.yml
146
.travis.yml
@@ -1,146 +0,0 @@
|
||||
cache:
|
||||
pip: true
|
||||
directories:
|
||||
- $HOME/Library/Caches/Homebrew
|
||||
|
||||
matrix:
|
||||
include:
|
||||
- os: linux
|
||||
dist: trusty
|
||||
sudo: required
|
||||
language: python
|
||||
python: "3.6"
|
||||
env:
|
||||
- DIST=trusty
|
||||
addons: &trusty_apt
|
||||
apt:
|
||||
update: true
|
||||
sources:
|
||||
- sourceline: 'ppa:alex-p/tesseract-ocr'
|
||||
- sourceline: 'ppa:heyarje/libav-11'
|
||||
- sourceline: 'ppa:vshn/ghostscript'
|
||||
packages:
|
||||
- ghostscript
|
||||
- libavcodec56
|
||||
- libavformat56
|
||||
- libavutil54
|
||||
- libexempi3
|
||||
- libffi-dev
|
||||
- pngquant
|
||||
- poppler-utils
|
||||
- qpdf
|
||||
- tesseract-ocr
|
||||
- tesseract-ocr-deu
|
||||
- tesseract-ocr-eng
|
||||
- tesseract-ocr-fra
|
||||
- os: linux
|
||||
dist: xenial
|
||||
sudo: required
|
||||
language: python
|
||||
python: "3.7"
|
||||
env:
|
||||
- DIST=xenial
|
||||
addons:
|
||||
apt:
|
||||
update: true
|
||||
sources:
|
||||
- sourceline: 'ppa:alex-p/tesseract-ocr'
|
||||
packages:
|
||||
- ghostscript
|
||||
- libexempi3
|
||||
- libffi-dev
|
||||
- pngquant
|
||||
- poppler-utils
|
||||
- qpdf
|
||||
- tesseract-ocr
|
||||
- tesseract-ocr-deu
|
||||
- tesseract-ocr-eng
|
||||
- tesseract-ocr-fra
|
||||
- unpaper
|
||||
- os: osx
|
||||
osx_image: xcode9.2
|
||||
language: generic
|
||||
addons:
|
||||
homebrew:
|
||||
update: true
|
||||
packages:
|
||||
- exempi
|
||||
- ghostscript
|
||||
- jbig2enc
|
||||
- leptonica
|
||||
- openjpeg
|
||||
- pngquant
|
||||
- python
|
||||
- qpdf
|
||||
- tesseract
|
||||
- unpaper
|
||||
- os: osx
|
||||
osx_image: xcode9.2
|
||||
language: generic
|
||||
env:
|
||||
- ADD_PDFMINER=1
|
||||
addons:
|
||||
homebrew:
|
||||
update: true
|
||||
packages:
|
||||
- exempi
|
||||
- ghostscript
|
||||
- jbig2enc
|
||||
- leptonica
|
||||
- openjpeg
|
||||
- pngquant
|
||||
- python
|
||||
- qpdf
|
||||
- tesseract
|
||||
- unpaper
|
||||
|
||||
before_cache:
|
||||
- rm -f $HOME/.cache/pip/log/debug.log
|
||||
|
||||
before_install: |
|
||||
mkdir -p bin
|
||||
if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
|
||||
pip3 install --upgrade pip
|
||||
pip3 install --upgrade wheel
|
||||
if [[ "$DIST" == "trusty" ]]; then
|
||||
mkdir -p packages
|
||||
wget -q 'https://www.dropbox.com/s/vaq0kbwi6e6au80/unpaper_6.1-1.deb?raw=1' -O packages/unpaper_6.1-1.deb
|
||||
sudo dpkg -i packages/unpaper_6.1-1.deb
|
||||
fi
|
||||
elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
|
||||
pip3 install --upgrade pip
|
||||
pip3 install wheel
|
||||
fi
|
||||
|
||||
install:
|
||||
- export PATH=$PWD/bin:$PATH
|
||||
- pip3 install pycparser # py3.7 workaround for https://github.com/eliben/pycparser/issues/251
|
||||
- pip3 install -r requirements/main.txt
|
||||
- pip3 install --no-deps .
|
||||
- |
|
||||
if [[ "$ADD_PDFMINER" == "1" ]]; then
|
||||
pip3 install --no-deps .[pdfminer]
|
||||
fi
|
||||
- pip3 install -r requirements/test.txt
|
||||
|
||||
script:
|
||||
- tesseract --version
|
||||
- qpdf --version
|
||||
- pytest -n auto
|
||||
|
||||
deploy:
|
||||
# release for main pypi
|
||||
# 3.6 is considered the build leader and does the deploy, otherwise there is
|
||||
# a race and all versions will try to deploy
|
||||
# OTOH if we ever need separate binary wheels then each version needs its
|
||||
# own deploy
|
||||
- provider: pypi
|
||||
user: ocrmypdf-travis
|
||||
password:
|
||||
secure: "DTFOmmNL6olA0+yXvp4u9jXZlZeqrJsJ0526jzqf4a3gZ6jnGTq5UI6WzRsslSyoMMfXKtHQebqHM6ogSgCZinyZ3ufHJo8fn9brxbEc2gsiWkbj5o3bGwdWMT1vNNE7XW0VCpw87rZ1EEwjl4FJHFudMlPR1yfU5+uq0k0PACo="
|
||||
distributions: "sdist bdist_wheel"
|
||||
on:
|
||||
branch: master
|
||||
tags: true
|
||||
condition: $TRAVIS_PYTHON_VERSION == "3.6" && $TRAVIS_OS_NAME == "linux"
|
||||
skip_upload_docs: true
|
||||
43
MANIFEST.in
43
MANIFEST.in
@@ -1,43 +0,0 @@
|
||||
# requirements
|
||||
recursive-include requirements *
|
||||
|
||||
# git
|
||||
include .git_archival.txt
|
||||
|
||||
# docker
|
||||
include .dockerignore
|
||||
recursive-include .docker *
|
||||
|
||||
# tests
|
||||
include .coveragerc
|
||||
recursive-include tests *.bin
|
||||
recursive-include tests *.jpg
|
||||
recursive-include tests *.jsonl
|
||||
recursive-include tests *.png
|
||||
recursive-include tests *.pdf
|
||||
recursive-include tests *.py
|
||||
recursive-include tests *.rst
|
||||
recursive-include tests *.txt
|
||||
recursive-exclude tests/resources/private *
|
||||
|
||||
# documentation
|
||||
include LICENSE
|
||||
include *.rst
|
||||
recursive-exclude .github *
|
||||
recursive-include docs *.py
|
||||
recursive-include docs *.rst
|
||||
recursive-include docs *.svg
|
||||
recursive-exclude docs/_build *
|
||||
|
||||
|
||||
# support files
|
||||
recursive-include src/ocrmypdf/data *
|
||||
include *.py
|
||||
exclude tasks.py
|
||||
recursive-exclude .travis *
|
||||
exclude .travis*
|
||||
|
||||
|
||||
# code
|
||||
exclude src/ocrmypdf/lib/_leptonica.py
|
||||
exclude scratch.py
|
||||
112
README.md
112
README.md
@@ -1,15 +1,13 @@
|
||||
OCRmyPDF
|
||||
========
|
||||
<img src="docs/images/logo.svg" width="240" alt="OCRmyPDF">
|
||||
|
||||
[![Travis build status][travis]](https://travis-ci.org/jbarlow83/OCRmyPDF) [![PyPI version][pypi]](https://pypi.org/project/ocrmypdf/) ![Homebrew version][homebrew] ![ReadTheDocs][docs]
|
||||
[![Build Status](https://github.com/jbarlow83/OCRmyPDF/actions/workflows/build.yml/badge.svg)](https://github.com/jbarlow83/OCRmyPDF/actions/workflows/build.yml) [![PyPI version][pypi]](https://pypi.org/project/ocrmypdf/) ![Homebrew version][homebrew] ![ReadTheDocs][docs] ![Python versions][pyversions]
|
||||
|
||||
[azure]: https://dev.azure.com/jim0585/ocrmypdf/_apis/build/status/jbarlow83.OCRmyPDF?branchName=master
|
||||
[travis]: https://travis-ci.org/jbarlow83/OCRmyPDF.svg?branch=master "Travis build status"
|
||||
|
||||
[pypi]: https://img.shields.io/pypi/v/ocrmypdf.svg "PyPI version"
|
||||
|
||||
[homebrew]: https://img.shields.io/homebrew/v/ocrmypdf.svg "Homebrew version"
|
||||
|
||||
[docs]: https://readthedocs.org/projects/ocrmypdf/badge/?version=latest "RTD"
|
||||
[pyversions]: https://img.shields.io/pypi/pyversions/ocrmypdf "Supported Python versions"
|
||||
|
||||
OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched or copy-pasted.
|
||||
|
||||
@@ -27,15 +25,14 @@ ocrmypdf # it's a scriptable command line program
|
||||
|
||||
[See the release notes for details on the latest changes](https://ocrmypdf.readthedocs.io/en/latest/release_notes.html).
|
||||
|
||||
Main features
|
||||
-------------
|
||||
## Main features
|
||||
|
||||
- Generates a searchable [PDF/A](https://en.wikipedia.org/?title=PDF/A) file from a regular PDF
|
||||
- Places OCR text accurately below the image to ease copy / paste
|
||||
- Keeps the exact resolution of the original embedded images
|
||||
- When possible, inserts OCR information as a "lossless" operation without disrupting any other content
|
||||
- Optimizes PDF images, often producing files smaller than the input file
|
||||
- If requested deskews and/or cleans the image before performing OCR
|
||||
- If requested, deskews and/or cleans the image before performing OCR
|
||||
- Validates input and output files
|
||||
- Distributes work across all available CPU cores
|
||||
- Uses [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) engine to recognize more than [100 languages](https://github.com/tesseract-ocr/tessdata)
|
||||
@@ -44,10 +41,9 @@ Main features
|
||||
|
||||
For details: please consult the [documentation](https://ocrmypdf.readthedocs.io/en/latest/).
|
||||
|
||||
Motivation
|
||||
----------
|
||||
## Motivation
|
||||
|
||||
I searched the web for a free command line tool to OCR PDF files on Linux/UNIX: I found many, but none of them were really satisfying.
|
||||
I searched the web for a free command line tool to OCR PDF files: I found many, but none of them were really satisfying:
|
||||
|
||||
- Either they produced PDF files with misplaced text under the image (making copy/paste impossible)
|
||||
- Or they did not handle accents and multilingual characters
|
||||
@@ -59,33 +55,23 @@ I searched the web for a free command line tool to OCR PDF files on Linux/UNIX:
|
||||
|
||||
...so I decided to develop my own tool.
|
||||
|
||||
Installation
|
||||
------------
|
||||
## Installation
|
||||
|
||||
Linux, UNIX, and macOS are supported. Windows is not directly supported but there is a Docker image available that runs on Windows.
|
||||
Linux, Windows, macOS and FreeBSD are supported. Docker images are also available.
|
||||
|
||||
Users of Debian 9 or later or Ubuntu 16.10 or later may simply
|
||||
|
||||
```bash
|
||||
apt-get install ocrmypdf
|
||||
```
|
||||
|
||||
and users of Fedora 29 or later may simply
|
||||
|
||||
```bash
|
||||
dnf install ocrmypdf
|
||||
```
|
||||
|
||||
and macOS users with Homebrew may simply
|
||||
|
||||
```bash
|
||||
brew install ocrmypdf
|
||||
```
|
||||
| Operating system | Install command |
|
||||
| ----------------------------- | ------------------------------|
|
||||
| Debian, Ubuntu | ``apt install ocrmypdf`` |
|
||||
| Windows Subsystem for Linux | ``apt install ocrmypdf`` |
|
||||
| Fedora | ``dnf install ocrmypdf`` |
|
||||
| macOS | ``brew install ocrmypdf`` |
|
||||
| LinuxBrew | ``brew install ocrmypdf`` |
|
||||
| FreeBSD | ``pkg install py37-ocrmypdf`` |
|
||||
| Conda | ``conda install ocrmypdf`` |
|
||||
|
||||
For everyone else, [see our documentation](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for installation steps.
|
||||
|
||||
Languages
|
||||
---------
|
||||
## Languages
|
||||
|
||||
OCRmyPDF uses Tesseract for OCR, and relies on its language packs. For Linux users, you can often find packages that provide language packs:
|
||||
|
||||
@@ -94,15 +80,20 @@ OCRmyPDF uses Tesseract for OCR, and relies on its language packs. For Linux use
|
||||
apt-cache search tesseract-ocr
|
||||
|
||||
# Debian/Ubuntu users
|
||||
apt-get install tesseract-ocr-chi-sim # Example: Install Chinese Simplified language back
|
||||
apt-get install tesseract-ocr-chi-sim # Example: Install Chinese Simplified language pack
|
||||
|
||||
# Arch Linux users
|
||||
pacman -S tesseract-data-eng tesseract-data-deu # Example: Install the English and German language packs
|
||||
|
||||
# brew macOS users
|
||||
brew install tesseract-lang
|
||||
```
|
||||
|
||||
You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple languages can be requested.
|
||||
|
||||
Documentation and support
|
||||
-------------------------
|
||||
## Documentation and support
|
||||
|
||||
Once ocrmypdf is installed, the built-in help which explains the command syntax and options can be accessed via:
|
||||
Once OCRmyPDF is installed, the built-in help which explains the command syntax and options can be accessed via:
|
||||
|
||||
```bash
|
||||
ocrmypdf --help
|
||||
@@ -110,42 +101,37 @@ ocrmypdf --help
|
||||
|
||||
Our [documentation is served on Read the Docs](https://ocrmypdf.readthedocs.io/en/latest/index.html).
|
||||
|
||||
If you detect an issue, please:
|
||||
Please report issues on our [GitHub issues](https://github.com/jbarlow83/OCRmyPDF/issues) page, and follow the issue template for a quick response.
|
||||
|
||||
- Check whether your issue is already known
|
||||
- If no problem report exists on github, please create one here: <https://github.com/jbarlow83/OCRmyPDF/issues>
|
||||
- Describe your problem thoroughly
|
||||
- Append the console output of the script when running the debug mode (`-v 1` option)
|
||||
- If possible provide your input PDF file as well as the content of the temporary folder (using a file sharing service like Dropbox)
|
||||
## Requirements
|
||||
|
||||
Requirements
|
||||
------------
|
||||
In addition to the required Python version (3.6+), OCRmyPDF requires external program installations of Ghostscript, Tesseract OCR, QPDF, and Leptonica. OCRmyPDF is pure Python, but uses CFFI to portably generate library bindings. OCRmyPDF works on pretty much everything: Linux, macOS, Windows and FreeBSD.
|
||||
|
||||
Runs on CPython 3.5, 3.6 and 3.7. Requires external program installations of Ghostscript, Tesseract OCR, QPDF, and Leptonica. ocrmypdf is pure Python, but uses CFFI to portably generate library bindings.
|
||||
|
||||
Press & Media
|
||||
-------------
|
||||
## Press & Media
|
||||
|
||||
- [Going paperless with OCRmyPDF](https://medium.com/@ikirichenko/going-paperless-with-ocrmypdf-e2f36143f46a)
|
||||
- [Converting a scanned document into a compressed searchable PDF with redactions](https://medium.com/@treyharris/converting-a-scanned-document-into-a-compressed-searchable-pdf-with-redactions-63f61c34fe4c)
|
||||
- [c't 1-2014, page 59](http://heise.de/-2279695): Detailed presentation of OCRmyPDF v1.0 in the leading German IT magazine c't
|
||||
- [heise Open Source, 09/2014: Texterkennung mit OCRmyPDF](http://heise.de/-2356670)
|
||||
- [c't 1-2014, page 59](https://heise.de/-2279695): Detailed presentation of OCRmyPDF v1.0 in the leading German IT magazine c't
|
||||
- [heise Open Source, 09/2014: Texterkennung mit OCRmyPDF](https://heise.de/-2356670)
|
||||
- [heise Durchsuchbare PDF-Dokumente mit OCRmyPDF erstellen](https://www.heise.de/ratgeber/Durchsuchbare-PDF-Dokumente-mit-OCRmyPDF-erstellen-4607592.html)
|
||||
- [Excellent Utilities: OCRmyPDF](https://www.linuxlinks.com/excellent-utilities-ocrmypdf-add-ocr-text-layer-scanned-pdfs/)
|
||||
|
||||
Business enquiries
|
||||
------------------
|
||||
## Business enquiries
|
||||
|
||||
OCRmyPDF would not be the software that it is today is without companies and users choosing to provide support for feature development and consulting enquiries. We are happy to discuss all enquiries, whether for extending the existing feature set, or integrating OCRmyPDF into a larger system.
|
||||
OCRmyPDF would not be the software that it is today without companies and users choosing to provide support for feature development and consulting enquiries. We are happy to discuss all enquiries, whether for extending the existing feature set, or integrating OCRmyPDF into a larger system.
|
||||
|
||||
License
|
||||
-------
|
||||
## License
|
||||
|
||||
The OCRmyPDF software is licensed under the GNU GPLv3. Certain files are covered by other licenses, as noted in their source files.
|
||||
The OCRmyPDF software is licensed under the Mozilla Public License 2.0
|
||||
(MPL-2.0). This license permits integration of OCRmyPDF with other code,
|
||||
including commercial and closed source, but asks you to publish source-level
|
||||
modifications you make to OCRmyPDF.
|
||||
|
||||
The license for each test file varies, and is noted in tests/resources/README.rst. The documentation is licensed under Creative Commons Attribution-ShareAlike 4.0 (CC-BY-SA 4.0).
|
||||
Some components of OCRmyPDF have other licenses, as noted in those files and the
|
||||
``debian/copyright`` file. Most files in ``misc/`` use the MIT license, and the
|
||||
documentation and test files are generally licensed under Creative Commons
|
||||
Attribution-ShareAlike 4.0 (CC-BY-SA 4.0).
|
||||
|
||||
OCRmyPDF versions prior to 6.0 were distributed under the MIT License.
|
||||
|
||||
Disclaimer
|
||||
----------
|
||||
## Disclaimer
|
||||
|
||||
The software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
||||
743
debian/copyright
vendored
743
debian/copyright
vendored
@@ -2,35 +2,70 @@ Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
Upstream-Name: OCRmyPDF
|
||||
Upstream-Contact: James R. Barlow <barlow.jim@gmail.com>
|
||||
Source: https://github.com/jbarlow83/OCRmyPDF
|
||||
Files-Excluded: tests/resources/milk.pdf
|
||||
|
||||
Files: *
|
||||
Copyright:
|
||||
(C) 2013-2017 The OCRmyPDF Authors
|
||||
(C) 2013-2016, 2015-2017 2016, 2017, 2017-2018, 2018 James R. Barlow
|
||||
License: GPL-3+
|
||||
(C) 2013-2015 Julien Pfefferkorn
|
||||
(C) 2015-2020 James R. Barlow
|
||||
(C) 2019 Martin Wind
|
||||
License: MPL-2.0
|
||||
|
||||
Files: misc/*
|
||||
Copyright:
|
||||
(C) 2020 James R. Barlow
|
||||
License: Expat
|
||||
|
||||
Files: misc/completion/ocrmypdf.bash
|
||||
Copyright:
|
||||
(C) 2019 Frank Pille
|
||||
(C) 2020 Alex Willner
|
||||
License: Expat
|
||||
|
||||
Files: misc/completion/ocrmypdf.fish
|
||||
Copyright:
|
||||
(C) 2020 James R. Barlow
|
||||
License: Expat
|
||||
|
||||
Files: misc/batch.py
|
||||
Copyright:
|
||||
(C) 2016 findingorder: https://github.com/findingorder
|
||||
License: Expat
|
||||
|
||||
Files: misc/synology.py
|
||||
Copyright:
|
||||
(C) github.com/Enantiomerie
|
||||
License: Expat
|
||||
|
||||
Files: misc/watcher.py
|
||||
Copyright:
|
||||
(C) 2019 Ian Alexander: https://github.com/ianalexander
|
||||
(C) 2020 James R. Barlow
|
||||
License: Expat
|
||||
|
||||
Files: misc/webservice.py
|
||||
Copyright: (C) 2019 James R. Barlow
|
||||
License: AGPL-3+
|
||||
|
||||
Files: docs tests/resources/*
|
||||
Copyright: (C) 2013-2018 James R. Barlow
|
||||
License: CC-BY-SA-4.0
|
||||
|
||||
Files: docs/images/bitmap_vs_svg.svg
|
||||
Copyright: (C) 2006 Yug
|
||||
License: CC-BY-SA-2.5
|
||||
|
||||
Files: src/ocrmypdf/hocrtransform.py
|
||||
Copyright: (C) 2010 Jonathan Brinley <jonathanbrinley@gmail.com>
|
||||
(C) 2013-14 Julien Pfefferkorn
|
||||
(C) 2015-16 James R. Barlow
|
||||
License: Expat
|
||||
|
||||
Files: src/ocrmypdf/pdfa.py
|
||||
Copyright: (C) 2015 James R. Barlow
|
||||
(C) 1986-2017 The authors of GhostScript
|
||||
License: GPL-3+
|
||||
|
||||
Files: src/ocrmypdf/_unicodefun.py
|
||||
Copyright: (C) 2014 Armin Ronacher
|
||||
(C) 2017 James R. Barlow
|
||||
License: BSD-3-clause
|
||||
|
||||
Files: tests/spoof/*
|
||||
Files: tests/plugins/*
|
||||
Copyright: (C) 2016, 2017, 2016-2018 James R. Barlow
|
||||
License: Expat
|
||||
|
||||
@@ -82,12 +117,17 @@ License: CC-BY-SA-3.0
|
||||
Files: tests/resources/typewriter.png tests/resources/2400dpi.pdf
|
||||
Copyright: (C) 2005 Ellywa
|
||||
License: GFDL-1.2+ or CC-BY-SA-1.0 or CC-BY-SA-2.0 or CC-BY-SA-2.5 or CC-BY-SA-3.0
|
||||
Comment:
|
||||
Obtained from: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif
|
||||
|
||||
Files: tests/resources/overlay.pdf
|
||||
Copyright: (C) 2017 Max Anderson
|
||||
License: Expat
|
||||
|
||||
Files: tests/resources/baiona*.png
|
||||
Files:
|
||||
tests/resources/baiona*.png
|
||||
tests/resources/baiona*.jpg
|
||||
tests/resources/link.pdf
|
||||
Copyright: (C) 2014 Euskaldunaa
|
||||
License: CC-BY-SA-4.0
|
||||
|
||||
@@ -95,11 +135,12 @@ Files: tests/resources/vector.pdf
|
||||
Copyright: (C) 2018 Catscratch
|
||||
License: Expat
|
||||
|
||||
Files: tests/resources/enron*.pdf
|
||||
Copyright: EnronData.org
|
||||
License: CC-BY-3.0
|
||||
See: https://enrondata.readthedocs.io/en/latest/data/edo-enron-email-pst-dataset/
|
||||
Comment: Unprocessed.
|
||||
Files: tests/resources/3small.pdf
|
||||
Copyright: (C) 2014 Euskaldunaa
|
||||
(C) 2017 James R. Barlow
|
||||
(C) 2005 Ellywa
|
||||
License: CC-BY-SA-4.0 and (GFDL-1.2+ or CC-BY-SA-1.0 or CC-BY-SA-2.0 or CC-BY-SA-2.5 or CC-BY-SA-3.0)
|
||||
Comment: concatenation of baiona_gray.png, crom.png and typewriter.png/2400dpi.pdf
|
||||
|
||||
Files: src/ocrmypdf/data/sRGB.icc
|
||||
Copyright: Kai-Uwe Behrmann <www.behrmann.name>
|
||||
@@ -113,6 +154,13 @@ Files: debian/*
|
||||
Copyright: (C) 2016 Sean Whitton <spwhitton@spwhitton.name>
|
||||
License: GPL-3+
|
||||
|
||||
License: MPL-2.0
|
||||
This Source Code Form is subject to the terms of the Mozilla Public
|
||||
License, v. 2.0.
|
||||
.
|
||||
On Debian systems the full text of the MPL-2.0 can be found in
|
||||
/usr/share/common-licenses/MPL-2.0.
|
||||
|
||||
License: GPL-3+
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
@@ -130,6 +178,669 @@ License: GPL-3+
|
||||
On Debian systems, the complete text of the GNU General
|
||||
Public License version 3 can be found in "/usr/share/common-licenses/GPL-3".
|
||||
|
||||
License: AGPL-3+
|
||||
GNU AFFERO GENERAL PUBLIC LICENSE
|
||||
Version 3, 19 November 2007
|
||||
.
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
.
|
||||
Preamble
|
||||
.
|
||||
The GNU Affero General Public License is a free, copyleft license for
|
||||
software and other kinds of works, specifically designed to ensure
|
||||
cooperation with the community in the case of network server software.
|
||||
.
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
our General Public Licenses are intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users.
|
||||
.
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
.
|
||||
Developers that use our General Public Licenses protect your rights
|
||||
with two steps: (1) assert copyright on the software, and (2) offer
|
||||
you this License which gives you legal permission to copy, distribute
|
||||
and/or modify the software.
|
||||
.
|
||||
A secondary benefit of defending all users' freedom is that
|
||||
improvements made in alternate versions of the program, if they
|
||||
receive widespread use, become available for other developers to
|
||||
incorporate. Many developers of free software are heartened and
|
||||
encouraged by the resulting cooperation. However, in the case of
|
||||
software used on network servers, this result may fail to come about.
|
||||
The GNU General Public License permits making a modified version and
|
||||
letting the public access it on a server without ever releasing its
|
||||
source code to the public.
|
||||
.
|
||||
The GNU Affero General Public License is designed specifically to
|
||||
ensure that, in such cases, the modified source code becomes available
|
||||
to the community. It requires the operator of a network server to
|
||||
provide the source code of the modified version running there to the
|
||||
users of that server. Therefore, public use of a modified version, on
|
||||
a publicly accessible server, gives the public access to the source
|
||||
code of the modified version.
|
||||
.
|
||||
An older license, called the Affero General Public License and
|
||||
published by Affero, was designed to accomplish similar goals. This is
|
||||
a different license, not a version of the Affero GPL, but Affero has
|
||||
released a new version of the Affero GPL which permits relicensing under
|
||||
this license.
|
||||
.
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
.
|
||||
TERMS AND CONDITIONS
|
||||
.
|
||||
0. Definitions.
|
||||
.
|
||||
"This License" refers to version 3 of the GNU Affero General Public License.
|
||||
.
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
.
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
.
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
.
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
.
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
.
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
.
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
.
|
||||
1. Source Code.
|
||||
.
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
.
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
.
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
.
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
.
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
.
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
.
|
||||
2. Basic Permissions.
|
||||
.
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
.
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
.
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
.
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
.
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
.
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
.
|
||||
4. Conveying Verbatim Copies.
|
||||
.
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
.
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
.
|
||||
5. Conveying Modified Source Versions.
|
||||
.
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
.
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
.
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
.
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
.
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
.
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
.
|
||||
6. Conveying Non-Source Forms.
|
||||
.
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
.
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
.
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
.
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
.
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
.
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
.
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
.
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
.
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
.
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
.
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
.
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
.
|
||||
7. Additional Terms.
|
||||
.
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
.
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
.
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
.
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
.
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
.
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
.
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
.
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
.
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
.
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
.
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
.
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
.
|
||||
8. Termination.
|
||||
.
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
.
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
.
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
.
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
.
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
.
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
.
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
.
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
.
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
.
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
.
|
||||
11. Patents.
|
||||
.
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
.
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
.
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
.
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
.
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
.
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
.
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
.
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
.
|
||||
12. No Surrender of Others' Freedom.
|
||||
.
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
.
|
||||
13. Remote Network Interaction; Use with the GNU General Public License.
|
||||
.
|
||||
Notwithstanding any other provision of this License, if you modify the
|
||||
Program, your modified version must prominently offer all users
|
||||
interacting with it remotely through a computer network (if your version
|
||||
supports such interaction) an opportunity to receive the Corresponding
|
||||
Source of your version by providing access to the Corresponding Source
|
||||
from a network server at no charge, through some standard or customary
|
||||
means of facilitating copying of software. This Corresponding Source
|
||||
shall include the Corresponding Source for any work covered by version 3
|
||||
of the GNU General Public License that is incorporated pursuant to the
|
||||
following paragraph.
|
||||
.
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the work with which it is combined will remain governed by version
|
||||
3 of the GNU General Public License.
|
||||
.
|
||||
14. Revised Versions of this License.
|
||||
.
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU Affero General Public License from time to time. Such new versions
|
||||
will be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
.
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU Affero General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU Affero General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
.
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU Affero General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
.
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
.
|
||||
15. Disclaimer of Warranty.
|
||||
.
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
.
|
||||
16. Limitation of Liability.
|
||||
.
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
.
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
.
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
.
|
||||
END OF TERMS AND CONDITIONS
|
||||
.
|
||||
How to Apply These Terms to Your New Programs
|
||||
.
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
.
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
.
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
.
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
.
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
.
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
.
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
.
|
||||
If your software can interact with users remotely through a computer
|
||||
network, you should also make sure that it provides a way for users to
|
||||
get its source. For example, if your program is a web application, its
|
||||
interface could display a "Source" link that leads users to an archive
|
||||
of the code. There are many ways you could offer source, and different
|
||||
solutions will be better for different programs; see section 13 for the
|
||||
specific requirements.
|
||||
.
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU AGPL, see
|
||||
<http://www.gnu.org/licenses/>.
|
||||
|
||||
License: Expat
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
|
||||
@@ -1,16 +1,34 @@
|
||||
=================
|
||||
Advanced features
|
||||
=================
|
||||
|
||||
Control of unpaper
|
||||
------------------
|
||||
==================
|
||||
|
||||
OCRmyPDF uses ``unpaper`` to provide the implementation of the ``--clean`` and ``--clean-final`` arguments. `unpaper <https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md>`_ provides a variety of image processing filters to improve images.
|
||||
OCRmyPDF uses ``unpaper`` to provide the implementation of the
|
||||
``--clean`` and ``--clean-final`` arguments.
|
||||
`unpaper <https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md>`__
|
||||
provides a variety of image processing filters to improve images.
|
||||
|
||||
By default, OCRmyPDF uses only ``unpaper`` arguments that were found to be safe to use on almost all files without having to inspect every page of the file afterwards. This is particularly true when only ``--clean`` is used, since that instructs OCRmyPDF to only clean the image before OCR and not the final image.
|
||||
By default, OCRmyPDF uses only ``unpaper`` arguments that were found to
|
||||
be safe to use on almost all files without having to inspect every page
|
||||
of the file afterwards. This is particularly true when only ``--clean``
|
||||
is used, since that instructs OCRmyPDF to only clean the image before
|
||||
OCR and not the final image.
|
||||
|
||||
However, if you wish to use the more aggressive options in ``unpaper``, you may use ``--unpaper-args '...'`` to override the OCRmyPDF's defaults and forward other arguments to unpaper. This option will forward arguments to ``unpaper`` without any knowledge of what that program considers to be valid arguments. The string of arguments must be quoted as shown in the examples below. No filename arguments may be included. OCRmyPDF will assume it can append input and output filename of intermediate images to the ``--unpaper-args`` string.
|
||||
However, if you wish to use the more aggressive options in ``unpaper``,
|
||||
you may use ``--unpaper-args '...'`` to override OCRmyPDF's defaults
|
||||
and forward other arguments to unpaper. This option will forward
|
||||
arguments to ``unpaper`` without any knowledge of what that program
|
||||
considers to be valid arguments. The string of arguments must be quoted
|
||||
as shown in the examples below. No filename arguments may be included.
|
||||
OCRmyPDF will assume it can append the input and output filenames of
|
||||
intermediate images to the ``--unpaper-args`` string.
|
||||
|
||||
In this example, we tell ``unpaper`` to expect two pages of text on a sheet (image), such as occurs when two facing pages of a book are scanned. ``unpaper`` uses this information to deskew each independently and clean up the margins of both.
|
||||
In this example, we tell ``unpaper`` to expect two pages of text on a
|
||||
sheet (image), such as occurs when two facing pages of a book are
|
||||
scanned. ``unpaper`` uses this information to deskew each independently
|
||||
and clean up the margins of both.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -19,40 +37,71 @@ In this example, we tell ``unpaper`` to expect two pages of text on a sheet (ima
|
||||
|
||||
.. warning::
|
||||
|
||||
Some ``unpaper`` features will reposition text within the image. ``--clean-final`` is recommended to avoid this issue.
|
||||
Some ``unpaper`` features will reposition text within the image.
|
||||
``--clean-final`` is recommended to avoid this issue.
|
||||
|
||||
.. warning::
|
||||
|
||||
Some ``unpaper`` features cause multiple input or output files to be consumed or produced. OCRmyPDF requires ``unpaper`` to consume one file and produce one file. An deviation from that condition will result in errors.
|
||||
Some ``unpaper`` features cause multiple input or output files to be
|
||||
consumed or produced. OCRmyPDF requires ``unpaper`` to consume one
|
||||
file and produce one file. Any deviation from that condition will
|
||||
result in errors.
|
||||
|
||||
.. note::
|
||||
|
||||
``unpaper`` uses uncompressed PBM/PGM/PPM files for its intermediate files. For large images or documents, it can take a lot of temporary disk space.
|
||||
``unpaper`` uses uncompressed PBM/PGM/PPM files for its intermediate
|
||||
files. For large images or documents, it can take a lot of temporary
|
||||
disk space.
|
||||
|
||||
Control of OCR options
|
||||
----------------------
|
||||
======================
|
||||
|
||||
OCRmyPDF provides many features to control the behavior of the OCR engine, Tesseract.
|
||||
OCRmyPDF provides many features to control the behavior of the OCR
|
||||
engine, Tesseract.
|
||||
|
||||
When OCR is skipped
|
||||
"""""""""""""""""""
|
||||
-------------------
|
||||
|
||||
If a page in a PDF seems to have text, by default OCRmyPDF will exit without modifying the PDF. This is to ensure that PDFs that were previously OCRed or were "born digital" rather than scanned are not processed.
|
||||
If a page in a PDF seems to have text, by default OCRmyPDF will exit
|
||||
without modifying the PDF. This is to ensure that PDFs that were
|
||||
previously OCRed or were "born digital" rather than scanned are not
|
||||
processed.
|
||||
|
||||
If ``--skip-text`` is issued, then no OCR will be performed on pages that already have text. The page will be copied to the output. This may be useful for documents that contain both "born digital" and scanned content, or to use OCRmyPDF to normalize and convert to PDF/A regardless of their contents.
|
||||
If ``--skip-text`` is issued, then no OCR will be performed on pages
|
||||
that already have text. The page will be copied to the output. This may
|
||||
be useful for documents that contain both "born digital" and scanned
|
||||
content, or to use OCRmyPDF to normalize and convert to PDF/A regardless
|
||||
of their contents.
|
||||
|
||||
If ``--redo-ocr`` is issued, then a detailed text analysis is performed. Text is categorized as either visible or invisible. Invisible text (OCR) is stripped out. Then an image of each page is created with visible text masked out. The page image is sent for OCR, and any additional text is inserted as OCR. If a file contains a mix of text and bitmap images that contain text, OCRmyPDF will locate the additional text in images without disrupting the existing text.
|
||||
If ``--redo-ocr`` is issued, then a detailed text analysis is performed.
|
||||
Text is categorized as either visible or invisible. Invisible text (OCR)
|
||||
is stripped out. Then an image of each page is created with visible text
|
||||
masked out. The page image is sent for OCR, and any additional text is
|
||||
inserted as OCR. If a file contains a mix of text and bitmap images that
|
||||
contain text, OCRmyPDF will locate the additional text in images without
|
||||
disrupting the existing text.
|
||||
|
||||
If ``--force-ocr`` is issued, then all pages will be rasterized to images, discarding any hidden OCR text, and rasterizing any printable text. This is useful for redoing OCR, for fixing OCR text with a damaged character map (text is selectable but not searchable), and destroying redacted information. Any forms and vector graphics will be rasterized as well.
|
||||
If ``--force-ocr`` is issued, then all pages will be rasterized to
|
||||
images, discarding any hidden OCR text, and rasterizing any printable
|
||||
text. This is useful for redoing OCR, for fixing OCR text with a damaged
|
||||
character map (text is selectable but not searchable), and destroying
|
||||
redacted information. Any forms and vector graphics will be rasterized
|
||||
as well.
|
||||
|
||||
Time and image size limits
|
||||
""""""""""""""""""""""""""
|
||||
--------------------------
|
||||
|
||||
By default, OCRmyPDF permits tesseract to run for three minutes (180 seconds) per page. This is usually more than enough time to find all text on a reasonably sized page with modern hardware.
|
||||
By default, OCRmyPDF permits tesseract to run for three minutes (180
|
||||
seconds) per page. This is usually more than enough time to find all
|
||||
text on a reasonably sized page with modern hardware.
|
||||
|
||||
If a page is skipped, it will be inserted without OCR. If preprocessing was requested, the preprocessed image layer will be inserted.
|
||||
If a page is skipped, it will be inserted without OCR. If preprocessing
|
||||
was requested, the preprocessed image layer will be inserted.
|
||||
|
||||
If you want to adjust the amount of time spent on OCR, change ``--tesseract-timeout``. You can also automatically skip images that exceed a certain number of megapixels with ``--skip-big``. (A 300 DPI, 8.5×11" page is 8.4 megapixels.)
|
||||
If you want to adjust the amount of time spent on OCR, change
|
||||
``--tesseract-timeout``. You can also automatically skip images that
|
||||
exceed a certain number of megapixels with ``--skip-big``. (A 300 DPI,
|
||||
8.5×11" page is 8.4 megapixels.)
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -60,21 +109,26 @@ If you want to adjust the amount of time spent on OCR, change ``--tesseract-time
|
||||
ocrmypdf --tesseract-timeout 300 --skip-big 50 bigfile.pdf output.pdf
|
||||
|
||||
Overriding default tesseract
|
||||
""""""""""""""""""""""""""""
|
||||
----------------------------
|
||||
|
||||
OCRmyPDF checks the system ``PATH`` for the ``tesseract`` binary.
|
||||
|
||||
Some relevant environment variables that influence Tesseract's behavior include:
|
||||
Some relevant environment variables that influence Tesseract's behavior
|
||||
include:
|
||||
|
||||
.. envvar:: TESSDATA_PREFIX
|
||||
|
||||
Overrides the path to Tesseract's data files. This can allow simultaneous installation of the "best" and "fast" training data sets. OCRmyPDF does not manage this environment variable.
|
||||
Overrides the path to Tesseract's data files. This can allow
|
||||
simultaneous installation of the "best" and "fast" training data
|
||||
sets. OCRmyPDF does not manage this environment variable.
|
||||
|
||||
.. envvar:: OMP_THREAD_LIMIT
|
||||
|
||||
Controls the number of threads Tesseract will use. OCRmyPDF will manage this environment if it is not already set. (Currently, it will set it to 1 because this gives the best results in testing.)
|
||||
Controls the number of threads Tesseract will use. OCRmyPDF will
|
||||
manage this environment variable if it is not already set.
|
||||
|
||||
For example, if you have a development build of Tesseract don't wish to use the system installation, you can launch OCRmyPDF as follows:
|
||||
For example, if you have a development build of Tesseract and don't wish to
|
||||
use the system installation, you can launch OCRmyPDF as follows:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -83,26 +137,34 @@ For example, if you have a development build of Tesseract don't wish to use the
|
||||
TESSDATA_PREFIX=/home/user/src/tesseract \
|
||||
ocrmypdf input.pdf output.pdf
|
||||
|
||||
In this example ``TESSDATA_PREFIX`` is required to redirect Tesseract to an alternate folder for its "tessdata" files.
|
||||
In this example ``TESSDATA_PREFIX`` is required to redirect Tesseract to
|
||||
an alternate folder for its "tessdata" files.
|
||||
|
||||
Overriding other support programs
|
||||
"""""""""""""""""""""""""""""""""
|
||||
---------------------------------
|
||||
|
||||
In addition to tesseract, OCRmyPDF uses the following external binaries:
|
||||
|
||||
* ``gs`` (Ghostscript)
|
||||
* ``unpaper``
|
||||
* ``qpdf``
|
||||
|
||||
In each case OCRmyPDF will search the ``PATH`` environment variable to locate the binaries.
|
||||
- ``gs`` (Ghostscript)
|
||||
- ``unpaper``
|
||||
- ``pngquant``
|
||||
- ``jbig2``
|
||||
|
||||
In each case OCRmyPDF will search the ``PATH`` environment variable to
|
||||
locate the binaries.
|
||||
|
||||
Changing tesseract configuration variables
|
||||
""""""""""""""""""""""""""""""""""""""""""
|
||||
------------------------------------------
|
||||
|
||||
You can override tesseract's default `control parameters <https://github.com/tesseract-ocr/tesseract/wiki/ControlParams>`_ with a configuration file.
|
||||
You can override tesseract's default `control
|
||||
parameters <https://github.com/tesseract-ocr/tesseract/wiki/ControlParams>`__
|
||||
with a configuration file.
|
||||
|
||||
As an example, this configuration will disable Tesseract's dictionary for current language. Normally the dictionary is helpful for interpolating words that are unclear, but it may interfere with OCR if the document does not contain many words (for example, a list of part numbers).
|
||||
As an example, this configuration will disable Tesseract's dictionary
|
||||
for the current language. Normally the dictionary is helpful for
|
||||
interpolating words that are unclear, but it may interfere with OCR if
|
||||
the document does not contain many words (for example, a list of part
|
||||
numbers).
|
||||
|
||||
Create a file named "no-dict.cfg" with these contents:
|
||||
|
||||
@@ -120,11 +182,11 @@ then run ocrmypdf as follows (along with any other desired arguments):
|
||||
|
||||
.. warning::
|
||||
|
||||
Some combinations of control parameters will break Tesseract or break assumptions that OCRmyPDF makes about Tesseract's output.
|
||||
|
||||
Some combinations of control parameters will break Tesseract or break
|
||||
assumptions that OCRmyPDF makes about Tesseract's output.
|
||||
|
||||
Changing the PDF renderer
|
||||
-------------------------
|
||||
=========================
|
||||
|
||||
rasterizing
|
||||
Converting a PDF to an image for display.
|
||||
@@ -132,42 +194,63 @@ rasterizing
|
||||
rendering
|
||||
Creating a new PDF from other data (such as an existing PDF).
|
||||
|
||||
|
||||
OCRmyPDF has these PDF renderers: ``sandwich`` and ``hocr``. The renderer may be selected using ``--pdf-renderer``. The default is ``auto`` which lets OCRmyPDF select the renderer to use. Currently, ``auto`` always selects ``sandwich``.
|
||||
OCRmyPDF has these PDF renderers: ``sandwich`` and ``hocr``. The
|
||||
renderer may be selected using ``--pdf-renderer``. The default is
|
||||
``auto`` which lets OCRmyPDF select the renderer to use. Currently,
|
||||
``auto`` always selects ``sandwich``.
|
||||
|
||||
The ``sandwich`` renderer
|
||||
"""""""""""""""""""""""""
|
||||
-------------------------
|
||||
|
||||
The ``sandwich`` renderer uses Tesseract's new text-only PDF feature, which produces a PDF page that lays out the OCR in invisible text. This page is then "sandwiched" onto the original PDF page, allowing lossless application of OCR even to PDF pages that contain other vector objects.
|
||||
The ``sandwich`` renderer uses Tesseract's new text-only PDF feature,
|
||||
which produces a PDF page that lays out the OCR in invisible text. This
|
||||
page is then "sandwiched" onto the original PDF page, allowing lossless
|
||||
application of OCR even to PDF pages that contain other vector objects.
|
||||
|
||||
Currently this is the best renderer for most uses, however it is implemented in Tesseract so OCRmyPDF cannot influence it. Currently some problematic PDF viewers like Mozilla PDF.js and macOS Preview have problems with segmenting its text output, and mightrunseveralwordstogether.
|
||||
Currently this is the best renderer for most uses, however it is
|
||||
implemented in Tesseract so OCRmyPDF cannot influence it. Currently some
|
||||
problematic PDF viewers like Mozilla PDF.js and macOS Preview have
|
||||
problems with segmenting its text output, and
|
||||
mightrunseveralwordstogether.
|
||||
|
||||
When image preprocessing features like ``--deskew`` are used, the original PDF will be rendered as a full page and the OCR layer will be placed on top.
|
||||
When image preprocessing features like ``--deskew`` are used, the
|
||||
original PDF will be rendered as a full page and the OCR layer will be
|
||||
placed on top.
|
||||
|
||||
The ``hocr`` renderer
|
||||
"""""""""""""""""""""
|
||||
---------------------
|
||||
|
||||
The ``hocr`` renderer works with older versions of Tesseract. The image layer is copied from the original PDF page if possible, avoiding potentially lossy transcoding or loss of other PDF information. If preprocessing is specified, then the image layer is a new PDF.
|
||||
The ``hocr`` renderer works with older versions of Tesseract. The image
|
||||
layer is copied from the original PDF page if possible, avoiding
|
||||
potentially lossy transcoding or loss of other PDF information. If
|
||||
preprocessing is specified, then the image layer is a new PDF.
|
||||
|
||||
Unlike ``sandwich`` this renderer is implemented within OCRmyPDF; anyone looking to customize how OCR is presented should look here. A major disadvantage of this renderer is it not capable of correctly handling text outside the Latin alphabet. Pull requests to improve the situation are welcome.
|
||||
Unlike ``sandwich`` this renderer is implemented within OCRmyPDF; anyone
|
||||
looking to customize how OCR is presented should look here. A major
|
||||
disadvantage of this renderer is that it is not capable of correctly handling
|
||||
text outside the Latin alphabet. Pull requests to improve the situation
|
||||
are welcome.
|
||||
|
||||
Currently, this renderer has the best compatibility with Mozilla's PDF.js viewer.
|
||||
Currently, this renderer has the best compatibility with Mozilla's
|
||||
PDF.js viewer.
|
||||
|
||||
This works in all versions of Tesseract.
|
||||
|
||||
The ``tesseract`` renderer
|
||||
""""""""""""""""""""""""""
|
||||
--------------------------
|
||||
|
||||
The ``tesseract`` renderer was removed. OCRmyPDF's new approach to text layer grafting makes it functionally equivalent to ``sandwich``.
|
||||
The ``tesseract`` renderer was removed. OCRmyPDF's new approach to text
|
||||
layer grafting makes it functionally equivalent to ``sandwich``.
|
||||
|
||||
Return code policy
|
||||
------------------
|
||||
==================
|
||||
|
||||
OCRmyPDF writes all messages to ``stderr``. ``stdout`` is reserved for piping
|
||||
output files. ``stdin`` is reserved for piping input files.
|
||||
OCRmyPDF writes all messages to ``stderr``. ``stdout`` is reserved for
|
||||
piping output files. ``stdin`` is reserved for piping input files.
|
||||
|
||||
The return codes generated by the OCRmyPDF are considered part of the stable
|
||||
user interface. They may be imported from ``ocrmypdf.exceptions``.
|
||||
The return codes generated by OCRmyPDF are considered part of the
|
||||
stable user interface. They may be imported from
|
||||
``ocrmypdf.exceptions``.
|
||||
|
||||
.. list-table:: Return codes
|
||||
:widths: 5 35 60
|
||||
@@ -218,22 +301,44 @@ user interface. They may be imported from ``ocrmypdf.exceptions``.
|
||||
|
||||
|
||||
Debugging the intermediate files
|
||||
--------------------------------
|
||||
================================
|
||||
|
||||
OCRmyPDF normally saves its intermediate results to a temporary folder and deletes this folder when it exits, whether it succeeded or failed.
|
||||
OCRmyPDF normally saves its intermediate results to a temporary folder
|
||||
and deletes this folder when it exits, whether it succeeded or failed.
|
||||
|
||||
If the ``-k`` argument is issued on the command line, OCRmyPDF will keep the temporary folder and print the location, whether it succeeded or failed (provided the Python interpreter did not crash). An example message is:
|
||||
If the ``-k`` argument is issued on the command line, OCRmyPDF will keep
|
||||
the temporary folder and print the location, whether it succeeded or
|
||||
failed (provided the Python interpreter did not crash). An example
|
||||
message is:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
Temporary working files saved at:
|
||||
/tmp/com.github.ocrmypdf.u20wpz07
|
||||
Temporary working files retained at:
|
||||
/tmp/ocrmypdf.io.u20wpz07
|
||||
|
||||
The organization of this folder is an implementation detail and subject to change between releases. However the general organization is that working files on a per page basis have the page number as a prefix (starting with page 1), an infix indicates the processing stage, and a suffix indicates the file type. Some important files include:
|
||||
The organization of this folder is an implementation detail and subject
|
||||
to change between releases. However the general organization is that
|
||||
working files on a per page basis have the page number as a prefix
|
||||
(starting with page 1), an infix indicates the processing stage, and a
|
||||
suffix indicates the file type. Some important files include:
|
||||
|
||||
* ``.page.png`` - what the input page looks like
|
||||
* ``.image`` - the image we will show the user if we are in a mode that changes the final appearance; may be in one of several image formats
|
||||
* ``.text.pdf`` - the OCR file; this will load as a blank page but should have visible text if checked with a tool like pdftotext or pdfminder.six
|
||||
* ``.ocr.png`` - the file that is sent to Tesseract for OCR; depending on arguments this may differ from the presentation image
|
||||
* ``layers.rendered.pdf`` - the composite PDF, before metadata repair and optimization
|
||||
* ``images/*`` - images extracted during the optimization process; here the prefix indicates a PDF object ID not a page number
|
||||
- ``_rasterize.png`` - what the input page looks like
|
||||
- ``_ocr.png`` - the file that is sent to Tesseract for OCR; depending
|
||||
on arguments this may differ from the presentation image
|
||||
- ``_pp_deskew.png`` - the image, after deskewing
|
||||
- ``_pp_clean.png`` - the image, after cleaning with unpaper
|
||||
- ``_ocr_tess.pdf`` - the OCR file; appears as a blank page with invisible
|
||||
text embedded
|
||||
- ``_ocr_tess.txt`` - the OCR text (not necessarily all text on the page,
|
||||
if the page is mixed format)
|
||||
- ``fix_docinfo.pdf`` - a temporary file created to fix the PDF DocumentInfo
|
||||
data structure
|
||||
- ``graft_layers.pdf`` - the rendered PDF with OCR layers grafted on
|
||||
- ``pdfa.pdf`` - ``graft_layers.pdf`` after conversion to PDF/A
|
||||
- ``pdfa.ps`` - a PostScript file used by Ghostscript for PDF/A conversion
|
||||
- ``optimize.pdf`` - the PDF generated before optimization
|
||||
- ``optimize.out.pdf`` - the PDF generated by optimization
|
||||
- ``origin`` - the input file
|
||||
- ``origin.pdf`` - the input file or the input image converted to PDF
|
||||
- ``images/*`` - images extracted during the optimization process; here
|
||||
the prefix indicates a PDF object ID not a page number
|
||||
|
||||
117
docs/api.rst
Normal file
117
docs/api.rst
Normal file
@@ -0,0 +1,117 @@
|
||||
======================
|
||||
Using the OCRmyPDF API
|
||||
======================
|
||||
|
||||
OCRmyPDF originated as a command line program and continues to have this
|
||||
legacy, but parts of it can be imported and used in other Python
|
||||
applications.
|
||||
|
||||
Some applications may want to consider running ocrmypdf from a
|
||||
subprocess call anyway, as this provides isolation of its activities.
|
||||
|
||||
Example
|
||||
=======
|
||||
|
||||
OCRmyPDF provides one high-level function to run its main engine from an
|
||||
application. The parameters are symmetric to the command line arguments
|
||||
and largely have the same functions.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
import ocrmypdf
|
||||
|
||||
if __name__ == '__main__': # To ensure correct behavior on Windows and macOS
|
||||
ocrmypdf.ocr('input.pdf', 'output.pdf', deskew=True)
|
||||
|
||||
With a few exceptions, all of the command line arguments are available
|
||||
and may be passed as equivalent keywords.
|
||||
|
||||
A few differences are that ``verbose`` and ``quiet`` are not available.
|
||||
Instead, output should be managed by configuring logging.
|
||||
|
||||
Parent process requirements
|
||||
---------------------------
|
||||
|
||||
The :func:`ocrmypdf.ocr` function runs OCRmyPDF similar to command line
|
||||
execution. To do this, it will:
|
||||
|
||||
- create a monitoring thread
|
||||
- create worker processes (on Linux, forking itself; on Windows and macOS, by
|
||||
spawning)
|
||||
- manage the signal flags of its worker processes
|
||||
- execute other subprocesses (forking and executing other programs)
|
||||
|
||||
The Python process that calls ``ocrmypdf.ocr()`` must be sufficiently
|
||||
privileged to perform these actions.
|
||||
|
||||
There is currently no option to manage how jobs are scheduled other
|
||||
than the argument ``jobs=`` which will limit the number of worker
|
||||
processes.
|
||||
|
||||
Creating a child process to call ``ocrmypdf.ocr()`` is suggested. That
|
||||
way your application will survive and remain interactive even if
|
||||
OCRmyPDF fails for any reason.
|
||||
|
||||
Programs that call ``ocrmypdf.ocr()`` should also install a SIGBUS signal
|
||||
handler (except on Windows), to raise an exception if access to a memory
|
||||
mapped file fails. OCRmyPDF may use memory mapping.
|
||||
|
||||
``ocrmypdf.ocr()`` will take a threading lock to prevent multiple runs of itself
|
||||
in the same Python interpreter process. This is not thread-safe, because of how
|
||||
OCRmyPDF's plugins and Python's library import system work. If you need to parallelize
|
||||
OCRmyPDF, use processes.
|
||||
|
||||
.. warning::
|
||||
|
||||
On Windows and macOS, the script that calls ``ocrmypdf.ocr()`` must be
|
||||
protected by an "ifmain" guard (``if __name__ == '__main__'``). If you do
|
||||
not take at least one of these steps, process semantics will prevent
|
||||
OCRmyPDF from working correctly.
|
||||
|
||||
Logging
|
||||
-------
|
||||
|
||||
OCRmyPDF will log under loggers named ``ocrmypdf``. In addition, it
|
||||
imports ``pdfminer`` and ``PIL``, both of which post log messages under
|
||||
those logging namespaces.
|
||||
|
||||
You can configure the logging as desired for your application or call
|
||||
:func:`ocrmypdf.configure_logging` to configure logging the same way
|
||||
OCRmyPDF itself does. The command line parameters such as ``--quiet``
|
||||
and ``--verbose`` have no equivalents in the API; you must use the
|
||||
provided configuration function or do configuration in a way that suits
|
||||
your use case.
|
||||
|
||||
Progress monitoring
|
||||
-------------------
|
||||
|
||||
OCRmyPDF uses the ``tqdm`` package to implement its progress bars.
|
||||
:func:`ocrmypdf.configure_logging` will set up logging output to
|
||||
``sys.stderr`` in a way that is compatible with the display of the
|
||||
progress bar. Use ``ocrmypdf.ocr(...progress_bar=False)`` to disable
|
||||
the progress bar.
|
||||
|
||||
Exceptions
|
||||
----------
|
||||
|
||||
OCRmyPDF may throw standard Python exceptions, ``ocrmypdf.exceptions.*``
|
||||
exceptions, some exceptions related to multiprocessing, and
|
||||
``KeyboardInterrupt``. The parent process should provide an exception
|
||||
handler. OCRmyPDF will clean up its temporary files and worker processes
|
||||
automatically when an exception occurs.
|
||||
|
||||
Programs that call OCRmyPDF should consider trapping KeyboardInterrupt
|
||||
so that they allow OCR to terminate without the whole program terminating.
|
||||
|
||||
When OCRmyPDF succeeds conditionally, it returns an integer exit code.
|
||||
|
||||
Reference
|
||||
---------
|
||||
|
||||
.. autofunction:: ocrmypdf.ocr
|
||||
|
||||
.. autoclass:: ocrmypdf.Verbosity
|
||||
:members:
|
||||
:undoc-members:
|
||||
|
||||
.. autofunction:: ocrmypdf.configure_logging
|
||||
55
docs/apiref.rst
Normal file
55
docs/apiref.rst
Normal file
@@ -0,0 +1,55 @@
|
||||
=============
|
||||
API Reference
|
||||
=============
|
||||
|
||||
This page summarizes the rest of the public API. Generally speaking this
|
||||
should mainly be of interest to plugin developers.
|
||||
|
||||
ocrmypdf
|
||||
========
|
||||
|
||||
.. autoclass:: ocrmypdf.PageContext
|
||||
:members:
|
||||
|
||||
.. autoclass:: ocrmypdf.PdfContext
|
||||
:members:
|
||||
|
||||
ocrmypdf.exceptions
|
||||
===================
|
||||
|
||||
.. automodule:: ocrmypdf.exceptions
|
||||
:members:
|
||||
:undoc-members:
|
||||
|
||||
ocrmypdf.helpers
|
||||
================
|
||||
|
||||
.. automodule:: ocrmypdf.helpers
|
||||
:members:
|
||||
:noindex: deprecated
|
||||
|
||||
.. autodecorator:: deprecated
|
||||
|
||||
ocrmypdf.hocrtransform
|
||||
======================
|
||||
|
||||
.. automodule:: ocrmypdf.hocrtransform
|
||||
:members:
|
||||
|
||||
ocrmypdf.pdfa
|
||||
=============
|
||||
|
||||
.. automodule:: ocrmypdf.pdfa
|
||||
:members:
|
||||
|
||||
ocrmypdf.quality
|
||||
================
|
||||
|
||||
.. automodule:: ocrmypdf.quality
|
||||
:members:
|
||||
|
||||
ocrmypdf.subprocess
|
||||
===================
|
||||
|
||||
.. automodule:: ocrmypdf.subprocess
|
||||
:members:
|
||||
356
docs/batch.rst
356
docs/batch.rst
@@ -1,225 +1,217 @@
|
||||
================
|
||||
Batch processing
|
||||
================
|
||||
|
||||
This article provides information about running OCRmyPDF on multiple files or configuring it as a service triggered by file system events.
|
||||
This article provides information about running OCRmyPDF on multiple
|
||||
files or configuring it as a service triggered by file system events.
|
||||
|
||||
Batch jobs
|
||||
----------
|
||||
==========
|
||||
|
||||
Consider using the excellent `GNU Parallel <https://www.gnu.org/software/parallel/>`_ to apply OCRmyPDF to multiple files at once.
|
||||
Consider using the excellent `GNU
|
||||
Parallel <https://www.gnu.org/software/parallel/>`__ to apply OCRmyPDF
|
||||
to multiple files at once.
|
||||
|
||||
Both ``parallel`` and ``ocrmypdf`` will try to use all available processors. To maximize parallelism without overloading your system with processes, consider using ``parallel -j 2`` to limit parallel to running two jobs at once.
|
||||
Both ``parallel`` and ``ocrmypdf`` will try to use all available
|
||||
processors. To maximize parallelism without overloading your system with
|
||||
processes, consider using ``parallel -j 2`` to limit parallel to running
|
||||
two jobs at once.
|
||||
|
||||
This command will run all ocrmypdf all files named ``*.pdf`` in the current directory and write them to the previous created ``output/`` folder. It will not search subdirectories.
|
||||
This command will run ocrmypdf on all files named ``*.pdf`` in the
|
||||
current directory and write them to the previously created ``output/``
|
||||
folder. It will not search subdirectories.
|
||||
|
||||
The ``--tag`` argument tells parallel to print the filename as a prefix whenever a message is printed, so that one can trace any errors to the file that produced them.
|
||||
The ``--tag`` argument tells parallel to print the filename as a prefix
|
||||
whenever a message is printed, so that one can trace any errors to the
|
||||
file that produced them.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
parallel --tag -j 2 ocrmypdf '{}' 'output/{}' ::: *.pdf
|
||||
parallel --tag -j 2 ocrmypdf '{}' 'output/{}' ::: *.pdf
|
||||
|
||||
OCRmyPDF automatically repairs PDFs before parsing and gathering information from them.
|
||||
OCRmyPDF automatically repairs PDFs before parsing and gathering
|
||||
information from them.
|
||||
|
||||
Directory trees
|
||||
---------------
|
||||
===============
|
||||
|
||||
This will walk through a directory tree and run OCR on all files in place, printing the output in a way that makes
|
||||
This will walk through a directory tree and run OCR on all files in
|
||||
place, printing the output in a way that makes it clear which file is being processed.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
find . -printf '%p' -name '*.pdf' -exec ocrmypdf '{}' '{}' \;
|
||||
|
||||
Alternatively, with a docker container (mounts a volume to the container where the PDFs are stored):
|
||||
find . -printf '%p' -name '*.pdf' -exec ocrmypdf '{}' '{}' \;
|
||||
|
||||
Alternatively, with a docker container (mounts a volume to the container
|
||||
where the PDFs are stored):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
find . -printf '%p' -name '*.pdf' -exec docker run --rm -v <host dir>:<container dir> jbarlow83/ocrmypdf-alpine '<container dir>/{}' '<container dir>/{}' \;
|
||||
find . -printf '%p' -name '*.pdf' -exec docker run --rm -v <host dir>:<container dir> jbarlow83/ocrmypdf '<container dir>/{}' '<container dir>/{}' \;
|
||||
|
||||
This only runs one ``ocrmypdf`` process at a time. This variation uses ``find`` to create a directory list and ``parallel`` to parallelize runs of ``ocrmypdf``, again updating files in place.
|
||||
This only runs one ``ocrmypdf`` process at a time. This variation uses
|
||||
``find`` to create a directory list and ``parallel`` to parallelize runs
|
||||
of ``ocrmypdf``, again updating files in place.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
find . -name '*.pdf' | parallel --tag -j 2 ocrmypdf '{}' '{}'
|
||||
find . -name '*.pdf' | parallel --tag -j 2 ocrmypdf '{}' '{}'
|
||||
|
||||
In a Windows batch file, use
|
||||
|
||||
.. code-block:: bat
|
||||
|
||||
for /r %%f in (*.pdf) do ocrmypdf %%f %%f
|
||||
|
||||
Sample script
|
||||
"""""""""""""
|
||||
-------------
|
||||
|
||||
This user contributed script also provides an example of batch processing.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
#!/usr/bin/env python3
|
||||
# Walk through directory tree, replacing all files with OCR'd version
|
||||
# Contributed by DeliciousPickle@github
|
||||
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
script_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
print(script_dir + '/ocr-tree.py: Start')
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
start_dir = sys.argv[1]
|
||||
else:
|
||||
start_dir = '.'
|
||||
|
||||
if len(sys.argv) > 2:
|
||||
log_file = sys.argv[2]
|
||||
else:
|
||||
log_file = script_dir + '/ocr-tree.log'
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO, format='%(asctime)s %(message)s',
|
||||
filename=log_file, filemode='w')
|
||||
|
||||
for dir_name, subdirs, file_list in os.walk(start_dir):
|
||||
logging.info('\n')
|
||||
logging.info(dir_name + '\n')
|
||||
os.chdir(dir_name)
|
||||
for filename in file_list:
|
||||
file_ext = os.path.splitext(filename)[1]
|
||||
if file_ext == '.pdf':
|
||||
full_path = dir_name + '/' + filename
|
||||
print(full_path)
|
||||
cmd = ["ocrmypdf", "--deskew", filename, filename]
|
||||
logging.info(cmd)
|
||||
proc = subprocess.run(
|
||||
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
||||
result = proc.stdout
|
||||
if proc.returncode == 6:
|
||||
print("Skipped document because it already contained text")
|
||||
elif proc.returncode == 0:
|
||||
print("OCR complete")
|
||||
logging.info(result)
|
||||
|
||||
API
|
||||
"""
|
||||
|
||||
OCRmyPDF is currently supported as a command line interface. This means that even if you are using OCRmyPDF in a Python script, you should run it in a subprocess rather importing the ocrmypdf package.
|
||||
|
||||
The reason for this limitation is that the `ruffus <https://github.com/bunbun/ruffus/>`_ library that OCRmyPDF depends on is unfortunately not reentrant. OCRmyPDF works by defining each operation it does as a ruffus task that takes one or more files as input and generates one or more files as output. As such ruffus is fairly fundamental.
|
||||
|
||||
(If you find individual functions implemented in OCRmyPDF useful (such as ``ocrmypdf.pdfinfo``), you can use these if you wish to.)
|
||||
This user contributed script also provides an example of batch
|
||||
processing.
|
||||
|
||||
.. literalinclude:: ../misc/batch.py
|
||||
:caption: misc/batch.py
|
||||
|
||||
Synology DiskStations
|
||||
"""""""""""""""""""""
|
||||
|
||||
Synology DiskStations (Network Attached Storage devices) can run the Docker image of OCRmyPDF if the Synology `Docker package <https://www.synology.com/en-global/dsm/packages/Docker>`_ is installed. Attached is a script to address particular quirks of using OCRmyPDF on one of these devices.
|
||||
|
||||
This is only possible for x86-based Synology products. Some Synology products use ARM or Power processors and do not support Docker. Further adjustments might be needed to deal with the Synology's relatively limited CPU and RAM.
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
#!/bin/env python3
|
||||
# Contributed by github.com/Enantiomerie
|
||||
|
||||
# script needs 2 arguments
|
||||
# 1. source dir with *.pdf - default is location of script
|
||||
# 2. move dir where *.pdf and *_OCR.pdf are moved to
|
||||
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import shutil
|
||||
|
||||
script_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
timestamp = time.strftime("%Y-%m-%d-%H%M_")
|
||||
log_file = script_dir + '/' + timestamp + 'ocrmypdf.log'
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s', filename=log_file, filemode='w')
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
start_dir = sys.argv[1]
|
||||
else:
|
||||
start_dir = '.'
|
||||
|
||||
for dir_name, subdirs, file_list in os.walk(start_dir):
|
||||
logging.info('\n')
|
||||
logging.info(dir_name + '\n')
|
||||
os.chdir(dir_name)
|
||||
for filename in file_list:
|
||||
file_ext = os.path.splitext(filename)[1]
|
||||
if file_ext == '.pdf':
|
||||
full_path = dir_name + '/' + filename
|
||||
file_noext = os.path.splitext(filename)[0]
|
||||
timestamp_OCR = time.strftime("%Y-%m-%d-%H%M_OCR_")
|
||||
filename_OCR = timestamp_OCR + file_noext + '.pdf'
|
||||
docker_mount = dir_name + ':/home/docker'
|
||||
# create string for pdf processing
|
||||
# diskstation needs a user:group docker:docker. find uid:gid of your diskstation docker:docker with id docker.
|
||||
# use this uid:gid in -u flag
|
||||
# rw rights for docker:docker at source dir are also necessary
|
||||
# the script is processed as root user via chron
|
||||
cmd = ['docker', 'run', '--rm', '-v', docker_mount, '-u=1030:65538', 'jbarlow83/ocrmypdf', , '--deskew' , filename, filename_OCR]
|
||||
logging.info(cmd)
|
||||
proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
||||
result = proc.stdout.read()
|
||||
logging.info(result)
|
||||
full_path_OCR = dir_name + '/' + filename_OCR
|
||||
os.chmod(full_path_OCR, 0o666)
|
||||
os.chmod(full_path, 0o666)
|
||||
full_path_OCR_archive = sys.argv[2]
|
||||
full_path_archive = sys.argv[2] + '/no_ocr'
|
||||
shutil.move(full_path_OCR,full_path_OCR_archive)
|
||||
shutil.move(full_path, full_path_archive)
|
||||
logging.info('Finished.\n')
|
||||
|
||||
Huge batch jobs
|
||||
"""""""""""""""
|
||||
|
||||
If you have thousands of files to work with, contact the author. Consulting work related to OCRmyPDF helps fund this open source project and all inquiries are appreciated.
|
||||
|
||||
Hot (watched) folders
|
||||
---------------------
|
||||
|
||||
To set up a "hot folder" that will trigger OCR for every file inserted, use a program like Python `watchdog <https://pypi.python.org/pypi/watchdog>`_ (supports all major OS).
|
||||
Synology DiskStations (Network Attached Storage devices) can run the
|
||||
Docker image of OCRmyPDF if the Synology `Docker
|
||||
package <https://www.synology.com/en-global/dsm/packages/Docker>`__ is
|
||||
installed. Attached is a script to address particular quirks of using
|
||||
OCRmyPDF on one of these devices.
|
||||
|
||||
One could then configure a scanner to automatically place scanned files in a hot folder, so that they will be queued for OCR and copied to the destination.
|
||||
This is only possible for x86-based Synology products. Some Synology
|
||||
products use ARM or Power processors and do not support Docker. Further
|
||||
adjustments might be needed to deal with the Synology's relatively
|
||||
limited CPU and RAM.
|
||||
|
||||
.. code-block:: bash
|
||||
.. literalinclude:: ../misc/synology.py
|
||||
:caption: misc/synology.py - Sample script for Synology DiskStations
|
||||
|
||||
pip install watchdog
|
||||
|
||||
watchdog installs the command line program ``watchmedo``, which can be told to run ``ocrmypdf`` on any .pdf added to the current directory (``.``) and place the result in the previously created ``out/`` folder.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
cd hot-folder
|
||||
mkdir out
|
||||
watchmedo shell-command \
|
||||
--patterns="*.pdf" \
|
||||
--ignore-directories \
|
||||
--command='ocrmypdf "${watch_src_path}" "out/${watch_src_path}" ' \
|
||||
. # don't forget the final dot
|
||||
|
||||
For more complex behavior you can write a Python script around to use the watchdog API.
|
||||
|
||||
On file servers, you could configure watchmedo as a system service so it will run all the time.
|
||||
|
||||
Caveats
|
||||
"""""""
|
||||
|
||||
* ``watchmedo`` may not work properly on a networked file system, depending on the capabilities of the file system client and server.
|
||||
* This simple recipe does not filter for the type of file system event, so file copies, deletes and moves, and directory operations, will all be sent to ocrmypdf, producing errors in several cases. Disable your watched folder if you are doing anything other than copying files to it.
|
||||
* If the source and destination directory are the same, watchmedo may create an infinite loop.
|
||||
* On BSD, FreeBSD and older versions of macOS, you may need to increase the number of file descriptors to monitor more files, using ``ulimit -n 1024`` to watch a folder of up to 1024 files.
|
||||
|
||||
Alternatives
|
||||
""""""""""""
|
||||
|
||||
* `Watchman <https://facebook.github.io/watchman/>`_ is a more powerful alternative to ``watchmedo``.
|
||||
|
||||
macOS Automator
|
||||
Huge batch jobs
|
||||
---------------
|
||||
|
||||
You can use the Automator app with macOS, to create a Workflow or Quick Action. Use a *Run Shell Script* action in your workflow. In the context of Automator, the ``PATH`` may be set differently your Terminal's ``PATH``; you may need to explicitly set the PATH to include ``ocrmypdf``. The following example may serve as a starting point:
|
||||
If you have thousands of files to work with, contact the author.
|
||||
Consulting work related to OCRmyPDF helps fund this open source project
|
||||
and all inquiries are appreciated.
|
||||
|
||||
.. image:: images/macos-workflow.png
|
||||
:alt: Example macOS Automator script
|
||||
Hot (watched) folders
|
||||
=====================
|
||||
|
||||
Watched folders with watcher.py
|
||||
-------------------------------
|
||||
|
||||
OCRmyPDF has a folder watcher called watcher.py, which is currently included in source
|
||||
distributions but not part of the main program. It may be used natively or may run
|
||||
in a Docker container. Native instances tend to give better performance. watcher.py
|
||||
works on all platforms.
|
||||
|
||||
Users may need to customize the script to meet their requirements.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip3 install -r requirements/watcher.txt
|
||||
|
||||
env OCR_INPUT_DIRECTORY=/mnt/input-pdfs \
|
||||
OCR_OUTPUT_DIRECTORY=/mnt/output-pdfs \
|
||||
OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
|
||||
python3 watcher.py
|
||||
|
||||
.. csv-table:: watcher.py environment variables
|
||||
:header: "Environment variable", "Description"
|
||||
:widths: 50, 50
|
||||
|
||||
"OCR_INPUT_DIRECTORY", "Set input directory to monitor (recursive)"
|
||||
"OCR_OUTPUT_DIRECTORY", "Set output directory (should not be under input)"
|
||||
"OCR_ON_SUCCESS_DELETE", "This will delete the input file if the exit code is 0 (OK)"
|
||||
"OCR_OUTPUT_DIRECTORY_YEAR_MONTH", "This will place files in the output in ``{output}/{year}/{month}/{filename}``"
|
||||
"OCR_DESKEW", "Apply deskew to crooked input PDFs"
|
||||
"OCR_JSON_SETTINGS", "A JSON string specifying any other arguments for ``ocrmypdf.ocr``, e.g. ``'OCR_JSON_SETTINGS={""rotate_pages"": true}'``."
|
||||
"OCR_POLL_NEW_FILE_SECONDS", "Polling interval"
|
||||
"OCR_LOGLEVEL", "Level of log messages to report"
|
||||
|
||||
One could configure a networked scanner or scanning computer to drop files in the
|
||||
watched folder.
|
||||
|
||||
Watched folders with Docker
|
||||
---------------------------
|
||||
|
||||
The watcher service is included in the OCRmyPDF Docker image. To run it:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker run \
|
||||
-v <path to files to convert>:/input \
|
||||
-v <path to store results>:/output \
|
||||
-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
|
||||
-e OCR_ON_SUCCESS_DELETE=1 \
|
||||
-e OCR_DESKEW=1 \
|
||||
-e PYTHONUNBUFFERED=1 \
|
||||
-it --entrypoint python3 \
|
||||
jbarlow83/ocrmypdf \
|
||||
watcher.py
|
||||
|
||||
This service will watch for a file that matches ``/input/\*.pdf`` and will
|
||||
convert it to an OCRed PDF in ``/output/``. The parameters to this image are:
|
||||
|
||||
.. csv-table:: watcher.py parameters for Docker
|
||||
:header: "Parameter", "Description"
|
||||
:widths: 50, 50
|
||||
|
||||
"``-v <path to files to convert>:/input``", "Files placed in this location will be OCRed"
|
||||
"``-v <path to store results>:/output``", "This is where OCRed files will be stored"
|
||||
"``-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1``", "Define environment variable OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1"
|
||||
"``-e OCR_ON_SUCCESS_DELETE=1``", "Define environment variable"
|
||||
"``-e OCR_DESKEW=1``", "Define environment variable"
|
||||
"``-e PYTHONUNBUFFERED=1``", "This will force STDOUT to be unbuffered and allow you to see messages in docker logs"
|
||||
|
||||
This service relies on polling to check for changes to the filesystem. It
|
||||
may not be suitable for some environments, such as filesystems shared on a
|
||||
slow network.
|
||||
|
||||
A configuration manager such as Docker Compose could be used to ensure that the
|
||||
service is always available.
|
||||
|
||||
.. literalinclude:: ../misc/docker-compose.example.yml
|
||||
:language: yaml
|
||||
:caption: misc/docker-compose.example.yml
|
||||
|
||||
Caveats
|
||||
-------
|
||||
|
||||
- ``watchmedo`` may not work properly on a networked file system,
|
||||
depending on the capabilities of the file system client and server.
|
||||
- This simple recipe does not filter for the type of file system event,
|
||||
so file copies, deletes and moves, and directory operations, will all
|
||||
be sent to ocrmypdf, producing errors in several cases. Disable your
|
||||
watched folder if you are doing anything other than copying files to
|
||||
it.
|
||||
- If the source and destination directory are the same, watchmedo may
|
||||
create an infinite loop.
|
||||
- On BSD, FreeBSD and older versions of macOS, you may need to increase
|
||||
the number of file descriptors to monitor more files, using
|
||||
``ulimit -n 1024`` to watch a folder of up to 1024 files.
|
||||
|
||||
Alternatives
|
||||
------------
|
||||
|
||||
- On Linux, `systemd user services <https://wiki.archlinux.org/index.php/Systemd/User>`__
|
||||
can be configured to automatically perform OCR on a collection of files.
|
||||
|
||||
- `Watchman <https://facebook.github.io/watchman/>`__ is a more
|
||||
powerful alternative to ``watchmedo``.
|
||||
|
||||
macOS Automator
|
||||
===============
|
||||
|
||||
You can use the Automator app with macOS, to create a Workflow or Quick
|
||||
Action. Use a *Run Shell Script* action in your workflow. In the context
|
||||
of Automator, the ``PATH`` may be set differently from your Terminal's
|
||||
``PATH``; you may need to explicitly set the PATH to include
|
||||
``ocrmypdf``. The following example may serve as a starting point:
|
||||
|
||||
.. figure:: images/macos-workflow.png
|
||||
:alt: Example macOS Automator workflow
|
||||
|
||||
You may customize the command sent to ocrmypdf.
|
||||
|
||||
14
docs/conf.py
14
docs/conf.py
@@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# ocrmypdf documentation build configuration file, created by
|
||||
# sphinx-quickstart on Sun Sep 4 14:29:43 2016.
|
||||
@@ -21,6 +20,8 @@
|
||||
# import sys
|
||||
# sys.path.insert(0, os.path.abspath('.'))
|
||||
|
||||
"""isort:skip_file"""
|
||||
|
||||
# -- General configuration ------------------------------------------------
|
||||
|
||||
# If your documentation needs a minimal Sphinx version, state it here.
|
||||
@@ -30,9 +31,9 @@
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
# 'sphinx.ext.mathjax',
|
||||
]
|
||||
extensions = ['sphinx.ext.napoleon']
|
||||
|
||||
napoleon_use_rtype = False
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
@@ -53,7 +54,7 @@ master_doc = 'index'
|
||||
# General information about the project.
|
||||
project = 'ocrmypdf'
|
||||
copyright = (
|
||||
'2019, James R. Barlow. Licensed under Creative Commons Attribution-ShareAlike 4.0.'
|
||||
'2020, James R. Barlow. Licensed under Creative Commons Attribution-ShareAlike 4.0.'
|
||||
)
|
||||
author = 'James R. Barlow'
|
||||
|
||||
@@ -92,6 +93,7 @@ from pkg_resources import get_distribution, DistributionNotFound
|
||||
release = get_distribution('ocrmypdf').version
|
||||
version = '.'.join(release.split('.')[:2])
|
||||
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
#
|
||||
@@ -176,7 +178,7 @@ html_theme_options = {'display_version': False}
|
||||
# The name of an image file (relative to this directory) to place at the top
|
||||
# of the sidebar.
|
||||
#
|
||||
# html_logo = None
|
||||
# html_logo = "images/logo.svg" # looks bad
|
||||
|
||||
# The name of an image file (relative to this directory) to use as a favicon of
|
||||
# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
||||
|
||||
67
docs/contributing.rst
Normal file
67
docs/contributing.rst
Normal file
@@ -0,0 +1,67 @@
|
||||
=======================
|
||||
Contributing guidelines
|
||||
=======================
|
||||
|
||||
Contributions are welcome!
|
||||
|
||||
Big changes
|
||||
===========
|
||||
|
||||
Please open a new issue to discuss or propose a major change. Not only is it fun
|
||||
to discuss big ideas, but we might save each other's time too. Perhaps some of the
|
||||
work you're contemplating is already half-done in a development branch.
|
||||
|
||||
Code style
|
||||
==========
|
||||
|
||||
We use PEP8, ``black`` for code formatting and ``isort`` for import sorting. The
|
||||
settings for these programs are in ``pyproject.toml`` and ``setup.cfg``. Pull
|
||||
requests should follow the style guide. One difference we use from "black" style
|
||||
is that strings shown to the user are always in double quotes (``"``) and strings
|
||||
for internal uses are in single quotes (``'``).
|
||||
|
||||
Tests
|
||||
=====
|
||||
|
||||
New features should come with tests that confirm their correctness.
|
||||
|
||||
New Python dependencies
|
||||
=======================
|
||||
|
||||
If you are proposing a change that will require a new Python dependency, we
|
||||
prefer dependencies that are already packaged by Debian or Red Hat. This makes
|
||||
life much easier for our downstream package maintainers.
|
||||
|
||||
Python dependencies must also be license-compatible. GPLv3 or AGPLv3 are likely
|
||||
incompatible with the project's license, but LGPLv3 is compatible.
|
||||
|
||||
New non-Python dependencies
|
||||
===========================
|
||||
|
||||
OCRmyPDF uses several external programs (Tesseract, Ghostscript and others) for
|
||||
its functionality. In general we prefer to avoid adding new external programs.
|
||||
|
||||
Style guide: Is it OCRmyPDF or ocrmypdf?
|
||||
========================================
|
||||
|
||||
The program/project is OCRmyPDF and the name of the executable or library is ocrmypdf.
|
||||
|
||||
Known ports/packagers
|
||||
=====================
|
||||
|
||||
OCRmyPDF has been ported to many platforms already. If you are interested in
|
||||
porting to a new platform, check with
|
||||
`Repology <https://repology.org/projects/?search=ocrmypdf>`__ to see the status
|
||||
of that platform.
|
||||
|
||||
Package maintainers, please ensure that the command line completion scripts in
|
||||
``misc/`` are installed.
|
||||
|
||||
Copyright and license
|
||||
=====================
|
||||
|
||||
For contributions over 10 lines of code, please add your name to the list of
|
||||
copyright holders for that file. The core program is licensed under MPL-2.0,
|
||||
test files and documentation under CC-BY-SA 4.0, and miscellaneous files under
|
||||
MIT. Please contribute only code that you wrote and that you have permission to
|
||||
contribute or license to us.
|
||||
@@ -1,11 +1,12 @@
|
||||
========
|
||||
Cookbook
|
||||
========
|
||||
|
||||
Basic examples
|
||||
--------------
|
||||
==============
|
||||
|
||||
Help!
|
||||
^^^^^
|
||||
-----
|
||||
|
||||
ocrmypdf has built-in help.
|
||||
|
||||
@@ -13,30 +14,29 @@ ocrmypdf has built-in help.
|
||||
|
||||
ocrmypdf --help
|
||||
|
||||
|
||||
Add an OCR layer and convert to PDF/A
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
-------------------------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf input.pdf output.pdf
|
||||
|
||||
Add an OCR layer and output a standard PDF
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
------------------------------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --output-type pdf input.pdf output.pdf
|
||||
|
||||
Create a PDF/A with all color and grayscale images converted to JPEG
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
--------------------------------------------------------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --output-type pdfa --pdfa-image-compression jpeg input.pdf output.pdf
|
||||
|
||||
Modify a file in place
|
||||
^^^^^^^^^^^^^^^^^^^^^^
|
||||
----------------------
|
||||
|
||||
The file will only be overwritten if OCRmyPDF is successful.
|
||||
|
||||
@@ -45,48 +45,76 @@ The file will only be overwritten if OCRmyPDF is successful.
|
||||
ocrmypdf myfile.pdf myfile.pdf
|
||||
|
||||
Correct page rotation
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
---------------------
|
||||
|
||||
OCR will attempt to automatic correct the rotation of each page. This can help fix a scanning job that contains a mix of landscape and portrait pages.
|
||||
OCR will attempt to automatically correct the rotation of each page. This
|
||||
can help fix a scanning job that contains a mix of landscape and
|
||||
portrait pages.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --rotate-pages myfile.pdf myfile.pdf
|
||||
|
||||
You can increase (decrease) the parameter ``--rotate-pages-threshold`` to make page rotation more (less) aggressive.
|
||||
You can increase (decrease) the parameter ``--rotate-pages-threshold``
|
||||
to make page rotation more (less) aggressive. The threshold number is the ratio
|
||||
of how confident the OCR engine is that the document image should be changed,
|
||||
compared to being kept the same. The default value is quite conservative; on some files
|
||||
it may not attempt rotations at all unless it is very confident that the current
|
||||
rotation is wrong. A lower value of ``2.0`` will produce more rotations, and
|
||||
more false positives. Run with ``-v1`` to see the confidence level for each
|
||||
page to see if there may be a better value for your files.
|
||||
|
||||
If the page is "just a little off horizontal", like a crooked picture, then you want ``--deskew``. ``--rotate-pages`` is for when the cardinal angle is wrong.
|
||||
If the page is "just a little off horizontal", like a crooked picture,
|
||||
then you want ``--deskew``. ``--rotate-pages`` is for when the cardinal
|
||||
angle is wrong.
|
||||
|
||||
OCR languages other than English
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
--------------------------------
|
||||
|
||||
OCRmyPDF assumes the document is in English unless told otherwise. OCR quality may be poor if the wrong language is used.
|
||||
OCRmyPDF assumes the document is in English unless told otherwise. OCR
|
||||
quality may be poor if the wrong language is used.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf -l fra LeParisien.pdf LeParisien.pdf
|
||||
ocrmypdf -l eng+fra Bilingual-English-French.pdf Bilingual-English-French.pdf
|
||||
|
||||
Language packs must be installed for all languages specified. See :ref:`Installing additional language packs <lang-packs>`.
|
||||
Language packs must be installed for all languages specified. See
|
||||
:ref:`Installing additional language packs <lang-packs>`.
|
||||
|
||||
Unfortunately, the Tesseract OCR engine has no ability to detect the language when it is unknown.
|
||||
Unfortunately, the Tesseract OCR engine has no ability to detect the
|
||||
language when it is unknown.
|
||||
|
||||
Produce PDF and text file containing OCR text
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
---------------------------------------------
|
||||
|
||||
This produces a file named "output.pdf" and a companion text file named "output.txt".
|
||||
This produces a file named "output.pdf" and a companion text file named
|
||||
"output.txt".
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --sidecar output.txt input.pdf output.pdf
|
||||
|
||||
.. note::
|
||||
|
||||
The sidecar file contains the **OCR text** found by OCRmyPDF. If the document
|
||||
contains pages that already have text, that text will not appear in the
|
||||
sidecar. If the option ``--pages`` is used, only those pages on which OCR
|
||||
was performed will be included in the sidecar. If certain pages were skipped
|
||||
because of options like ``--skip-big`` or ``--tesseract-timeout``, those pages
|
||||
will not be in the sidecar.
|
||||
|
||||
To extract all text from a PDF, whether generated from OCR or otherwise,
|
||||
use a program like Poppler's ``pdftotext`` or ``pdfgrep``.
|
||||
|
||||
OCR images, not PDFs
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
--------------------
|
||||
|
||||
Option: use Tesseract
|
||||
"""""""""""""""""""""
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you are starting with images, you can just use Tesseract directly to convert images to PDFs:
|
||||
If you are starting with images, you can just use Tesseract directly to
|
||||
convert images to PDFs:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -97,62 +125,88 @@ If you are starting with images, you can just use Tesseract directly to convert
|
||||
# When there are multiple images
|
||||
tesseract text-file-containing-list-of-image-filenames.txt output-prefix pdf
|
||||
|
||||
Tesseract's PDF output is quite good – OCRmyPDF uses it internally, in some cases. However, OCRmyPDF has many features not available in Tesseract like image processing, metadata control, and PDF/A generation.
|
||||
Tesseract's PDF output is quite good – OCRmyPDF uses it internally, in
|
||||
some cases. However, OCRmyPDF has many features not available in
|
||||
Tesseract like image processing, metadata control, and PDF/A generation.
|
||||
|
||||
Option: use img2pdf
|
||||
"""""""""""""""""""
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
You can also use a program like `img2pdf <https://gitlab.mister-muffin.de/josch/img2pdf>`_ to convert your images to PDFs, and then pipe the results to run ocrmypdf. The ``-`` tells ocrmypdf to read standard input.
|
||||
You can also use a program like
|
||||
`img2pdf <https://gitlab.mister-muffin.de/josch/img2pdf>`__ to convert
|
||||
your images to PDFs, and then pipe the results to run ocrmypdf. The
|
||||
``-`` tells ocrmypdf to read standard input.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
img2pdf my-images*.jpg | ocrmypdf - myfile.pdf
|
||||
|
||||
``img2pdf`` is recommended because it does an excellent job at generating PDFs without transcoding images.
|
||||
``img2pdf`` is recommended because it does an excellent job at
|
||||
generating PDFs without transcoding images.
|
||||
|
||||
Option: use OCRmyPDF (single images only)
|
||||
"""""""""""""""""""""""""""""""""""""""""
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
For convenience, OCRmyPDF can also convert single images to PDFs on its own. If the resolution (dots per inch, DPI) of an image is not set or is incorrect, it can be overridden with ``--image-dpi``. (As 1 inch is 2.54 cm, 1 dpi = 0.39 dpcm).
|
||||
For convenience, OCRmyPDF can also convert single images to PDFs on its
|
||||
own. If the resolution (dots per inch, DPI) of an image is not set or is
|
||||
incorrect, it can be overridden with ``--image-dpi``. (As 1 inch is 2.54
|
||||
cm, 1 dpi = 0.39 dpcm).
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --image-dpi 300 image.png myfile.pdf
|
||||
|
||||
If you have multiple images, you must use ``img2pdf`` to convert the images to PDF.
|
||||
If you have multiple images, you must use ``img2pdf`` to convert the
|
||||
images to PDF.
|
||||
|
||||
Not recommended
|
||||
"""""""""""""""
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
We caution against using ImageMagick or Ghostscript to convert images to PDF, since they may transcode images or produce downsampled images, sometimes without warning.
|
||||
We caution against using ImageMagick or Ghostscript to convert images to
|
||||
PDF, since they may transcode images or produce downsampled images,
|
||||
sometimes without warning.
|
||||
|
||||
Image processing
|
||||
----------------
|
||||
================
|
||||
|
||||
OCRmyPDF perform some image processing on each page of a PDF, if desired. The same processing is applied to each page. It is suggested that the user review files after image processing as these commands might remove desirable content, especially from poor quality scans.
|
||||
OCRmyPDF can perform some image processing on each page of a PDF, if
|
||||
desired. The same processing is applied to each page. It is suggested
|
||||
that the user review files after image processing as these commands
|
||||
might remove desirable content, especially from poor quality scans.
|
||||
|
||||
* ``--rotate-pages`` attempts to determine the correct orientation for each page and rotates the page if necessary.
|
||||
|
||||
* ``--remove-background`` attempts to detect and remove a noisy background from grayscale or color images. Monochrome images are ignored. This should not be used on documents that contain color photos as it may remove them.
|
||||
|
||||
* ``--deskew`` will correct pages were scanned at a skewed angle by rotating them back into place. Skew determination and correction is performed using `Postl's variance of line sums <http://www.leptonica.com/skew-measurement.html>`_ algorithm as implemented in `Leptonica <http://www.leptonica.com/index.html>`_.
|
||||
|
||||
* ``--clean`` uses `unpaper <https://www.flameeyes.eu/projects/unpaper>`_ to clean up pages before OCR, but does not alter the final output. This makes it less likely that OCR will try to find text in background noise.
|
||||
|
||||
* ``--clean-final`` uses unpaper to clean up pages before OCR and inserts the page into the final output. You will want to review each page to ensure that unpaper did not remove something important.
|
||||
|
||||
* ``--mask-barcodes`` will suppress any barcodes detected in a page image. Barcodes are known to confuse Tesseract OCR and interfere with the recognition of text on the same baseline as a barcode. The output file will contain the unaltered image of the barcode.
|
||||
- ``--rotate-pages`` attempts to determine the correct orientation for
|
||||
each page and rotates the page if necessary.
|
||||
- ``--remove-background`` attempts to detect and remove a noisy
|
||||
background from grayscale or color images. Monochrome images are
|
||||
ignored. This should not be used on documents that contain color
|
||||
photos as it may remove them.
|
||||
- ``--deskew`` will correct pages that were scanned at a skewed angle by
|
||||
rotating them back into place. Skew determination and correction is
|
||||
performed using `Postl's variance of line
|
||||
sums <http://www.leptonica.org/skew-measurement.html>`__ algorithm as
|
||||
implemented in `Leptonica <http://www.leptonica.org/index.html>`__.
|
||||
- ``--clean`` uses
|
||||
`unpaper <https://www.flameeyes.eu/projects/unpaper>`__ to clean up
|
||||
pages before OCR, but does not alter the final output. This makes it
|
||||
less likely that OCR will try to find text in background noise.
|
||||
- ``--clean-final`` uses unpaper to clean up pages before OCR and
|
||||
inserts the page into the final output. You will want to review each
|
||||
page to ensure that unpaper did not remove something important.
|
||||
|
||||
.. note::
|
||||
|
||||
In many cases image processing will rasterize PDF pages as images, potentially losing quality.
|
||||
In many cases image processing will rasterize PDF pages as images,
|
||||
potentially losing quality.
|
||||
|
||||
.. warning::
|
||||
|
||||
``--clean-final`` and ``-remove-background`` may leave undesirable visual artifacts in some images where their algorithms have shortcomings. Files should be visually reviewed after using these options.
|
||||
``--clean-final`` and ``--remove-background`` may leave undesirable
|
||||
visual artifacts in some images where their algorithms have
|
||||
shortcomings. Files should be visually reviewed after using these
|
||||
options.
|
||||
|
||||
Example: OCR and correct document skew (crooked scan)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
-----------------------------------------------------
|
||||
|
||||
Deskew:
|
||||
|
||||
@@ -160,55 +214,118 @@ Deskew:
|
||||
|
||||
ocrmypdf --deskew input.pdf output.pdf
|
||||
|
||||
Image processing commands can be combined. The order in which options are given does not matter. OCRmyPDF always applies the steps of the image processing pipeline in the same order (rotate, remove background, deskew, clean).
|
||||
Image processing commands can be combined. The order in which options
|
||||
are given does not matter. OCRmyPDF always applies the steps of the
|
||||
image processing pipeline in the same order (rotate, remove background,
|
||||
deskew, clean).
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --deskew --clean --rotate-pages input.pdf output.pdf
|
||||
|
||||
|
||||
Don't actually OCR my PDF
|
||||
-------------------------
|
||||
=========================
|
||||
|
||||
If you set ``--tesseract-timeout 0`` OCRmyPDF will apply its image processing without performing OCR, if all you want to is to apply image processing or PDF/A conversion.
|
||||
If you set ``--tesseract-timeout 0`` OCRmyPDF will apply its image
|
||||
processing without performing OCR, if all you want to do is apply image
|
||||
processing or PDF/A conversion.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --tesseract-timeout=0 --remove-background input.pdf output.pdf
|
||||
|
||||
Optimize images without performing OCR
|
||||
--------------------------------------
|
||||
|
||||
You can also optimize all images without performing any OCR:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --tesseract-timeout=0 --optimize 3 --skip-text input.pdf output.pdf
|
||||
|
||||
Perform OCR on certain pages
|
||||
------------------------------
|
||||
|
||||
You can ask OCRmyPDF to only apply OCR to certain pages.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --pages 2,3,13-17 input.pdf output.pdf
|
||||
|
||||
Hyphens denote a range of pages and commas separate page numbers. If you prefer
|
||||
to use spaces, quote all of the page numbers: ``--pages '2, 3, 5, 7'``.
|
||||
|
||||
OCRmyPDF will warn if your list of page numbers contains duplicates or
|
||||
overlapping pages. OCRmyPDF does not currently account for document page numbers,
|
||||
such as an introduction section of a book that uses Roman numerals. It simply
|
||||
counts the number of virtual pieces of paper since the start.
|
||||
|
||||
Regardless of the argument to ``--pages``, OCRmyPDF will optimize all pages in
|
||||
the file and convert it to PDF/A, unless you disable those options. In this
|
||||
example, we want to OCR only the title and otherwise change the PDF as little
|
||||
as possible:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --pages 1 --output-type pdf --optimize 0 input.pdf output.pdf
|
||||
|
||||
Redo existing OCR
|
||||
-----------------
|
||||
=================
|
||||
|
||||
To redo OCR on a file OCRed with other OCR software or a previous version of OCRmyPDF and/or Tesseract, you may use the ``--redo-ocr`` argument. (Normally, OCRmyPDF will exit with an error if asked to modify a file with OCR.)
|
||||
To redo OCR on a file OCRed with other OCR software or a previous
|
||||
version of OCRmyPDF and/or Tesseract, you may use the ``--redo-ocr``
|
||||
argument. (Normally, OCRmyPDF will exit with an error if asked to modify
|
||||
a file with OCR.)
|
||||
|
||||
This may be helpful for users who want to take advantage of accuracy improvements in Tesseract 4.0 for files they previously OCRed with an earlier version of Tesseract and OCRmyPDF.
|
||||
This may be helpful for users who want to take advantage of accuracy
|
||||
improvements in Tesseract 4.0 for files they previously OCRed with an
|
||||
earlier version of Tesseract and OCRmyPDF.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --redo-ocr input.pdf output.pdf
|
||||
|
||||
This method will replace OCR without rasterizing, reducing quality or removing vector content. If a file contains a mix of pure digital text and OCR, digital text will be ignored and OCR will be replaced. As such this mode is incompatible with image processing options, since they alter the appearance of the file.
|
||||
This method will replace OCR without rasterizing, reducing quality or
|
||||
removing vector content. If a file contains a mix of pure digital text
|
||||
and OCR, digital text will be ignored and OCR will be replaced. As such
|
||||
this mode is incompatible with image processing options, since they
|
||||
alter the appearance of the file.
|
||||
|
||||
In some cases, existing OCR cannot be detected or replaced. Files produced by OCRmyPDF v2.2 or earlier, for example, are internally represented as having visible text with an opaque image drawn on top. This situation cannot be detected.
|
||||
In some cases, existing OCR cannot be detected or replaced. Files
|
||||
produced by OCRmyPDF v2.2 or earlier, for example, are internally
|
||||
represented as having visible text with an opaque image drawn on top.
|
||||
This situation cannot be detected.
|
||||
|
||||
If ``--redo-ocr`` does not work, you can use ``--force-ocr``, which will force rasterization of all pages, potentially reducing quality or losing vector content.
|
||||
If ``--redo-ocr`` does not work, you can use ``--force-ocr``, which will
|
||||
force rasterization of all pages, potentially reducing quality or losing
|
||||
vector content.
|
||||
|
||||
Improving OCR quality
|
||||
---------------------
|
||||
=====================
|
||||
|
||||
The `Image processing`_ features can improve OCR quality.
|
||||
The `Image processing <#image-processing>`__ features can improve OCR
|
||||
quality.
|
||||
|
||||
Rotating pages and deskewing helps to ensure that the page orientation is correct before OCR begins. Removing the background and/or cleaning the page can also improve results. The ``--oversample DPI`` argument can be specified to resample images to higher resolution before attempting OCR; this can improve results as well.
|
||||
Rotating pages and deskewing helps to ensure that the page orientation
|
||||
is correct before OCR begins. Removing the background and/or cleaning
|
||||
the page can also improve results. The ``--oversample DPI`` argument can
|
||||
be specified to resample images to higher resolution before attempting
|
||||
OCR; this can improve results as well.
|
||||
|
||||
OCR quality will suffer if the resolution of input images is not correct (since the range of pixel sizes that will be checked for possible fonts will also be incorrect).
|
||||
OCR quality will suffer if the resolution of input images is not correct
|
||||
(since the range of pixel sizes that will be checked for possible fonts
|
||||
will also be incorrect).
|
||||
|
||||
PDF optimization
|
||||
----------------
|
||||
================
|
||||
|
||||
By default OCRmyPDF will attempt to perform lossless optimizations on the images inside PDFs after OCR is complete. Optimization is performed even if no OCR text is found.
|
||||
By default OCRmyPDF will attempt to perform lossless optimizations on
|
||||
the images inside PDFs after OCR is complete. Optimization is performed
|
||||
even if no OCR text is found.
|
||||
|
||||
The ``--optimize N`` (short form ``-O``) argument controls optimization, where ``N`` ranges from 0 to 3 inclusive, analogous to the optimization levels in the GCC compiler.
|
||||
The ``--optimize N`` (short form ``-O``) argument controls optimization,
|
||||
where ``N`` ranges from 0 to 3 inclusive, analogous to the optimization
|
||||
levels in the GCC compiler.
|
||||
|
||||
.. list-table::
|
||||
:widths: auto
|
||||
@@ -227,9 +344,15 @@ The ``--optimize N`` (short form ``-O``) argument controls optimization, where `
|
||||
* - ``--optimize 3``
|
||||
- All of the above, and enables more aggressive optimizations and targets lower image quality.
|
||||
|
||||
Optimization is improved when a JBIG2 encoder is available and when ``pngquant`` is installed. If either of these components are missing, then some types of images cannot be optimized.
|
||||
Optimization is improved when a JBIG2 encoder is available and when
|
||||
``pngquant`` is installed. If either of these components is missing,
|
||||
then some types of images cannot be optimized.
|
||||
|
||||
The types of optimization available may expand over time. By default, OCRmyPDF compresses data streams inside PDFs, and will change inefficient compression modes to more modern versions. A program like ``qpdf`` can be used to change encodings, e.g. to inspect the internals fo a PDF.
|
||||
The types of optimization available may expand over time. By default,
|
||||
OCRmyPDF compresses data streams inside PDFs, and will change
|
||||
inefficient compression modes to more modern versions. A program like
|
||||
``qpdf`` can be used to change encodings, e.g. to inspect the internals
|
||||
of a PDF.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
|
||||
219
docs/docker.rst
219
docs/docker.rst
@@ -1,155 +1,196 @@
|
||||
.. _docker:
|
||||
|
||||
=====================
|
||||
OCRmyPDF Docker image
|
||||
=====================
|
||||
|
||||
OCRmyPDF is also available in a Docker image that packages recent versions of all dependencies.
|
||||
OCRmyPDF is also available in a Docker image that packages recent
|
||||
versions of all dependencies.
|
||||
|
||||
For users who already have Docker installed this may be an easy and convenient option. However, it is less performant than a system installation and may require Docker engine configuration.
|
||||
For users who already have Docker installed this may be an easy and
|
||||
convenient option. However, it is less performant than a system
|
||||
installation and may require Docker engine configuration.
|
||||
|
||||
OCRmyPDF needs a generous amount of RAM, CPU cores, and temporary storage space.
|
||||
OCRmyPDF needs a generous amount of RAM, CPU cores, and temporary storage
|
||||
space, whether running in a Docker container or on its own. It may be
|
||||
necessary to ensure the container is provisioned with additional
|
||||
resources.
|
||||
|
||||
.. _docker-install:
|
||||
|
||||
Installing the Docker image
|
||||
---------------------------
|
||||
===========================
|
||||
|
||||
If you have `Docker <https://docs.docker.com/>`_ installed on your system, you can install a Docker image of the latest release.
|
||||
If you have `Docker <https://docs.docker.com/>`__ installed on your
|
||||
system, you can install a Docker image of the latest release.
|
||||
|
||||
The recommended OCRmyPDF Docker image is currently named ``ocrmypdf-alpine``:
|
||||
If you can run this command successfully, your system is ready to download and
|
||||
execute the image:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker pull jbarlow83/ocrmypdf-alpine
|
||||
docker run hello-world
|
||||
|
||||
Follow the Docker installation instructions for your platform. If you can run this command successfully, your system is ready to download and execute the image:
|
||||
The recommended OCRmyPDF Docker image is currently named ``ocrmypdf``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker run hello-world
|
||||
docker pull jbarlow83/ocrmypdf
|
||||
|
||||
OCRmyPDF will use all available CPU cores. By default, the VirtualBox machine instance on Windows and macOS has only a single CPU core enabled. Use the VirtualBox Manager to determine the name of your Docker engine host, and then follow these optional steps to enable multiple CPUs:
|
||||
|
||||
OCRmyPDF will use all available CPU cores. By default, the VirtualBox
|
||||
machine instance on Windows and macOS has only a single CPU core
|
||||
enabled. Use the VirtualBox Manager to determine the name of your Docker
|
||||
engine host, and then follow these optional steps to enable multiple
|
||||
CPUs:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Optional step for Mac OS X users
|
||||
docker-machine stop "yourVM"
|
||||
VBoxManage modifyvm "yourVM" --cpus 2 # or whatever number of core is desired
|
||||
docker-machine start "yourVM"
|
||||
eval $(docker-machine env "yourVM")
|
||||
# Optional step for Mac OS X users
|
||||
docker-machine stop "yourVM"
|
||||
VBoxManage modifyvm "yourVM" --cpus 2 # or whatever number of cores is desired
|
||||
docker-machine start "yourVM"
|
||||
eval $(docker-machine env "yourVM")
|
||||
|
||||
See the Docker documentation for
|
||||
`adjusting memory and CPU on other platforms <https://docs.docker.com/config/containers/resource_constraints/>`__.
|
||||
|
||||
Using the Docker image on the command line
|
||||
------------------------------------------
|
||||
==========================================
|
||||
|
||||
**Unlike typical Docker containers**, in this mode we are using the OCRmyPDF Docker container is intended to be emphemeral – it runs for one OCR job and then terminates, just like a command line program. We are using Docker as a way of delivering an application, not a server.
|
||||
**Unlike typical Docker containers**, in this section the OCRmyPDF Docker
|
||||
container is ephemeral – it runs for one OCR job and terminates, just like a
|
||||
command line program. We are using Docker to deliver an application (as opposed
|
||||
to the more conventional case, where a Docker container runs as a server).
|
||||
|
||||
To start a Docker container (instance of the image):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker tag jbarlow83/ocrmypdf-alpine ocrmypdf
|
||||
docker run --rm ocrmypdf (... all other arguments here...)
|
||||
docker tag jbarlow83/ocrmypdf ocrmypdf
|
||||
docker run --rm -i ocrmypdf (... all other arguments here...) - -
|
||||
|
||||
For convenience, create a shell alias to hide the Docker command:
|
||||
For convenience, create a shell alias to hide the Docker command. It is
|
||||
easier to send the input file as stdin and read the output from
|
||||
stdout – **this avoids the messy permission issues with Docker entirely**.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
alias ocrmypdf='docker run --rm -v "$(pwd):/home/docker" ocrmypdf'
|
||||
ocrmypdf --version # runs docker version
|
||||
alias docker_ocrmypdf='docker run --rm -i ocrmypdf'
|
||||
docker_ocrmypdf --version # runs docker version
|
||||
docker_ocrmypdf - - <input.pdf >output.pdf
|
||||
|
||||
Or in the wonderful `fish shell <https://fishshell.com/>`_:
|
||||
Or in the wonderful `fish shell <https://fishshell.com/>`__:
|
||||
|
||||
.. code-block:: fish
|
||||
|
||||
alias ocrmypdf 'docker run --rm ocrmypdf'
|
||||
funcsave ocrmypdf
|
||||
alias docker_ocrmypdf 'docker run --rm ocrmypdf'
|
||||
funcsave docker_ocrmypdf
|
||||
|
||||
Alternately, you could mount the local current working directory as a
|
||||
Docker volume:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
alias docker_ocrmypdf='docker run --rm -i --user "$(id -u):$(id -g)" --workdir /data -v "$PWD:/data" ocrmypdf'
|
||||
docker_ocrmypdf /data/input.pdf /data/output.pdf
|
||||
|
||||
.. _docker-lang-packs:
|
||||
|
||||
Adding languages to the Docker image
|
||||
------------------------------------
|
||||
====================================
|
||||
|
||||
By default the Docker image includes English, German and Simplified Chinese, the most popular languages for OCRmyPDF users based on feedback. You may add other languages by creating a new Dockerfile based on the public one:
|
||||
By default the Docker image includes English, German, Simplified Chinese,
|
||||
French, Portuguese and Spanish, the most popular languages for OCRmyPDF
|
||||
users based on feedback. You may add other languages by creating a new
|
||||
Dockerfile based on the public one.
|
||||
|
||||
.. code-block:: dockerfile
|
||||
|
||||
FROM jbarlow83/ocrmypdf-alpine
|
||||
FROM jbarlow83/ocrmypdf
|
||||
|
||||
# Add French
|
||||
RUN apk add tesseract-ocr-data-fra
|
||||
# Example: add Italian
|
||||
RUN apt install tesseract-ocr-ita
|
||||
|
||||
To install language packs (training data) such as the
|
||||
`tessdata_best <https://github.com/tesseract-ocr/tessdata_best>`_ suite or
|
||||
custom data, you first need to determine the version of Tesseract data files, which
|
||||
may differ from the Tesseract program version. Use this command to determine the data
|
||||
file version:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker run -i --rm --entrypoint /bin/ls jbarlow83/ocrmypdf /usr/share/tesseract-ocr
|
||||
|
||||
As of 2021, the data file version is probably ``4.00``.
|
||||
|
||||
You can then add new data with either a Dockerfile:
|
||||
|
||||
.. code-block:: dockerfile
|
||||
|
||||
FROM jbarlow83/ocrmypdf
|
||||
|
||||
# Example: add a tessdata_best file
|
||||
COPY chi_tra_vert.traineddata /usr/share/tesseract-ocr/<data version>/tessdata/
|
||||
|
||||
Alternately, you can copy training data into a Docker container as follows:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker cp mycustomtraining.traineddata name_of_container:/usr/share/tesseract-ocr/<data version>/tessdata/
|
||||
|
||||
Executing the test suite
|
||||
------------------------
|
||||
========================
|
||||
|
||||
The OCRmyPDF test suite is installed with image. To run it:
|
||||
The OCRmyPDF test suite is installed with the image. To run it:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker run --entrypoint python3 jbarlow83/ocrmypdf-alpine setup.py test
|
||||
docker run --entrypoint python3 jbarlow83/ocrmypdf -m pytest
|
||||
|
||||
Accessing the shell
|
||||
===================
|
||||
|
||||
To use the bash shell in the Docker image:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker run -it --entrypoint bash jbarlow83/ocrmypdf
|
||||
|
||||
Using the OCRmyPDF web service wrapper
|
||||
--------------------------------------
|
||||
======================================
|
||||
|
||||
The OCRmyPDF Docker image includes an example, barebones HTTP web service. The webservice may be launched as follows:
|
||||
The OCRmyPDF Docker image includes an example, barebones HTTP web
|
||||
service. The webservice may be launched as follows:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker run --entrypoint python3 -p 5000:5000 jbarlow83/ocrmypdf-alpine webservice.py
|
||||
docker run --entrypoint python3 -p 5000:5000 jbarlow83/ocrmypdf webservice.py
|
||||
|
||||
Unlike command line usage this program will open a socket and wait for connections.
|
||||
This will configure the machine to listen on port 5000. On Linux machines
|
||||
this is port 5000 of localhost. On macOS or Windows machines running
|
||||
Docker, this is port 5000 of the virtual machine that runs your Docker
|
||||
images. You can find its IP address using the command ``docker-machine ip``.
|
||||
|
||||
Unlike command line usage this program will open a socket and wait for
|
||||
connections.
|
||||
|
||||
.. warning::
|
||||
|
||||
The OCRmyPDF web service wrapper is intended for demonstration or development. It provides no security, no authentication, no protection against denial of service attacks, and no load balancing. The default Flask WSGI server is used, which is intended for development only. The server is single-threaded and so can respond to only one client at a time. It cannot respond to clients while busy with OCR.
|
||||
The OCRmyPDF web service wrapper is intended for demonstration or
|
||||
development. It provides no security, no authentication, no
|
||||
protection against denial of service attacks, and no load balancing.
|
||||
The default Flask WSGI server is used, which is intended for
|
||||
development only. The server is single-threaded and so can respond to
|
||||
only one client at a time. While running OCR, it cannot respond to
|
||||
any other clients.
|
||||
|
||||
Clients must keep their open connection while waiting for OCR to complete. This may entail setting a long timeout; this interface is more useful for internal HTTP API calls.
|
||||
Clients must keep their connection open while waiting for OCR to
|
||||
complete. This may entail setting a long timeout; this interface is more
|
||||
useful for internal HTTP API calls.
|
||||
|
||||
Unlike the rest of OCRmyPDF, this web service is licensed under the Affero GPLv3 (AGPLv3) since Ghostscript, a dependency of OCRmyPDF, is also licensed in this way.
|
||||
Unlike the rest of OCRmyPDF, this web service is licensed under the
|
||||
Affero GPLv3 (AGPLv3) since Ghostscript is also licensed in this way.
|
||||
|
||||
In addition to the above, please read our :ref:`general remarks on using OCRmyPDF as a service <ocr-service>`.
|
||||
|
||||
Legacy Ubuntu Docker images
|
||||
---------------------------
|
||||
|
||||
Previously OCRmyPDF was delivered in several Docker images for different purposes, based on Ubuntu.
|
||||
|
||||
The Ubuntu-based images will be maintained for some time but should not be used for new deployments. They are as follows:
|
||||
|
||||
.. list-table::
|
||||
:widths: auto
|
||||
:header-rows: 1
|
||||
|
||||
* - Image name
|
||||
- Download command
|
||||
- Notes
|
||||
* - ocrmypdf
|
||||
- ``docker pull jbarlow83/ocrmypdf``
|
||||
- Latest ocrmypdf with Tesseract 4.0.0-beta1 on Ubuntu 18.04. Includes English, French, German, Spanish, Portugeuse and Simplified Chinese.
|
||||
* - ocrmypdf-polyglot
|
||||
- ``docker pull jbarlow83/ocrmypdf-polyglot``
|
||||
- As above, with all available language packs.
|
||||
* - ocrmypdf-webservice
|
||||
- ``docker pull jbarlow83/ocrmypdf-webservice``
|
||||
- All language packs, and a simple HTTP wrapper allowing OCRmyPDF to be used as a web service. Note that this component is licensed under AGPLv3.
|
||||
|
||||
To execute the Ubuntu-based OCRmyPDF on a local file, you must `provide a writable volume to the Docker image <https://docs.docker.com/userguide/dockervolumes/>`_, and both the input and output file must be inside the writable volume. This limitation applies only to the legacy images.
|
||||
|
||||
This example command uses the current working directory as the writable volume:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker run --rm -v "$(pwd):/home/docker" <other docker arguments> ocrmypdf <your arguments to ocrmypdf>
|
||||
|
||||
In this worked example, the current working directory contains an input file called ``test.pdf`` and the output will go to ``output.pdf``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker run --rm -v "$(pwd):/home/docker" ocrmypdf --skip-text test.pdf output.pdf
|
||||
|
||||
.. note:: The working directory should be a writable local volume or Docker may not have permission to access it.
|
||||
|
||||
Note that ``ocrmypdf`` has its own separate ``-v VERBOSITYLEVEL`` argument to control debug verbosity. All Docker arguments should before the ``ocrmypdf`` image name and all arguments to ``ocrmypdf`` should be listed after.
|
||||
|
||||
In some environments the permissions associated with Docker can be complex to configure. The process that executes Docker may end up not having the permissions to write the specified file system. In that case one can stream the file into and out of the Docker process and avoid all permission hassles, using ``-`` as the input and output filename:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
docker run --rm -i ocrmypdf <other arguments to ocrmypdf> - - <input.pdf >output.pdf
|
||||
In addition to the above, please read our
|
||||
:ref:`general remarks on using OCRmyPDF as a service <ocr-service>`.
|
||||
|
||||
@@ -1,33 +1,53 @@
|
||||
=====================
|
||||
Common error messages
|
||||
=====================
|
||||
|
||||
Page already has text
|
||||
---------------------
|
||||
=====================
|
||||
|
||||
.. code::
|
||||
.. code-block::
|
||||
|
||||
ERROR - 1: page already has text! – aborting (use --force-ocr to force OCR)
|
||||
ERROR - 1: page already has text! – aborting (use --force-ocr to force OCR)
|
||||
|
||||
You ran ocrmypdf on a file that already contains printable text or a hidden OCR text layer (it can't quite tell the difference). You probably don't want to do this, because the file is already searchable.
|
||||
You ran ocrmypdf on a file that already contains printable text or a
|
||||
hidden OCR text layer (it can't quite tell the difference). You probably
|
||||
don't want to do this, because the file is already searchable.
|
||||
|
||||
As the error message suggests, your options are:
|
||||
|
||||
- ``ocrmypdf --force-ocr`` to :ref:`rasterize <raster-vector>` all vector content and run OCR on the images. This is useful if a previous OCR program failed, or if the document contains a text watermark.
|
||||
|
||||
- ``ocrmypdf --skip-text`` to skip OCR and other processing on any pages that contain text. Text pages will be copied into the output PDF without modification.
|
||||
- ``ocrmypdf --force-ocr`` to :ref:`rasterize <raster-vector>` all
|
||||
vector content and run OCR on the images. This is useful if a
|
||||
previous OCR program failed, or if the document contains a text
|
||||
watermark.
|
||||
- ``ocrmypdf --skip-text`` to skip OCR and other processing on any
|
||||
pages that contain text. Text pages will be copied into the output
|
||||
PDF without modification.
|
||||
- ``ocrmypdf --redo-ocr`` to scan the file for any existing OCR
|
||||
(non-printing text), remove it, and do OCR again. This is one way
|
||||
to take advantage of improvements in OCR accuracy. Printable vector
|
||||
text is excluded from OCR, so this can be used on files that contain
|
||||
a mix of digital and scanned files.
|
||||
|
||||
|
||||
Input file 'filename' is not a valid PDF
|
||||
----------------------------------------
|
||||
========================================
|
||||
|
||||
OCRmyPDF passes files through qpdf, a program that fixes errors in PDFs, before it tries to work on them. In most cases this happens because the PDF is corrupt and
|
||||
truncated (incomplete file copying) and not much can be done.
|
||||
OCRmyPDF checks files with pikepdf, a library that in turn uses libqpdf to fix
|
||||
errors in PDFs, before it tries to work on them. In most cases this happens
|
||||
because the PDF is corrupt and truncated (incomplete file copying) and not much
|
||||
can be done.
|
||||
|
||||
You can try rewriting the file with Ghostscript or pdftk:
|
||||
You can try rewriting the file with Ghostscript:
|
||||
|
||||
- ``gs -o output.pdf -dSAFER -sDEVICE=pdfwrite input.pdf``
|
||||
.. code-block:: bash
|
||||
|
||||
- ``pdftk input.pdf cat output output.pdf``
|
||||
gs -o output.pdf -dSAFER -sDEVICE=pdfwrite input.pdf
|
||||
|
||||
Sometimes Acrobat can repair PDFs with its `Preflight tool <https://helpx.adobe.com/acrobat/using/correcting-problem-areas-preflight-tool.html>`_.
|
||||
``pdftk`` can also rewrite PDFs:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pdftk input.pdf cat output output.pdf
|
||||
|
||||
Sometimes Acrobat can repair PDFs with its `Preflight
|
||||
tool <https://helpx.adobe.com/acrobat/using/correcting-problem-areas-preflight-tool.html>`__.
|
||||
|
||||
BIN
docs/images/logo-social.png
Normal file
BIN
docs/images/logo-social.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 31 KiB |
75
docs/images/logo.svg
Normal file
75
docs/images/logo.svg
Normal file
@@ -0,0 +1,75 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg width="100%" height="100%" viewBox="0 0 503 227" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" xmlns:serif="http://www.serif.com/" style="fill-rule:evenodd;clip-rule:evenodd;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:1.5;">
|
||||
<g id="svg" transform="matrix(0.965977,0,0,0.807602,0,0)">
|
||||
<rect x="0" y="0" width="520" height="280" style="fill:white;"/>
|
||||
<g transform="matrix(1.03522,0,0,1.23823,-69.7528,-83.422)">
|
||||
<g transform="matrix(1,0,0,1,243.977,20.0703)">
|
||||
<g id="Page">
|
||||
<g transform="matrix(0.961773,0,0,1.05962,6.19811,-3.01071)">
|
||||
<path d="M328.5,97.682C328.5,96.465 327.983,95.296 327.056,94.418C320.026,87.758 289.442,58.78 282.228,51.944C281.251,51.019 279.901,50.496 278.49,50.496C264.493,50.496 188.083,50.496 167.339,50.496C164.468,50.496 162.141,52.609 162.141,55.214C162.141,83.051 162.141,225.565 162.141,253.4C162.141,256.005 164.468,258.117 167.338,258.117C192.242,258.117 299.159,258.117 323.538,258.117C326.278,258.117 328.5,256.101 328.5,253.613C328.5,229.268 328.5,113.896 328.5,97.682Z" style="fill:rgb(253,253,253);stroke:rgb(51,51,51);stroke-width:3.95px;"/>
|
||||
</g>
|
||||
<g id="Dog-ear" serif:id="Dog ear" transform="matrix(1,0,0,1,-4,2)">
|
||||
<path d="M277.072,48.496L277.072,93.848C277.072,95.172 277.598,96.441 278.534,97.377C279.47,98.313 280.739,98.839 282.063,98.839C294.548,98.839 326.141,98.839 326.141,98.839" style="fill:rgb(245,245,245);stroke:rgb(51,51,51);stroke-width:4px;"/>
|
||||
</g>
|
||||
</g>
|
||||
<g transform="matrix(1,0,0,1,-29.6816,-0.395178)">
|
||||
<g transform="matrix(1.00243,0,0,1.11818,-144.72,-8.80181)">
|
||||
<path d="M465.73,119.654C465.73,117.605 463.874,115.941 461.588,115.941L310.259,115.941C307.973,115.941 306.117,117.605 306.117,119.654L306.117,183.108C306.117,185.157 307.973,186.821 310.259,186.821L461.588,186.821C463.874,186.821 465.73,185.157 465.73,183.108L465.73,119.654Z" style="fill:rgb(248,0,0);stroke:white;stroke-width:3.77px;"/>
|
||||
</g>
|
||||
<g transform="matrix(1.24571,0,0,1.35864,116.812,84.3924)">
|
||||
<g transform="matrix(64,0,0,64,42.1437,77.6203)">
|
||||
<path d="M0.084,0L0.084,-0.68L0.297,-0.68C0.371,-0.68 0.434,-0.663 0.487,-0.63C0.54,-0.596 0.566,-0.54 0.566,-0.462C0.566,-0.385 0.538,-0.328 0.481,-0.292C0.424,-0.255 0.36,-0.237 0.288,-0.237L0.213,-0.237L0.213,0L0.084,0ZM0.293,-0.572L0.213,-0.572L0.213,-0.344L0.295,-0.344C0.334,-0.344 0.365,-0.353 0.389,-0.371C0.413,-0.388 0.426,-0.416 0.429,-0.454C0.429,-0.498 0.417,-0.529 0.393,-0.546C0.369,-0.563 0.336,-0.572 0.293,-0.572Z" style="fill:white;fill-rule:nonzero;"/>
|
||||
</g>
|
||||
<g transform="matrix(64,0,0,64,79.7117,77.6203)">
|
||||
<path d="M0.332,0L0.084,0L0.084,-0.68L0.336,-0.68C0.441,-0.68 0.518,-0.648 0.569,-0.585C0.62,-0.522 0.645,-0.441 0.645,-0.344C0.645,-0.239 0.618,-0.155 0.563,-0.093C0.508,-0.031 0.431,0 0.332,0ZM0.337,-0.57L0.213,-0.57L0.213,-0.109L0.33,-0.109C0.385,-0.109 0.429,-0.127 0.462,-0.163C0.495,-0.199 0.511,-0.259 0.511,-0.344C0.511,-0.415 0.497,-0.47 0.469,-0.51C0.441,-0.55 0.397,-0.57 0.337,-0.57Z" style="fill:white;fill-rule:nonzero;"/>
|
||||
</g>
|
||||
<g transform="matrix(64,0,0,64,123.424,77.6203)">
|
||||
<path d="M0.405,-0.288L0.213,-0.288L0.213,0L0.084,0L0.084,-0.68L0.469,-0.68L0.489,-0.578L0.213,-0.578L0.213,-0.389L0.386,-0.389L0.405,-0.288Z" style="fill:white;fill-rule:nonzero;"/>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
<g transform="matrix(1,0,0,1.52217,67.3796,10.7507)">
|
||||
<rect x="23.501" y="81.3" width="162.305" height="61.77" style="fill:rgb(180,213,255);"/>
|
||||
</g>
|
||||
<g transform="matrix(0.967536,0,0,0.961535,5.90498,47.9703)">
|
||||
<g transform="matrix(90.4804,0,0,90.4804,82.6698,167.705)">
|
||||
<path d="M0.057,-0.337C0.057,-0.442 0.084,-0.527 0.139,-0.594C0.194,-0.66 0.271,-0.694 0.37,-0.696C0.477,-0.696 0.556,-0.662 0.607,-0.593C0.658,-0.524 0.684,-0.441 0.684,-0.344C0.684,-0.239 0.657,-0.153 0.602,-0.086C0.547,-0.019 0.469,0.014 0.37,0.014C0.264,0.014 0.185,-0.02 0.134,-0.089C0.083,-0.157 0.057,-0.24 0.057,-0.337ZM0.192,-0.338C0.192,-0.267 0.206,-0.208 0.235,-0.163C0.264,-0.118 0.308,-0.095 0.369,-0.095C0.424,-0.095 0.467,-0.115 0.5,-0.156C0.533,-0.197 0.549,-0.259 0.549,-0.344C0.549,-0.415 0.535,-0.473 0.506,-0.518C0.477,-0.563 0.433,-0.586 0.372,-0.586C0.319,-0.586 0.275,-0.564 0.242,-0.519C0.209,-0.474 0.192,-0.414 0.192,-0.338Z" style="fill:rgb(51,51,51);fill-rule:nonzero;"/>
|
||||
</g>
|
||||
<g transform="matrix(90.4804,0,0,90.4804,147.906,167.705)">
|
||||
<path d="M0.505,-0.557C0.473,-0.567 0.448,-0.574 0.429,-0.579C0.41,-0.583 0.388,-0.585 0.361,-0.585C0.307,-0.585 0.265,-0.563 0.236,-0.519C0.207,-0.475 0.192,-0.415 0.192,-0.338C0.192,-0.272 0.204,-0.215 0.229,-0.167C0.254,-0.119 0.295,-0.095 0.353,-0.095C0.382,-0.095 0.409,-0.098 0.434,-0.104C0.459,-0.11 0.481,-0.117 0.502,-0.126L0.551,-0.03C0.525,-0.017 0.494,-0.006 0.457,0.002C0.42,0.01 0.388,0.014 0.36,0.014C0.254,0.014 0.177,-0.02 0.129,-0.088C0.081,-0.156 0.057,-0.239 0.057,-0.337C0.057,-0.442 0.084,-0.527 0.137,-0.594C0.19,-0.661 0.266,-0.694 0.365,-0.694C0.385,-0.694 0.413,-0.691 0.448,-0.684C0.483,-0.677 0.516,-0.666 0.545,-0.65L0.505,-0.557Z" style="fill:rgb(51,51,51);fill-rule:nonzero;"/>
|
||||
</g>
|
||||
<g transform="matrix(90.4804,0,0,90.4804,199.751,167.705)">
|
||||
<path d="M0.293,-0.572L0.213,-0.572L0.213,-0.364L0.295,-0.364C0.334,-0.364 0.366,-0.372 0.391,-0.388C0.416,-0.403 0.429,-0.429 0.429,-0.465C0.429,-0.503 0.417,-0.53 0.393,-0.547C0.369,-0.564 0.336,-0.572 0.293,-0.572ZM0.479,0L0.335,-0.26C0.328,-0.259 0.32,-0.259 0.312,-0.259C0.304,-0.258 0.296,-0.258 0.288,-0.258L0.213,-0.258L0.213,0L0.084,0L0.084,-0.68L0.297,-0.68C0.371,-0.68 0.434,-0.663 0.487,-0.629C0.54,-0.595 0.566,-0.542 0.566,-0.471C0.566,-0.429 0.555,-0.393 0.534,-0.363C0.512,-0.332 0.484,-0.309 0.45,-0.292L0.617,0L0.479,0Z" style="fill:rgb(51,51,51);fill-rule:nonzero;"/>
|
||||
</g>
|
||||
</g>
|
||||
<g transform="matrix(0.916882,0,0,1,121.475,-32.6535)">
|
||||
<g transform="matrix(86.953,0,0,86.953,152.996,241.878)">
|
||||
<path d="M0.479,-0.428C0.5,-0.451 0.527,-0.47 0.562,-0.484C0.596,-0.497 0.627,-0.504 0.654,-0.504C0.72,-0.504 0.767,-0.485 0.795,-0.446C0.822,-0.407 0.836,-0.36 0.836,-0.304L0.836,0L0.705,0L0.705,-0.298C0.705,-0.329 0.698,-0.352 0.683,-0.369C0.668,-0.385 0.647,-0.393 0.619,-0.393C0.6,-0.393 0.581,-0.388 0.56,-0.378C0.539,-0.368 0.521,-0.357 0.504,-0.344C0.505,-0.337 0.505,-0.331 0.506,-0.324C0.507,-0.317 0.507,-0.311 0.507,-0.304L0.507,0L0.376,0L0.376,-0.298C0.376,-0.329 0.369,-0.352 0.354,-0.369C0.339,-0.385 0.318,-0.393 0.291,-0.393C0.274,-0.393 0.258,-0.39 0.241,-0.383C0.224,-0.376 0.207,-0.367 0.192,-0.356L0.192,0L0.062,0L0.062,-0.485L0.13,-0.485L0.162,-0.441C0.184,-0.461 0.211,-0.476 0.242,-0.488C0.273,-0.499 0.3,-0.504 0.325,-0.504C0.363,-0.504 0.395,-0.497 0.42,-0.484C0.445,-0.47 0.465,-0.451 0.479,-0.428Z" style="fill:rgb(51,51,51);fill-rule:nonzero;"/>
|
||||
</g>
|
||||
<g transform="matrix(86.953,0,0,86.953,228.906,241.878)">
|
||||
<path d="M0.156,0.023L0.179,-0.034L0.006,-0.467L0.14,-0.485L0.252,-0.191L0.358,-0.485L0.495,-0.485L0.278,0.064C0.263,0.103 0.236,0.137 0.197,0.165C0.158,0.193 0.118,0.212 0.075,0.222L0.029,0.115C0.052,0.105 0.077,0.093 0.104,0.079C0.13,0.064 0.147,0.046 0.156,0.023Z" style="fill:rgb(51,51,51);fill-rule:nonzero;"/>
|
||||
</g>
|
||||
</g>
|
||||
<g id="Selectors" transform="matrix(0.965977,0,0,0.807602,67.3796,67.3718)">
|
||||
<g id="Right-selector" serif:id="Right selector">
|
||||
<g transform="matrix(1.03522,0,0,1.23823,2.07044,0)">
|
||||
<path d="M185.806,161.156L185.806,67.132" style="fill:none;stroke:rgb(76,159,255);stroke-width:4px;stroke-linecap:butt;"/>
|
||||
</g>
|
||||
<g transform="matrix(1.03522,0,0,1.23823,161.788,169.469)">
|
||||
<circle cx="31.523" cy="34.314" r="10.021" style="fill:rgb(76,159,255);stroke:rgb(76,159,255);stroke-width:4px;stroke-linecap:butt;"/>
|
||||
</g>
|
||||
</g>
|
||||
<g id="Left-selector" serif:id="Left selector">
|
||||
<g transform="matrix(1.03522,0,0,1.23823,-170.092,0)">
|
||||
<path d="M185.806,161.156L185.806,67.132" style="fill:none;stroke:rgb(76,159,255);stroke-width:4px;stroke-linecap:butt;"/>
|
||||
</g>
|
||||
<g transform="matrix(1.03522,0,0,1.23823,-10.3742,28.2274)">
|
||||
<circle cx="31.523" cy="34.314" r="10.021" style="fill:rgb(76,159,255);stroke:rgb(76,159,255);stroke-width:4px;stroke-linecap:butt;"/>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</g>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 9.3 KiB |
@@ -1,15 +1,12 @@
|
||||
.. ocrmypdf documentation master file, created by
|
||||
sphinx-quickstart on Sun Sep 4 14:29:43 2016.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
OCRmyPDF documentation
|
||||
======================
|
||||
|
||||
OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to
|
||||
be searched.
|
||||
OCRmyPDF adds an optical character recognition (OCR) text layer to scanned PDF
|
||||
files, allowing them to be searched.
|
||||
|
||||
PDF is the best format for storing and exchanging scanned documents. Unfortunately, PDFs can be difficult to modify. OCRmyPDF makes it easy to apply image processing and OCR to existing PDFs.
|
||||
PDF is the best format for storing and exchanging scanned documents.
|
||||
Unfortunately, PDFs can be difficult to modify. OCRmyPDF makes it easy to apply
|
||||
image processing and OCR to existing PDFs.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
@@ -17,6 +14,7 @@ PDF is the best format for storing and exchanging scanned documents. Unfortunat
|
||||
introduction
|
||||
release_notes
|
||||
installation
|
||||
optimizer
|
||||
languages
|
||||
jbig2
|
||||
|
||||
@@ -28,9 +26,18 @@ PDF is the best format for storing and exchanging scanned documents. Unfortunat
|
||||
docker
|
||||
advanced
|
||||
batch
|
||||
security
|
||||
performance
|
||||
pdfsecurity
|
||||
errors
|
||||
|
||||
.. toctree::
|
||||
:caption: Developers
|
||||
:maxdepth: 2
|
||||
|
||||
api
|
||||
plugins
|
||||
apiref
|
||||
contributing
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
===================
|
||||
Installing OCRmyPDF
|
||||
===================
|
||||
|
||||
@@ -7,21 +8,38 @@ Installing OCRmyPDF
|
||||
|latest|
|
||||
|
||||
The easiest way to install OCRmyPDF is to follow the steps for your operating
|
||||
system/platform, although sometimes this version may be out of date.
|
||||
system/platform. This version may be out of date, however.
|
||||
|
||||
If you want to use the latest version of OCRmyPDF, your best bet is to install
|
||||
the most recent version your platform provides, and then upgrade that version by
|
||||
installing the Python binary wheels.
|
||||
These platforms have one-liner installs:
|
||||
|
||||
+-------------------------------+-------------------------------+
|
||||
| Debian, Ubuntu | ``apt install ocrmypdf`` |
|
||||
+-------------------------------+-------------------------------+
|
||||
| Windows Subsystem for Linux | ``apt install ocrmypdf`` |
|
||||
+-------------------------------+-------------------------------+
|
||||
| Fedora | ``dnf install ocrmypdf`` |
|
||||
+-------------------------------+-------------------------------+
|
||||
| macOS | ``brew install ocrmypdf`` |
|
||||
+-------------------------------+-------------------------------+
|
||||
| LinuxBrew | ``brew install ocrmypdf`` |
|
||||
+-------------------------------+-------------------------------+
|
||||
| FreeBSD | ``pkg install py37-ocrmypdf`` |
|
||||
+-------------------------------+-------------------------------+
|
||||
| Conda (WSL, macOS, Linux) | ``conda install ocrmypdf`` |
|
||||
+-------------------------------+-------------------------------+
|
||||
|
||||
More detailed procedures are outlined below. If you want to do a manual
|
||||
install, or install a more recent version than your platform provides, read on.
|
||||
|
||||
.. contents:: Platform-specific steps
|
||||
:depth: 2
|
||||
:local:
|
||||
|
||||
Installing on Linux
|
||||
-------------------
|
||||
===================
|
||||
|
||||
Debian and Ubuntu 16.10 or newer
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
Debian and Ubuntu 18.04 or newer
|
||||
--------------------------------
|
||||
|
||||
.. |deb-stable| image:: https://repology.org/badge/version-for-repo/debian_stable/ocrmypdf.svg
|
||||
:alt: Debian 9 stable ("stretch")
|
||||
@@ -32,118 +50,178 @@ Debian and Ubuntu 16.10 or newer
|
||||
.. |deb-unstable| image:: https://repology.org/badge/version-for-repo/debian_unstable/ocrmypdf.svg
|
||||
:alt: Debian unstable
|
||||
|
||||
.. |ubu-1710| image:: https://repology.org/badge/version-for-repo/ubuntu_17_10/ocrmypdf.svg
|
||||
:alt: Ubuntu 17.10
|
||||
|
||||
.. |ubu-1804| image:: https://repology.org/badge/version-for-repo/ubuntu_18_04/ocrmypdf.svg
|
||||
:alt: Ubuntu 18.04 LTS
|
||||
|
||||
.. |ubu-1810| image:: https://repology.org/badge/version-for-repo/ubuntu_18_10/ocrmypdf.svg
|
||||
:alt: Ubuntu 18.10
|
||||
.. |ubu-2004| image:: https://repology.org/badge/version-for-repo/ubuntu_20_04/ocrmypdf.svg
|
||||
:alt: Ubuntu 20.04 LTS
|
||||
|
||||
.. |ubu-2010| image:: https://repology.org/badge/version-for-repo/ubuntu_20_10/ocrmypdf.svg
|
||||
:alt: Ubuntu 20.10
|
||||
|
||||
+-------------------------------------------+
|
||||
| **OCRmyPDF versions in Debian & Ubuntu** |
|
||||
+-------------------------------------------+
|
||||
| |latest| |
|
||||
+-------------------------------------------+
|
||||
| |deb-stable| |deb-testing| |deb-unstable| |
|
||||
+-------------------------------------------+
|
||||
| |ubu-1710| |ubu-1804| |ubu-1810| |
|
||||
+-------------------------------------------+
|
||||
+-----------------------------------------------+
|
||||
| **OCRmyPDF versions in Debian & Ubuntu** |
|
||||
+-----------------------------------------------+
|
||||
| |latest| |
|
||||
+-----------------------------------------------+
|
||||
| |deb-stable| |deb-testing| |deb-unstable| |
|
||||
+-----------------------------------------------+
|
||||
| |ubu-1804| |ubu-2004| |ubu-2010| |
|
||||
+-----------------------------------------------+
|
||||
|
||||
Users of Debian 9 ("stretch") or later or Ubuntu 16.10 or later may simply
|
||||
Users of Debian 9 ("stretch") or later, or Ubuntu 18.04 or later, including users
|
||||
of Windows Subsystem for Linux, may simply
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
apt-get install ocrmypdf
|
||||
|
||||
As indicated in the table above, Debian and Ubuntu releases may lag behind the latest version. If the version available for your platform is out of date, you could opt to install the latest version from source. See `Installing HEAD revision from sources`_.
|
||||
As indicated in the table above, Debian and Ubuntu releases may lag
|
||||
behind the latest version. If the version available for your platform is
|
||||
out of date, you could opt to install the latest version from source.
|
||||
See `Installing HEAD revision from
|
||||
sources <#installing-head-revision-from-sources>`__. Ubuntu 16.10 to 17.10
|
||||
inclusive also had ocrmypdf, but these versions are end of life.
|
||||
|
||||
For full details on version availability for your platform, check the `Debian Package Tracker <https://tracker.debian.org/pkg/ocrmypdf>`_ or `Ubuntu launchpad.net <https://launchpad.net/ocrmypdf>`_.
|
||||
For full details on version availability for your platform, check the
|
||||
`Debian Package Tracker <https://tracker.debian.org/pkg/ocrmypdf>`__ or
|
||||
`Ubuntu launchpad.net <https://launchpad.net/ocrmypdf>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
OCRmyPDF for Debian and Ubuntu currently omit the JBIG2 encoder. OCRmyPDF works fine without it but will produce larger output files. If you build jbig2enc from source, ocrmypdf 7.0.0 and later will automatically detect it (specifically the ``jbig2`` binary) on the ``PATH``. To add JBIG2 encoding, see :ref:`jbig2`.
|
||||
OCRmyPDF for Debian and Ubuntu currently omits the JBIG2 encoder.
|
||||
OCRmyPDF works fine without it but will produce larger output files.
|
||||
If you build jbig2enc from source, ocrmypdf 7.0.0 and later will
|
||||
automatically detect it (specifically the ``jbig2`` binary) on the
|
||||
``PATH``. To add JBIG2 encoding, see :ref:`jbig2`.
|
||||
|
||||
Fedora 29 or newer
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
Fedora
|
||||
------
|
||||
|
||||
.. |fedora-29| image:: https://repology.org/badge/version-for-repo/fedora29/ocrmypdf.svg
|
||||
:alt: Fedora 29
|
||||
.. |fedora-32| image:: https://repology.org/badge/version-for-repo/fedora_32/ocrmypdf.svg
|
||||
:alt: Fedora 32
|
||||
|
||||
.. |fedora-33| image:: https://repology.org/badge/version-for-repo/fedora_33/ocrmypdf.svg
|
||||
:alt: Fedora 33
|
||||
|
||||
.. |fedora-rawhide| image:: https://repology.org/badge/version-for-repo/fedora_rawhide/ocrmypdf.svg
|
||||
:alt: Fedora Rawhide
|
||||
|
||||
+-----------------------------------------------+
|
||||
| **OCRmyPDF version** |
|
||||
+-----------------------------------------------+
|
||||
| |latest| |
|
||||
+-----------------------------------------------+
|
||||
| |fedora-32| |fedora-33| |fedora-rawhide| |
|
||||
+-----------------------------------------------+
|
||||
|
||||
+------------------------------+
|
||||
| **OCRmyPDF version** |
|
||||
+------------------------------+
|
||||
| |latest| |
|
||||
+------------------------------+
|
||||
| |fedora-29| |fedora-rawhide| |
|
||||
+------------------------------+
|
||||
|
||||
Users of Fedora 29 later may simply
|
||||
Users of Fedora 29 or later may simply
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
dnf install ocrmypdf
|
||||
|
||||
For full details on version availability, check the `Fedora Package Tracker
|
||||
<https://apps.fedoraproject.org/packages/ocrmypdf>`_.
|
||||
For full details on version availability, check the `Fedora Package
|
||||
Tracker <https://apps.fedoraproject.org/packages/ocrmypdf>`__.
|
||||
|
||||
If the version available for your platform is out of date, you could opt to
|
||||
install the latest version from source. See `Installing HEAD revision from
|
||||
sources`_.
|
||||
If the version available for your platform is out of date, you could opt
|
||||
to install the latest version from source. See `Installing HEAD revision
|
||||
from sources <#installing-head-revision-from-sources>`__.
|
||||
|
||||
.. note::
|
||||
|
||||
OCRmyPDF for Fedora currently omits the JBIG2 encoder due to patent issues.
|
||||
OCRmyPDF works fine without it but will produce larger output files. If you
|
||||
build jbig2enc from source, ocrmypdf 7.0.0 and later will automatically
|
||||
detect it on the ``PATH``. To add JBIG2 encoding, see `Installing the JBIG2
|
||||
encoder <jbig2>`_.
|
||||
OCRmyPDF for Fedora currently omits the JBIG2 encoder due to patent
|
||||
issues. OCRmyPDF works fine without it but will produce larger output
|
||||
files. If you build jbig2enc from source, ocrmypdf 7.0.0 and later
|
||||
will automatically detect it on the ``PATH``. To add JBIG2 encoding,
|
||||
see :ref:`Installing the JBIG2 encoder <jbig2>`.
|
||||
|
||||
Installing the latest version on Ubuntu 18.04 LTS
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
.. _ubuntu-lts-latest:
|
||||
|
||||
Ubuntu 18.04 includes ocrmypdf 6.1.2. To install a more recent version, first
|
||||
install the system version to get most of the dependencies:
|
||||
Installing the latest version on Ubuntu 20.04 LTS
|
||||
-------------------------------------------------
|
||||
|
||||
Ubuntu 20.04 includes ocrmypdf 9.6.0 - you can install that with ``apt``. To
|
||||
install a more recent version, uninstall the system-provided version of
|
||||
ocrmypdf, and install the following dependencies:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo apt-get update
|
||||
sudo apt-get install \
|
||||
ocrmypdf \
|
||||
python3-pip
|
||||
sudo apt-get -y remove ocrmypdf # remove system ocrmypdf, if installed
|
||||
sudo apt-get -y update
|
||||
sudo apt-get -y install \
|
||||
ghostscript \
|
||||
icc-profiles-free \
|
||||
liblept5 \
|
||||
libxml2 \
|
||||
pngquant \
|
||||
python3-pip \
|
||||
tesseract-ocr \
|
||||
zlib1g
|
||||
|
||||
There are a few dependency changes between ocrmypdf 6.1.2 and 7.x. Let's get
|
||||
these, too.
|
||||
To install ocrmypdf for the system:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo apt-get install \
|
||||
libexempi3 \
|
||||
pngquant
|
||||
pip3 install ocrmypdf
|
||||
|
||||
Then install the most recent ocrmypdf for the local user and set the user's ``PATH`` to check for the user's Python packages.
|
||||
To install for the current user only:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
export PATH=$HOME/.local/bin:$PATH
|
||||
pip3 install --user ocrmypdf
|
||||
|
||||
Ubuntu 18.04 LTS
|
||||
----------------
|
||||
|
||||
Ubuntu 18.04 includes ocrmypdf 6.1.2 - you can install that with ``apt``, but
|
||||
it is quite old now. To install a more recent version, uninstall the old version
|
||||
of ocrmypdf, and install the following dependencies:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo apt-get -y remove ocrmypdf
|
||||
sudo apt-get -y update
|
||||
sudo apt-get -y install \
|
||||
ghostscript \
|
||||
icc-profiles-free \
|
||||
liblept5 \
|
||||
libxml2 \
|
||||
pngquant \
|
||||
python3-cffi \
|
||||
python3-distutils \
|
||||
python3-pkg-resources \
|
||||
python3-reportlab \
|
||||
qpdf \
|
||||
tesseract-ocr \
|
||||
zlib1g \
|
||||
unpaper
|
||||
|
||||
We will need a newer version of ``pip`` than was available for Ubuntu 18.04:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
wget https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py
|
||||
|
||||
Then install the most recent ocrmypdf for the local user and set the
|
||||
user's ``PATH`` to check for the user's Python packages.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
export PATH=$HOME/.local/bin:$PATH
|
||||
python3 -m pip install --user ocrmypdf
|
||||
|
||||
To add JBIG2 encoding, see :ref:`jbig2`.
|
||||
|
||||
Ubuntu 16.04 LTS
|
||||
^^^^^^^^^^^^^^^^
|
||||
----------------
|
||||
|
||||
No package is available for Ubuntu 16.04. OCRmyPDF 8.0 and newer require Python
|
||||
3.6. Ubuntu 16.04 ships Python 3.5, but you can install Python 3.6 on it. Or,
|
||||
you can skip Python 3.6 and install OCRmyPDF 7.x or older - for that procedure,
|
||||
please see the installation documentation for the version of OCRmyPDF you plan
|
||||
to use.
|
||||
No package is available for Ubuntu 16.04. OCRmyPDF 8.0 and newer require
|
||||
Python 3.6. Ubuntu 16.04 ships Python 3.5, but you can install Python
|
||||
3.6 on it. Or, you can skip Python 3.6 and install OCRmyPDF 7.x or older
|
||||
- for that procedure, please see the installation documentation for the
|
||||
version of OCRmyPDF you plan to use.
|
||||
|
||||
**Install system packages for OCRmyPDF**
|
||||
|
||||
@@ -165,13 +243,13 @@ to use.
|
||||
tesseract-ocr \
|
||||
unpaper
|
||||
|
||||
This will install a Python 3.6 binary at ``/usr/bin/python3.6`` alongside the
|
||||
system's Python 3.5. Do not remove the system Python. This will also install
|
||||
Tesseract 4.0 from a PPA, since the version available in Ubuntu 16.04 is too old
|
||||
for OCRmyPDF.
|
||||
This will install a Python 3.6 binary at ``/usr/bin/python3.6``
|
||||
alongside the system's Python 3.5. Do not remove the system Python. This
|
||||
will also install Tesseract 4.0 from a PPA, since the version available
|
||||
in Ubuntu 16.04 is too old for OCRmyPDF.
|
||||
|
||||
Now install pip for Python 3.6. This will install the Python 3.6 version of
|
||||
``pip`` at ``/usr/local/bin/pip``.
|
||||
Now install pip for Python 3.6. This will install the Python 3.6 version
|
||||
of ``pip`` at ``/usr/local/bin/pip``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -179,8 +257,9 @@ Now install pip for Python 3.6. This will install the Python 3.6 version of
|
||||
|
||||
**Install OCRmyPDF**
|
||||
|
||||
OCRmyPDF requires the locale to be set for UTF-8. **On some minimal Ubuntu
|
||||
installations systems**, it may be necessary to set the locale.
|
||||
OCRmyPDF requires the locale to be set for UTF-8. **On some minimal
|
||||
Ubuntu installations**, such as the Ubuntu 16.04 Docker images, it may be
|
||||
necessary to set the locale.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -194,111 +273,161 @@ environment variable contains ``$HOME/.local/bin``.
|
||||
.. code-block:: bash
|
||||
|
||||
export PATH=$HOME/.local/bin:$PATH
|
||||
pip3 install --user ocrmypdf
|
||||
pip3.6 install --user ocrmypdf
|
||||
|
||||
To add JBIG2 encoding, see :ref:`jbig2`.
|
||||
|
||||
Ubuntu 14.04 LTS
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
Installing on Ubuntu 14.04 LTS (trusty) is more difficult than some other
|
||||
options, because of its age. Several backports are required. For explanations of
|
||||
some steps of this procedure, see the similar steps for Ubuntu 16.04.
|
||||
|
||||
Install system dependencies:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo apt-get update
|
||||
sudo apt-get install \
|
||||
software-properties-common python-software-properties \
|
||||
zlib1g-dev \
|
||||
libexempi3 \
|
||||
libjpeg-dev \
|
||||
libffi-dev \
|
||||
pngquant \
|
||||
qpdf
|
||||
|
||||
We will need backports of Ghostscript 9.16, libav-11 (for unpaper 6.1),
|
||||
Tesseract 4.00 (alpha), and Python 3.6. This will replace Ghostscript and
|
||||
Tesseract 3.x on your system. Python 3.6 will be installed alongside the system
|
||||
Python 3.4.
|
||||
|
||||
If you prefer to not modify your system in this matter, consider using a Docker
|
||||
container.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo add-apt-repository ppa:vshn/ghostscript -y
|
||||
sudo add-apt-repository ppa:heyarje/libav-11 -y
|
||||
sudo add-apt-repository ppa:alex-p/tesseract-ocr -y
|
||||
sudo add-apt-repository ppa:jonathonf/python-3.6 -y
|
||||
|
||||
sudo apt-get update
|
||||
|
||||
sudo apt-get install \
|
||||
python3.6-dev \
|
||||
ghostscript \
|
||||
tesseract-ocr \
|
||||
tesseract-ocr-eng \
|
||||
libavformat56 libavcodec56 libavutil54 \
|
||||
wget
|
||||
|
||||
Now we need to install ``pip`` and let it install ocrmypdf:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.6 && python3.6 -m easy_install pip
|
||||
pip3.6 install ocrmypdf
|
||||
|
||||
These installation instructions omit the optional dependency ``unpaper``, which is only available at version 0.4.2 in Ubuntu 14.04. The author could not find a backport of ``unpaper``, and created a .deb package to do the job of installing unpaper 6.1 (for x86 64-bit only):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
wget -q 'https://www.dropbox.com/s/vaq0kbwi6e6au80/unpaper_6.1-1.deb?raw=1' -O unpaper_6.1-1.deb
|
||||
sudo dpkg -i unpaper_6.1-1.deb
|
||||
|
||||
To add JBIG2 encoding, see :ref:`jbig2`.
|
||||
|
||||
ArchLinux (AUR)
|
||||
^^^^^^^^^^^^^^^
|
||||
Arch Linux (AUR)
|
||||
----------------
|
||||
|
||||
.. image:: https://repology.org/badge/version-for-repo/aur/ocrmypdf.svg
|
||||
:alt: ArchLinux
|
||||
:target: https://repology.org/metapackage/ocrmypdf
|
||||
|
||||
There is an `ArchLinux User Repository package for ocrmypdf <https://aur.archlinux.org/packages/ocrmypdf/>`_. You can use the following command.
|
||||
There is an `Arch User Repository (AUR) package for OCRmyPDF
|
||||
<https://aur.archlinux.org/packages/ocrmypdf/>`__.
|
||||
|
||||
Installing AUR packages as root is not allowed, so you must first `set up a
|
||||
non-root user
|
||||
<https://wiki.archlinux.org/index.php/Users_and_groups#User_management>`__ and
|
||||
`configure sudo <https://wiki.archlinux.org/index.php/Sudo#Configuration>`__.
|
||||
The standard Docker image, ``archlinux/base:latest``, does **not** have a
|
||||
non-root user configured, so users of that image must follow these guides. If
|
||||
you are using a VM image, such as `the official Vagrant image
|
||||
<https://app.vagrantup.com/archlinux/boxes/archlinux>`__, this work may already
|
||||
be completed for you.
|
||||
|
||||
Next you should install the `base-devel package group
|
||||
<https://www.archlinux.org/groups/x86_64/base-devel/>`__. This includes the
|
||||
standard tooling needed to build packages, such as a compiler and binary tools.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
yaourt -S ocrmypdf
|
||||
sudo pacman -S base-devel
|
||||
|
||||
If you have any difficulties with installation, check the repository package page.
|
||||
Now you are ready to install the OCRmyPDF package.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
curl -O https://aur.archlinux.org/cgit/aur.git/snapshot/ocrmypdf.tar.gz
|
||||
tar xvzf ocrmypdf.tar.gz
|
||||
cd ocrmypdf
|
||||
makepkg -sri
|
||||
|
||||
At this point you will have a working install of OCRmyPDF, but the Tesseract
|
||||
install won’t include any OCR language data. You can install `the
|
||||
tesseract-data package group
|
||||
<https://www.archlinux.org/groups/any/tesseract-data/>`__ to add all supported
|
||||
languages, or use that package listing to identify the appropriate package for
|
||||
your desired language.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo pacman -S tesseract-data-eng
|
||||
|
||||
As an alternative to this manual procedure, consider using an `AUR helper
|
||||
<https://wiki.archlinux.org/index.php/AUR_helpers>`__. Such a tool will
|
||||
automatically fetch, build and install the AUR package, resolve dependencies
|
||||
(including dependencies on AUR packages), and ease the upgrade procedure.
|
||||
|
||||
If you have any difficulties with installation, check the repository package
|
||||
page.
|
||||
|
||||
.. note::
|
||||
|
||||
The OCRmyPDF AUR package currently omits the JBIG2 encoder. OCRmyPDF works
|
||||
fine without it but will produce larger output files. The encoder is
|
||||
available from `the jbig2enc-git AUR package
|
||||
<https://aur.archlinux.org/packages/jbig2enc-git/>`__ and may be installed
|
||||
using the same series of steps as for the installation of the OCRmyPDF AUR
|
||||
package. Alternatively, it may be built manually from source following the
|
||||
instructions in :ref:`Installing the JBIG2 encoder <jbig2>`. If JBIG2 is
|
||||
installed, OCRmyPDF 7.0.0 and later will automatically detect it.
|
||||
|
||||
Alpine Linux
|
||||
------------
|
||||
|
||||
.. image:: https://repology.org/badge/version-for-repo/alpine_edge/ocrmypdf.svg
|
||||
:alt: Alpine Linux
|
||||
:target: https://repology.org/metapackage/ocrmypdf
|
||||
|
||||
To install OCRmyPDF for Alpine Linux:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
apk add ocrmypdf
|
||||
|
||||
Mageia 7
|
||||
--------
|
||||
|
||||
There is no OS-level packaging available for Mageia, so you must install the
|
||||
dependencies:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# As root user
|
||||
urpmi.update -a
|
||||
urpmi \
|
||||
ghostscript \
|
||||
icc-profiles-openicc \
|
||||
jbig2dec \
|
||||
lib64leptonica5 \
|
||||
pngquant \
|
||||
python3-pip \
|
||||
python3-cffi \
|
||||
python3-distutils-extra \
|
||||
python3-pkg-resources \
|
||||
python3-reportlab \
|
||||
qpdf \
|
||||
tesseract \
|
||||
tesseract-osd \
|
||||
tesseract-eng \
|
||||
tesseract-fra
|
||||
|
||||
To install ocrmypdf for the system:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# As root user
|
||||
pip3 install ocrmypdf
|
||||
ldconfig
|
||||
|
||||
Or, to install for the current user only:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
export PATH=$HOME/.local/bin:$PATH
|
||||
pip3 install --user ocrmypdf
|
||||
|
||||
Other Linux packages
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
--------------------
|
||||
|
||||
See the `Repology <https://repology.org/metapackage/ocrmypdf/versions>`_ page.
|
||||
See the
|
||||
`Repology <https://repology.org/metapackage/ocrmypdf/versions>`__ page.
|
||||
|
||||
In general, first install the OCRmyPDF package for your system, then optionally use the procedure `Installing with Python pip`_ to install a more recent version.
|
||||
In general, first install the OCRmyPDF package for your system, then
|
||||
optionally use the procedure `Installing with Python
|
||||
pip <#installing-with-python-pip>`__ to install a more recent version.
|
||||
|
||||
Installing on macOS
|
||||
-------------------
|
||||
===================
|
||||
|
||||
Homebrew
|
||||
^^^^^^^^
|
||||
--------
|
||||
|
||||
.. image:: https://img.shields.io/homebrew/v/ocrmypdf.svg
|
||||
:alt: homebrew
|
||||
:target: http://brewformulas.org/Ocrmypdf
|
||||
|
||||
OCRmyPDF is now a standard `Homebrew <https://brew.sh>`_ formula. To install on macOS:
|
||||
OCRmyPDF is now a standard `Homebrew <https://brew.sh>`__ formula. To
|
||||
install on macOS:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
brew install ocrmypdf
|
||||
|
||||
This will include only the English language pack. If you need other languages you can optionally install them all:
|
||||
This will include only the English language pack. If you need other
|
||||
languages you can optionally install them all:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -306,18 +435,26 @@ This will include only the English language pack. If you need other languages yo
|
||||
|
||||
.. note::
|
||||
|
||||
Users who previously installed OCRmyPDF on macOS using ``pip install ocrmypdf`` should remove the pip version (``pip3 uninstall ocrmypdf``) before switching to the Homebrew version.
|
||||
Users who previously installed OCRmyPDF on macOS using
|
||||
``pip install ocrmypdf`` should remove the pip version
|
||||
(``pip3 uninstall ocrmypdf``) before switching to the Homebrew
|
||||
version.
|
||||
|
||||
.. note::
|
||||
|
||||
Users who previously installed OCRmyPDF from the private tap should switch to the mainline version (``brew untap jbarlow83/ocrmypdf``) and install from there.
|
||||
Users who previously installed OCRmyPDF from the private tap should
|
||||
switch to the mainline version (``brew untap jbarlow83/ocrmypdf``)
|
||||
and install from there.
|
||||
|
||||
Manual installation on macOS
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
----------------------------
|
||||
|
||||
These instructions probably work on all macOS supported by Homebrew.
|
||||
These instructions probably work on all macOS supported by Homebrew, and are
|
||||
for installing a more current version of OCRmyPDF than is available from
|
||||
Homebrew. Note that the Homebrew versions usually track the release versions
|
||||
fairly closely.
|
||||
|
||||
If it's not already present, `install Homebrew <http://brew.sh/>`_.
|
||||
If it's not already present, `install Homebrew <http://brew.sh/>`__.
|
||||
|
||||
Update Homebrew:
|
||||
|
||||
@@ -325,20 +462,18 @@ Update Homebrew:
|
||||
|
||||
brew update
|
||||
|
||||
Install or upgrade the required Homebrew packages, if any are missing. To do this, download the ``Brewfile`` that lists all of the dependencies to the current directory, and run ``brew bundle`` to process them (installing or upgrading as needed). ``Brewfile`` is a plain text file.
|
||||
Install or upgrade the required Homebrew packages, if any are missing.
|
||||
To do this, use ``brew edit ocrmypdf`` to obtain a recent list of Homebrew
|
||||
dependencies. You could also check the ``.github/workflows/build.yml``.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
wget https://github.com/jbarlow83/OCRmyPDF/raw/master/.travis/Brewfile
|
||||
brew bundle
|
||||
|
||||
This will include the English, French, German and Spanish language packs. If you need other languages you can optionally install them all:
|
||||
This will include the English, French, German and Spanish language
|
||||
packs. If you need other languages you can optionally install them all:
|
||||
|
||||
.. _macos-all-languages:
|
||||
|
||||
.. code-block:: bash
|
||||
.. code-block:: bash
|
||||
|
||||
brew install tesseract --with-all-languages # Option 2: for all language packs
|
||||
brew install tesseract-lang # Option 2: for all language packs
|
||||
|
||||
Update the homebrew pip:
|
||||
|
||||
@@ -364,94 +499,285 @@ The command line program should now be available:
|
||||
|
||||
ocrmypdf --help
|
||||
|
||||
Installing the Docker image
|
||||
Installing on Windows
|
||||
=====================
|
||||
|
||||
Native Windows
|
||||
--------------
|
||||
|
||||
.. note::
|
||||
|
||||
Administrator privileges will be required for some of these steps.
|
||||
|
||||
You must install the following for Windows:
|
||||
|
||||
* Python 3.7 (64-bit) or later
|
||||
* Tesseract 4.0 or later
|
||||
* Ghostscript 9.50 or later
|
||||
|
||||
Using the `Chocolatey <https://chocolatey.org/>`_ package manager, install the
|
||||
following when running in an Administrator command prompt:
|
||||
|
||||
* ``choco install python3``
|
||||
* ``choco install --pre tesseract``
|
||||
* ``choco install ghostscript``
|
||||
* ``choco install pngquant`` (optional)
|
||||
|
||||
The commands above will install Python 3.x (latest version), Tesseract, Ghostscript
|
||||
and pngquant. Chocolatey may also need to install the Windows Visual C++ Runtime
|
||||
DLLs or other Windows patches, and may require a reboot.
|
||||
|
||||
You may then use ``pip`` to install ocrmypdf. (This can be performed by a user or
|
||||
Administrator.):
|
||||
|
||||
* ``pip install ocrmypdf``
|
||||
|
||||
Chocolatey automatically selects appropriate versions of these applications. If you
|
||||
are installing them manually, please install 64-bit versions of all applications for
|
||||
64-bit Windows, or 32-bit versions of all applications for 32-bit Windows. Mixing
|
||||
the "bitness" of these programs will lead to errors.
|
||||
|
||||
OCRmyPDF will check the Windows Registry and standard locations in your Program Files
|
||||
for third party software it needs (specifically, Tesseract and Ghostscript). To
|
||||
override the versions OCRmyPDF selects, you can modify the ``PATH`` environment
|
||||
variable. `Follow these directions <https://www.computerhope.com/issues/ch000549.htm#dospath>`_
|
||||
to change the PATH.
|
||||
|
||||
.. warning::
|
||||
|
||||
As of early 2021, users have reported problems with the Microsoft Store version of
|
||||
Python and OCRmyPDF. These issues affect many other third party Python packages.
|
||||
Please download Python from Python.org or Chocolatey instead, and do not use the
|
||||
Microsoft Store version.
|
||||
|
||||
Windows Subsystem for Linux
|
||||
---------------------------
|
||||
|
||||
For some users, installing the Docker image will be easier than installing all of OCRmyPDF's dependencies. For Windows, it is the only option.
|
||||
#. Install Ubuntu 18.04 for Windows Subsystem for Linux, if not already installed.
|
||||
#. Follow the procedure to install :ref:`OCRmyPDF on Ubuntu 18.04 <ubuntu-lts-latest>`.
|
||||
#. Open the Windows command prompt and create a symlink:
|
||||
|
||||
See `OCRmyPDF Docker Image <docker>`_ for more information.
|
||||
.. code-block:: powershell
|
||||
|
||||
Installing on Windows
|
||||
---------------------
|
||||
wsl sudo ln -s /home/$USER/.local/bin/ocrmypdf /usr/local/bin/ocrmypdf
|
||||
|
||||
Direct installation on Windows is not possible. `Install the Docker <docker-install>`_ container as described above. Ensure that your command prompt can run the docker "hello world" container.
|
||||
Then confirm that the expected version from PyPI (|latest|) is installed:
|
||||
|
||||
It would probably not be too difficult to port on Windows. The main reason this has been avoided is the difficulty of packaging and installing the various non-Python dependencies: Tesseract, QPDF, Ghostscript, Leptonica. Pull requests to add or improve Windows support would be quite welcome.
|
||||
.. code-block:: powershell
|
||||
|
||||
wsl ocrmypdf --version
|
||||
|
||||
You can then run OCRmyPDF in the Windows command prompt or PowerShell, prefixing
|
||||
``wsl``, and call it from Windows programs or batch files.
|
||||
|
||||
Cygwin64
|
||||
--------
|
||||
|
||||
First install the following prerequisite Cygwin packages using ``setup-x86_64.exe``::
|
||||
|
||||
python36 (or later)
|
||||
python3?-devel
|
||||
python3?-pip
|
||||
python3?-lxml
|
||||
python3?-imaging
|
||||
|
||||
(where 3? means match the version of python3 you installed)
|
||||
|
||||
gcc-g++
|
||||
ghostscript (<=9.50 or >=9.52-2 see note below)
|
||||
libexempi3
|
||||
libexempi-devel
|
||||
libffi6
|
||||
libffi-devel
|
||||
pngquant
|
||||
qpdf
|
||||
libqpdf-devel
|
||||
tesseract-ocr
|
||||
tesseract-ocr-devel
|
||||
|
||||
.. note::
|
||||
|
||||
The Cygwin package for Ghostscript in versions 9.52 and
|
||||
9.52-1 contained a bug that caused an exception to occur when
|
||||
ocrmypdf invoked gs. Make sure you have either 9.50 (or earlier)
|
||||
or 9.52-2 (or later).
|
||||
|
||||
Then open a Cygwin terminal (i.e. ``mintty``), run the following commands. Note
|
||||
that if you are using the version of ``pip`` that was installed with the Cygwin
|
||||
Python package, the command name will be ``pip3``. If you have since updated
|
||||
``pip`` (with, for instance, ``pip3 install --upgrade pip``) the command is
|
||||
likely just ``pip`` instead of ``pip3``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip3 install wheel
|
||||
pip3 install ocrmypdf
|
||||
|
||||
The optional dependency "unpaper" is currently not available under Cygwin.
|
||||
Without it, certain options such as ``--clean`` will produce an error message.
|
||||
However, the OCR-to-text-layer functionality is available.
|
||||
|
||||
Docker
|
||||
------
|
||||
|
||||
You can also :ref:`Install the Docker <docker>` container on Windows. Ensure that
|
||||
your command prompt can run the docker "hello world" container.
|
||||
|
||||
Installing on FreeBSD
|
||||
=====================
|
||||
|
||||
.. image:: https://repology.org/badge/version-for-repo/freebsd/python:ocrmypdf.svg
|
||||
:alt: FreeBSD
|
||||
:target: https://repology.org/project/python:ocrmypdf/versions
|
||||
|
||||
FreeBSD 11.3, 12.0, 12.1-RELEASE and 13.0-CURRENT are supported. Other
|
||||
versions likely work but have not been tested.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pkg install py37-ocrmypdf
|
||||
|
||||
To install a more recent version, you could attempt to first install the system
|
||||
version with ``pkg``, then use ``pip install --user ocrmypdf``.
|
||||
|
||||
Installing the Docker image
|
||||
===========================
|
||||
|
||||
For some users, installing the Docker image will be easier than
|
||||
installing all of OCRmyPDF's dependencies.
|
||||
|
||||
See :ref:`docker` for more information.
|
||||
|
||||
Installing with Python pip
|
||||
--------------------------
|
||||
==========================
|
||||
|
||||
OCRmyPDF is delivered by PyPI because it is a convenient way to install the latest version. However, PyPI and ``pip`` cannot address the fact that ``ocrmypdf`` depends on certain non-Python system libraries and programs being instsalled.
|
||||
OCRmyPDF is delivered by PyPI because it is a convenient way to install
|
||||
the latest version. However, PyPI and ``pip`` cannot address the fact
|
||||
that ``ocrmypdf`` depends on certain non-Python system libraries and
|
||||
programs being installed.
|
||||
|
||||
For best results, first install `your platform's version <https://repology.org/metapackage/ocrmypdf/versions>`_ of ``ocrmypdf``, using the instructions elsewhere in this document. Then you can use ``pip`` to get the latest version if your platform version is out of date. Chances are that this will satisfy most dependencies.
|
||||
.. warning::
|
||||
|
||||
Debian and Ubuntu users: unfortunately, Debian and Ubuntu customize
|
||||
Python in non-standard ways, and the nature of these customizations
|
||||
varies from release to release. This can make for a frustrating
|
||||
user experience. The instructions below work on almost all platforms that
|
||||
have Python installed, except for Debian and Ubuntu, where you may need
|
||||
to take additional steps. For best results on Debian and Ubuntu, use the
|
||||
``apt`` packages; or if these are too old, run
|
||||
``apt install python3-pip python3-venv``, create a virtual environment,
|
||||
and install OCRmyPDF in that environment.
|
||||
|
||||
`See here for more information on Debian-Python issues
|
||||
<https://gist.github.com/tiran/2dec9e03c6f901814f6d1e8dad09528e>`__.
|
||||
|
||||
For best results, first install `your platform's
|
||||
version <https://repology.org/metapackage/ocrmypdf/versions>`__ of
|
||||
``ocrmypdf``, using the instructions elsewhere in this document. Then
|
||||
you can use ``pip`` to get the latest version if your platform version
|
||||
is out of date. Chances are that this will satisfy most dependencies.
|
||||
|
||||
Use ``ocrmypdf --version`` to confirm what version was installed.
|
||||
|
||||
Then you can install the latest OCRmyPDF from the Python wheels. First try:
|
||||
Then you can install the latest OCRmyPDF from the Python wheels. First
|
||||
try:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip3 install --user ocrmypdf
|
||||
|
||||
You should then be able to run ``ocrmypdf --version`` and see that the latest version was located.
|
||||
You should then be able to run ``ocrmypdf --version`` and see that the
|
||||
latest version was located.
|
||||
|
||||
Since ``pip3 install --user`` does not work correctly on some platforms, notably Ubuntu 16.04 and older, and the Homebrew version of Python, instead use this for a system wide installation:
|
||||
Since ``pip3 install --user`` does not work correctly on some platforms,
|
||||
notably Ubuntu 16.04 and older, and the Homebrew version of Python,
|
||||
instead use this for a system wide installation:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip3 install ocrmypdf
|
||||
|
||||
.. note::
|
||||
|
||||
AArch64 (ARM64) users: this process will be difficult because most
|
||||
Python packages are not available as binary wheels for your platform.
|
||||
You're probably better off using a platform install on Debian, Ubuntu,
|
||||
or Fedora.
|
||||
|
||||
Requirements for pip and HEAD install
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
-------------------------------------
|
||||
|
||||
OCRmyPDF currently requires these external programs and libraries to be installed, and must be satisfied using the operating system package manager. ``pip`` cannot provide them.
|
||||
OCRmyPDF currently requires these external programs and libraries to be
|
||||
installed, and must be satisfied using the operating system package
|
||||
manager. ``pip`` cannot provide them.
|
||||
|
||||
- Python 3.6 or newer
|
||||
- Ghostscript 9.15 or newer
|
||||
- qpdf 8.1.0 or newer
|
||||
- Tesseract 4.0.0-alpha or newer
|
||||
- Python 3.6 or newer
|
||||
- Ghostscript 9.15 or newer
|
||||
- qpdf 8.1.0 or newer
|
||||
- Tesseract 4.0.0-beta or newer
|
||||
|
||||
As of ocrmypdf 7.2.1, the following versions are recommended:
|
||||
|
||||
- Python 3.7
|
||||
- Ghostscript 9.23 or newer
|
||||
- qpdf 8.2.1
|
||||
- Tesseract 4.0.0 or newer
|
||||
- jbig2enc 0.29 or newer
|
||||
- pngquant 2.5 or newer
|
||||
- unpaper 6.1
|
||||
- Python 3.7 or 3.8
|
||||
- Ghostscript 9.23 or newer
|
||||
- qpdf 8.2.1
|
||||
- Tesseract 4.0.0 or newer
|
||||
- jbig2enc 0.29 or newer
|
||||
- pngquant 2.5 or newer
|
||||
- unpaper 6.1
|
||||
|
||||
jbig2enc, pngquant, and unpaper are optional. If missing certain features are disabled. OCRmyPDF will discover them as soon as they are available.
|
||||
jbig2enc, pngquant, and unpaper are optional. If missing, certain
|
||||
features are disabled. OCRmyPDF will discover them as soon as they are
|
||||
available.
|
||||
|
||||
**jbig2enc**, if present, will be used to optimize the encoding of monochrome images. This can significantly reduce the file size of the output file. It is not required. `jbig2enc <https://github.com/agl/jbig2enc>`_ is not generally available for Ubuntu or Debian due to lingering concerns about patent issues, but can easily be built from source. To add JBIG2 encoding, see :ref:`jbig2`.
|
||||
**jbig2enc**, if present, will be used to optimize the encoding of
|
||||
monochrome images. This can significantly reduce the file size of the
|
||||
output file. It is not required.
|
||||
`jbig2enc <https://github.com/agl/jbig2enc>`__ is not generally
|
||||
available for Ubuntu or Debian due to lingering concerns about patent
|
||||
issues, but can easily be built from source. To add JBIG2 encoding, see
|
||||
:ref:`jbig2`.
|
||||
|
||||
**pngquant**, if present, is optionally used to optimize the encoding of PNG-style images in PDFs (actually, any that are that losslessly encoded) by lossily quantizing to a smaller color palette. It is only activated then the ``--optimize`` argument is ``2`` or ``3``.
|
||||
**pngquant**, if present, is optionally used to optimize the encoding of
|
||||
PNG-style images in PDFs (actually, any that are losslessly
|
||||
encoded) by lossily quantizing to a smaller color palette. It is only
|
||||
activated when the ``--optimize`` argument is ``2`` or ``3``.
|
||||
|
||||
**unpaper**, if present, enables the ``--clean`` and ``--clean-final`` command line options.
|
||||
|
||||
These are in addition to the Python packaging dependencies, meaning that unfortunately, the ``pip install`` command cannot satisfy all of them.
|
||||
**unpaper**, if present, enables the ``--clean`` and ``--clean-final``
|
||||
command line options.
|
||||
|
||||
These are in addition to the Python packaging dependencies, meaning that
|
||||
unfortunately, the ``pip install`` command cannot satisfy all of them.
|
||||
|
||||
Installing HEAD revision from sources
|
||||
-------------------------------------
|
||||
=====================================
|
||||
|
||||
If you have ``git`` and Python 3.6 or newer installed, you can install from source. When the ``pip`` installer runs, it will alert you if dependencies are missing.
|
||||
If you have ``git`` and Python 3.6 or newer installed, you can install
|
||||
from source. When the ``pip`` installer runs, it will alert you if
|
||||
dependencies are missing.
|
||||
|
||||
If you prefer to build every from source, you will need to `build pikepdf from source <https://pikepdf.readthedocs.io/en/latest/installation.html#building-from-source>`_. First ensure you can build and install pikepdf.
|
||||
If you prefer to build everything from source, you will need to `build
|
||||
pikepdf from
|
||||
source <https://pikepdf.readthedocs.io/en/latest/installation.html#building-from-source>`__.
|
||||
First ensure you can build and install pikepdf.
|
||||
|
||||
To install the HEAD revision from sources in the current Python 3 environment:
|
||||
To install the HEAD revision from sources in the current Python 3
|
||||
environment:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip3 install git+https://github.com/jbarlow83/OCRmyPDF.git
|
||||
|
||||
Or, to install in `development mode <https://pythonhosted.org/setuptools/setuptools.html#development-mode>`_, allowing customization of OCRmyPDF, use the ``-e`` flag:
|
||||
Or, to install in `development
|
||||
mode <https://pythonhosted.org/setuptools/setuptools.html#development-mode>`__,
|
||||
allowing customization of OCRmyPDF, use the ``-e`` flag:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
pip3 install -e git+https://github.com/jbarlow83/OCRmyPDF.git
|
||||
|
||||
You may find it easiest to install in a virtual environment, rather than system-wide:
|
||||
You may find it easiest to install in a virtual environment, rather than
|
||||
system-wide:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -461,8 +787,8 @@ You may find it easiest to install in a virtual environment, rather than system-
|
||||
cd OCRmyPDF
|
||||
pip3 install .
|
||||
|
||||
However, ``ocrmypdf`` will only be accessible on the system PATH
|
||||
when you activate the virtual environment.
|
||||
However, ``ocrmypdf`` will only be accessible on the system PATH when
|
||||
you activate the virtual environment.
|
||||
|
||||
To run the program:
|
||||
|
||||
@@ -476,7 +802,7 @@ dependencies. Older version than the ones mentioned in the release notes
|
||||
are likely not to be compatible with OCRmyPDF.
|
||||
|
||||
For development
|
||||
^^^^^^^^^^^^^^^
|
||||
---------------
|
||||
|
||||
To install all of the development and test requirements:
|
||||
|
||||
@@ -492,15 +818,17 @@ To install all of the development and test requirements:
|
||||
To add JBIG2 encoding, see :ref:`jbig2`.
|
||||
|
||||
Shell completions
|
||||
-----------------
|
||||
=================
|
||||
|
||||
Completions for ``bash`` and ``fish`` are available in the project's
|
||||
``misc/completion`` folder. The ``bash`` completions are likely ``zsh``
|
||||
compatible but this has not been confirmed. Package maintainers, please install
|
||||
these at the appropriate locations for your system.
|
||||
compatible but this has not been confirmed. Package maintainers, please
|
||||
install these at the appropriate locations for your system.
|
||||
|
||||
To manually install the ``bash`` completion, copy ``misc/completion/ocrmypdf.bash`` to
|
||||
``/etc/bash_completion.d/ocrmypdf`` (rename the file).
|
||||
To manually install the ``bash`` completion, copy
|
||||
``misc/completion/ocrmypdf.bash`` to ``/etc/bash_completion.d/ocrmypdf``
|
||||
(rename the file).
|
||||
|
||||
To manually install the ``fish`` completion, copy ``misc/completion/ocrmypdf.fish`` to
|
||||
To manually install the ``fish`` completion, copy
|
||||
``misc/completion/ocrmypdf.fish`` to
|
||||
``~/.config/fish/completions/ocrmypdf.fish``.
|
||||
|
||||
@@ -1,119 +1,233 @@
|
||||
============
|
||||
Introduction
|
||||
============
|
||||
OCRmyPDF is a Python 3 package that adds OCR layers to PDFs.
|
||||
|
||||
OCRmyPDF is a Python 3 application and library that adds OCR layers to PDFs.
|
||||
|
||||
About OCR
|
||||
---------
|
||||
=========
|
||||
|
||||
`Optical character recognition <https://en.wikipedia.org/wiki/Optical_character_recognition>`_ is technology that converts images of typed or handwritten text, such as in a scanned document, to computer text that can be searched and copied.
|
||||
`Optical character
|
||||
recognition <https://en.wikipedia.org/wiki/Optical_character_recognition>`__
|
||||
is technology that converts images of typed or handwritten text, such as
|
||||
in a scanned document, to computer text that can be selected, searched and copied.
|
||||
|
||||
OCRmyPDF uses `Tesseract <https://github.com/tesseract-ocr/tesseract>`_, the best available open source OCR engine, to perform OCR.
|
||||
OCRmyPDF uses
|
||||
`Tesseract <https://github.com/tesseract-ocr/tesseract>`__, the best
|
||||
available open source OCR engine, to perform OCR.
|
||||
|
||||
.. _raster-vector:
|
||||
|
||||
About PDFs
|
||||
----------
|
||||
==========
|
||||
|
||||
PDFs are page description files that attempts to preserve a layout exactly. They contain `vector graphics <http://vector-conversions.com/vectorizing/raster_vs_vector.html>`_ that can contain raster objects such as scanned images. Because PDFs can contain multiple pages (unlike many image formats) and can contain fonts and text, it is a good formats for exchanging scanned documents.
|
||||
PDFs are page description files that attempt to preserve a layout
|
||||
exactly. They contain `vector
|
||||
graphics <http://vector-conversions.com/vectorizing/raster_vs_vector.html>`__
|
||||
that can contain raster objects such as scanned images. Because PDFs can
|
||||
contain multiple pages (unlike many image formats) and can contain fonts
|
||||
and text, they are a good format for exchanging scanned documents.
|
||||
|
||||
.. image:: images/bitmap_vs_svg.svg
|
||||
|image|
|
||||
|
||||
A PDF page might contain multiple images, even if it only appears to have one image. Some scanners or scanning software will segment pages into monochromatic text and color regions for example, to improve the compression ratio and appearance of the page.
|
||||
|
||||
Rasterizing a PDF is the process of generating an image suitable for display or analyzing with an OCR engine. OCR engines like Tesseract work with images, not vector objects.
|
||||
A PDF page might contain multiple images, even if it only appears to
|
||||
have one image. Some scanners or scanning software will segment pages
|
||||
into monochromatic text and color regions for example, to improve the
|
||||
compression ratio and appearance of the page.
|
||||
|
||||
Rasterizing a PDF is the process of generating an image suitable for
|
||||
display or analyzing with an OCR engine. OCR engines like Tesseract work
|
||||
with images, not vector objects.
|
||||
|
||||
About PDF/A
|
||||
-----------
|
||||
===========
|
||||
|
||||
`PDF/A <https://en.wikipedia.org/wiki/PDF/A>`_ is an ISO-standardized subset of the full PDF specification that is designed for archiving (the 'A' stands for Archive). PDF/A differs from PDF primarily by omitting features that would make it difficult to read the file in the future, such as embedded Javascript, video, audio and references to external fonts. All fonts and resources needed to interpret the PDF must be contained within it. Because PDF/A disables Javascript and other types of embedded content, it is probably more secure.
|
||||
`PDF/A <https://en.wikipedia.org/wiki/PDF/A>`__ is an ISO-standardized
|
||||
subset of the full PDF specification that is designed for archiving (the
|
||||
'A' stands for Archive). PDF/A differs from PDF primarily by omitting
|
||||
features that would make it difficult to read the file in the future,
|
||||
such as embedded Javascript, video, audio and references to external
|
||||
fonts. All fonts and resources needed to interpret the PDF must be
|
||||
contained within it. Because PDF/A disables Javascript and other types
|
||||
of embedded content, it is probably more secure.
|
||||
|
||||
There are various conformance levels and versions, such as "PDF/A-2b".
|
||||
|
||||
Generally speaking, the best format for scanned documents is PDF/A. Some governments and jurisdictions, US Courts in particular, `mandate the use of PDF/A <https://pdfblog.com/2012/02/13/what-is-pdfa/>`_ for scanned documents.
|
||||
Generally speaking, the best format for scanned documents is PDF/A. Some
|
||||
governments and jurisdictions, US Courts in particular, `mandate the use
|
||||
of PDF/A <https://pdfblog.com/2012/02/13/what-is-pdfa/>`__ for scanned
|
||||
documents.
|
||||
|
||||
Since most people who scan documents are interested in reading them indefinitely into the future, OCRmyPDF generates PDF/A-2b by default.
|
||||
|
||||
PDF/A has a few drawbacks. Some PDF viewers include an alert that the file is a PDF/A, which may confuse some users. It also tends to produce larger files than PDF, because it embeds certain resources even if they are commonly available. PDF/A files can be digitally signed, but may not be encrypted, to ensure they can be read in the future. Fortunately, converting from PDF/A to a regular PDF is trivial, and any PDF viewer can view PDF/A.
|
||||
Since most people who scan documents are interested in reading them
|
||||
indefinitely into the future, OCRmyPDF generates PDF/A-2b by default.
|
||||
|
||||
PDF/A has a few drawbacks. Some PDF viewers include an alert that the
|
||||
file is a PDF/A, which may confuse some users. It also tends to produce
|
||||
larger files than PDF, because it embeds certain resources even if they
|
||||
are commonly available. PDF/A files can be digitally signed, but may not
|
||||
be encrypted, to ensure they can be read in the future. Fortunately,
|
||||
converting from PDF/A to a regular PDF is trivial, and any PDF viewer
|
||||
can view PDF/A.
|
||||
|
||||
What OCRmyPDF does
|
||||
------------------
|
||||
==================
|
||||
|
||||
OCRmyPDF analyzes each page of a PDF to determine the colorspace and resolution (DPI) needed to capture all of the information on that page without losing content. It uses `Ghostscript <http://ghostscript.com/>`_ to rasterize the page, and then performs on OCR on the rasterized image to create an OCR "layer". The layer is then grafted back onto the original PDF.
|
||||
OCRmyPDF analyzes each page of a PDF to determine the colorspace and
|
||||
resolution (DPI) needed to capture all of the information on that page
|
||||
without losing content. It uses
|
||||
`Ghostscript <http://ghostscript.com/>`__ to rasterize the page, and
|
||||
then performs OCR on the rasterized image to create an OCR "layer".
|
||||
The layer is then grafted back onto the original PDF.
|
||||
|
||||
While one can use a program like Ghostscript or ImageMagick to get an image and put the image through Tesseract, that actually creates a new PDF and many details may be lost. OCRmyPDF can produce a minimally changed PDF as output.
|
||||
While one can use a program like Ghostscript or ImageMagick to get an
|
||||
image and put the image through Tesseract, that actually creates a new
|
||||
PDF and many details may be lost. OCRmyPDF can produce a minimally
|
||||
changed PDF as output.
|
||||
|
||||
OCRmyPDF also some image processing options like deskew which improve the appearance of files and quality of OCR. When these are used, the OCR layer is grafted onto the processed image instead.
|
||||
|
||||
By default, OCRmyPDF produces archival PDFs – PDF/A, which are a stricter subset of PDF features designed for long term archives. If regular PDFs are desired, this can be disabled with ``--output-type pdf``.
|
||||
OCRmyPDF also has some image processing options like deskew which improve
|
||||
the appearance of files and quality of OCR. When these are used, the OCR
|
||||
layer is grafted onto the processed image instead.
|
||||
|
||||
By default, OCRmyPDF produces archival PDFs – PDF/A, which are a
|
||||
stricter subset of PDF features designed for long term archives. If
|
||||
regular PDFs are desired, this can be disabled with
|
||||
``--output-type pdf``.
|
||||
|
||||
Why you shouldn't do this manually
|
||||
----------------------------------
|
||||
==================================
|
||||
|
||||
A PDF is similar to an HTML file, in that it contains document structure along with images. Sometimes a PDF does nothing more than present a full page image, but often there is additional content that would be lost.
|
||||
A PDF is similar to an HTML file, in that it contains document structure
|
||||
along with images. Sometimes a PDF does nothing more than present a full
|
||||
page image, but often there is additional content that would be lost.
|
||||
|
||||
A manual process could work like either of these:
|
||||
|
||||
1. Rasterize each page as an image, OCR the images, and combine the output into a PDF. This preserves the layout of each page, but resamples all images (possibly losing quality, increasing file size, introducing compression artifacts, etc.).
|
||||
1. Rasterize each page as an image, OCR the images, and combine the
|
||||
output into a PDF. This preserves the layout of each page, but
|
||||
resamples all images (possibly losing quality, increasing file size,
|
||||
introducing compression artifacts, etc.).
|
||||
2. Extract each image, OCR, and combine the output into a PDF. This
|
||||
loses the context in which images are used in the PDF, meaning that
|
||||
cropping, rotation and scaling of pages may be lost. Some scanned
|
||||
PDFs use multiple images segmented into black and white, grayscale
|
||||
and color regions, with stencil masks to prevent overlap, as this can
|
||||
enhance the appearance of a file while reducing file size. Clearly,
|
||||
reassembling these images will be easy. This also loses any text or
|
||||
vector art on any pages in a PDF with both scanned and pure digital
|
||||
content.
|
||||
|
||||
2. Extract each image, OCR, and combine the output into a PDF. This loses the context in which images are used in the PDF, meaning that cropping, rotation and scaling of pages may be lost. Some scanned PDFs use multiple images segmented into black and white, grayscale and color regions, with stencil masks to prevent overlap, as this can enhance the appearance of a file while reducing file size. Clearly, reassembling these images will be easy. This also loses and text or vector art on any pages in a PDF with both scanned and pure digital content.
|
||||
In the case of a PDF that is nothing other than a container of images
|
||||
(no rotation, scaling, cropping, one image per page), the second
|
||||
approach can be lossless.
|
||||
|
||||
In the case of a PDF that is nothing other than a container of images (no rotation, scaling, cropping, one image per page), the second approach can be lossless.
|
||||
|
||||
OCRmyPDF uses several strategies depending on input options and the input PDF itself, but generally speaking it rasterizes a page for OCR and then grafts the OCR back onto the original. As such it can handle complex PDFs and still preserve their contents as much as possible.
|
||||
|
||||
OCRmyPDF also supports a many, many edge cases that have cropped over several years of development. We support PDF features like images inside of Form XObjects, and pages with UserUnit scaling. We support rare image formats like non-monochrome 1-bit images. We warn about files you may not to OCR. Thanks to pikepdf and QPDF, we auto-repair PDFs that are damaged. (Not that you need to know what any of these are! You should be able to throw any PDF at it.)
|
||||
OCRmyPDF uses several strategies depending on input options and the
|
||||
input PDF itself, but generally speaking it rasterizes a page for OCR
|
||||
and then grafts the OCR back onto the original. As such it can handle
|
||||
complex PDFs and still preserve their contents as much as possible.
|
||||
|
||||
OCRmyPDF also supports many, many edge cases that have cropped up over
|
||||
several years of development. We support PDF features like images inside
|
||||
of Form XObjects, and pages with UserUnit scaling. We support rare image
|
||||
formats like non-monochrome 1-bit images. We warn about files you may
|
||||
not want to OCR. Thanks to pikepdf and QPDF, we auto-repair PDFs that are
|
||||
damaged. (Not that you need to know what any of these are! You should be
|
||||
able to throw any PDF at it.)
|
||||
|
||||
Limitations
|
||||
-----------
|
||||
===========
|
||||
|
||||
OCRmyPDF is limited by the Tesseract OCR engine. As such it experiences these limitations, as do any other programs that rely on Tesseract:
|
||||
OCRmyPDF is limited by the Tesseract OCR engine. As such it experiences
|
||||
these limitations, as do any other programs that rely on Tesseract:
|
||||
|
||||
* The OCR is not as accurate as commercial solutions such as Abbyy.
|
||||
* It is not capable of recognizing handwriting.
|
||||
* It may find gibberish and report this as OCR output.
|
||||
* If a document contains languages outside of those given in the ``-l LANG`` arguments, results may be poor.
|
||||
* It is not always good at analyzing the natural reading order of documents. For example, it may fail to recognize that a document contains two columns, and may try to join text across columns.
|
||||
* Poor quality scans may produce poor quality OCR. Garbage in, garbage out.
|
||||
* It does not expose information about what font family text belongs to.
|
||||
- The OCR is not as accurate as commercial solutions such as Abbyy.
|
||||
- It is not capable of recognizing handwriting.
|
||||
- It may find gibberish and report this as OCR output.
|
||||
- If a document contains languages outside of those given in the
|
||||
``-l LANG`` arguments, results may be poor.
|
||||
- It is not always good at analyzing the natural reading order of
|
||||
documents. For example, it may fail to recognize that a document
|
||||
contains two columns, and may try to join text across columns.
|
||||
- Poor quality scans may produce poor quality OCR. Garbage in, garbage
|
||||
out.
|
||||
- It does not expose information about what font family text belongs
|
||||
to.
|
||||
|
||||
OCRmyPDF is also limited by the PDF specification:
|
||||
|
||||
* PDF encodes the position of text glyphs but does not encode document structure. There is no markup that divides a document in sections, paragraphs, sentences, or even words (since blank spaces are not represented). As such all elements of document structure including the spaces between words must be derived heuristically. Some PDF viewers do a better job of this than others.
|
||||
* Because some popular open source PDF viewers have a particularly hard time with spaces betweem words, OCRmyPDF appends a space to each text element as a workaround (when using ``--pdf-renderer hocr``). While this mixes document structure with graphical information that ideally should be left to the PDF viewer to interpret, it improves compatibility with some viewers and does not cause problems for better ones.
|
||||
- PDF encodes the position of text glyphs but does not encode document
|
||||
structure. There is no markup that divides a document in sections,
|
||||
paragraphs, sentences, or even words (since blank spaces are not
|
||||
represented). As such all elements of document structure including
|
||||
the spaces between words must be derived heuristically. Some PDF
|
||||
viewers do a better job of this than others.
|
||||
- Because some popular open source PDF viewers have a particularly hard
|
||||
time with spaces between words, OCRmyPDF appends a space to each text
|
||||
element as a workaround (when using ``--pdf-renderer hocr``). While
|
||||
this mixes document structure with graphical information that ideally
|
||||
should be left to the PDF viewer to interpret, it improves
|
||||
compatibility with some viewers and does not cause problems for
|
||||
better ones.
|
||||
|
||||
Ghostscript also imposes some limitations:
|
||||
|
||||
* PDFs containing JBIG2-encoded content will be converted to CCITT Group4 encoding, which has lower compression ratios, if Ghostscript PDF/A is enabled.
|
||||
* PDFs containing JPEG 2000-encoded content will be converted to JPEG encoding, which may introduce compression artifacts, if Ghostscript PDF/A is enabled.
|
||||
* Ghostscript may transcode grayscale and color images, either lossy to lossless or lossless to lossy, based on an internal algorithm. This behavior can be suppressed by setting ``--pdfa-image-compression`` to ``jpeg`` or ``lossless`` to set all images to one type or the other. Ghostscript has no option to maintain the input image's format. (Ghostscript 9.25+ can copy JPEG images without transcoding them; earlier versions will transcode.)
|
||||
* Ghostscript's PDF/A conversion removes any XMP metadata that is not one of the standard XMP metadata namespaces for PDFs. In particular, PRISM Metdata is removed.
|
||||
- PDFs containing JBIG2-encoded content will be converted to CCITT
|
||||
Group4 encoding, which has lower compression ratios, if Ghostscript
|
||||
PDF/A is enabled.
|
||||
- PDFs containing JPEG 2000-encoded content will be converted to JPEG
|
||||
encoding, which may introduce compression artifacts, if Ghostscript
|
||||
PDF/A is enabled.
|
||||
- Ghostscript may transcode grayscale and color images, either lossy to
|
||||
lossless or lossless to lossy, based on an internal algorithm. This
|
||||
behavior can be suppressed by setting ``--pdfa-image-compression`` to
|
||||
``jpeg`` or ``lossless`` to set all images to one type or the other.
|
||||
Ghostscript has no option to maintain the input image's format.
|
||||
(Ghostscript 9.25+ can copy JPEG images without transcoding them;
|
||||
earlier versions will transcode.)
|
||||
- Ghostscript's PDF/A conversion removes any XMP metadata that is not
|
||||
one of the standard XMP metadata namespaces for PDFs. In particular,
|
||||
PRISM Metadata is removed.
|
||||
- Ghostscript's PDF/A conversion seems to remove or deactivate
|
||||
hyperlinks and other active content.
|
||||
|
||||
You can use ``--output-type pdf`` to disable PDF/A conversion and produce
|
||||
a standard, non-archival PDF.
|
||||
|
||||
Regarding OCRmyPDF itself:
|
||||
|
||||
* PDFs that use transparency are not currently represented in the test suite
|
||||
* The Python API exported by ``import ocrmypdf`` is design to help scripts that use OCRmyPDF but is not currently capable of running OCRmyPDF jobs due to limitations in an underlying library.
|
||||
- PDFs that use transparency are not currently represented in the test
|
||||
suite
|
||||
|
||||
Similar programs
|
||||
----------------
|
||||
================
|
||||
|
||||
To the author's knowledge, OCRmyPDF is the most feature-rich and thoroughly tested command line OCR PDF conversion tool. If it does not meet your needs, contributions and suggestions are welcome. If not, consider one of these similar open source programs:
|
||||
To the author's knowledge, OCRmyPDF is the most feature-rich and
|
||||
thoroughly tested command line OCR PDF conversion tool. If it does not
|
||||
meet your needs, contributions and suggestions are welcome. If not,
|
||||
consider one of these similar open source programs:
|
||||
|
||||
* pdf2pdfocr
|
||||
* pdfsandwich
|
||||
* pypdfocr
|
||||
* pdfbeads
|
||||
- pdf2pdfocr
|
||||
- pdfsandwich
|
||||
- pypdfocr
|
||||
- pdfbeads
|
||||
|
||||
Web front-ends
|
||||
--------------
|
||||
==============
|
||||
|
||||
The Docker image ``ocrmypdf-alpine`` provides a web service front-end that allows files to submitted over HTTP and the results "downloaded". This is an HTTP server intended to simplify web services deployments; it is not intended to be deployed on the public internet and no real security measures to speak of.
|
||||
The Docker image ``ocrmypdf`` provides a web service front-end
|
||||
that allows files to be submitted over HTTP and the results "downloaded".
|
||||
This is an HTTP server intended to simplify web services deployments; it
|
||||
is not intended to be deployed on the public internet and has no real
|
||||
security measures to speak of.
|
||||
|
||||
In addition, the following third-party integrations are available:
|
||||
|
||||
* `Nextcloud OCR <https://github.com/janis91/ocr>`_ is a free software plugin for the Nextcloud private cloud software
|
||||
- `Nextcloud OCR <https://github.com/janis91/ocr>`__ is a free software
|
||||
plugin for the Nextcloud private cloud software
|
||||
|
||||
OCRmyPDF is not designed to be secure against malware-bearing PDFs (see `Using OCRmyPDF online <ocr-service>`_). Users should ensure they comply with OCRmyPDF's licenses and the licenses of all dependencies. In particular, OCRmyPDF requires Ghostscript, which is licensed under AGPLv3.
|
||||
OCRmyPDF is not designed to be secure against malware-bearing PDFs (see
|
||||
:ref:`Using OCRmyPDF online <ocr-service>`). Users should ensure they
|
||||
comply with OCRmyPDF's licenses and the licenses of all dependencies. In
|
||||
particular, OCRmyPDF requires Ghostscript, which is licensed under
|
||||
AGPLv3.
|
||||
|
||||
.. |image| image:: images/bitmap_vs_svg.svg
|
||||
|
||||
@@ -1,35 +1,55 @@
|
||||
.. _jbig2:
|
||||
|
||||
============================
|
||||
Installing the JBIG2 encoder
|
||||
============================
|
||||
|
||||
Most Linux distributions do not include a JBIG2 encoder since JBIG2 encoding was patented for a long time. All known JBIG2 US patents have expired as of 2017, but it is possible that unknown patents exist.
|
||||
Most Linux distributions do not include a JBIG2 encoder since JBIG2
|
||||
encoding was patented for a long time. All known JBIG2 US patents have
|
||||
expired as of 2017, but it is possible that unknown patents exist.
|
||||
|
||||
JBIG2 encoding is recommended for OCRmyPDF and is used to losslessly create smaller PDFs. If JBIG2 encoding not available, lower quality encodings will be used.
|
||||
JBIG2 encoding is recommended for OCRmyPDF and is used to losslessly
|
||||
create smaller PDFs. If JBIG2 encoding is not available, lower quality
|
||||
encodings will be used.
|
||||
|
||||
JBIG2 decoding is not patented and is performed automatically by most PDF viewers. It is widely supported has been part of the PDF specification since 2001.
|
||||
JBIG2 decoding is not patented and is performed automatically by most
|
||||
PDF viewers. It is widely supported and has been part of the PDF
|
||||
specification since 2001.
|
||||
|
||||
On macOS, Homebrew packages jbig2enc and OCRmyPDF includes it by default. The Docker image for OCRmyPDF also builds its own JBIG2 encoder from source.
|
||||
On macOS, Homebrew packages jbig2enc and OCRmyPDF includes it by
|
||||
default. The Docker image for OCRmyPDF also builds its own JBIG2 encoder
|
||||
from source.
|
||||
|
||||
For all other Linux, you must build a JBIG2 encoder from source:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
git clone https://github.com/agl/jbig2enc
|
||||
cd jbig2enc
|
||||
./autogen.sh
|
||||
./configure && make
|
||||
[sudo] make install
|
||||
git clone https://github.com/agl/jbig2enc
|
||||
cd jbig2enc
|
||||
./autogen.sh
|
||||
./configure && make
|
||||
[sudo] make install
|
||||
|
||||
.. _jbig2-lossy:
|
||||
|
||||
Lossy mode JBIG2
|
||||
----------------
|
||||
================
|
||||
|
||||
OCRmyPDF provides lossy mode JBIG2 as an advanced feature. Users should `review the technical concerns with JBIG2 in lossy mode <https://abbyy.technology/en:kb:tip:jbig2_compression_and_ocr>`_ and decide if this feature is acceptable for their use case.
|
||||
OCRmyPDF provides lossy mode JBIG2 as an advanced feature. Users should
|
||||
`review the technical concerns with JBIG2 in lossy
|
||||
mode <https://abbyy.technology/en:kb:tip:jbig2_compression_and_ocr>`__
|
||||
and decide if this feature is acceptable for their use case.
|
||||
|
||||
JBIG2 lossy mode does achieve higher compression ratios than any other monochrome (bitonal) compression technology; for large text documents the savings are considerable. JBIG2 lossless still gives great compression ratios and is a major improvement over the older CCITT G4 standard. As explained above, there is some risk of substitution errors.
|
||||
JBIG2 lossy mode does achieve higher compression ratios than any other
|
||||
monochrome (bitonal) compression technology; for large text documents
|
||||
the savings are considerable. JBIG2 lossless still gives great
|
||||
compression ratios and is a major improvement over the older CCITT G4
|
||||
standard. As explained above, there is some risk of substitution errors.
|
||||
|
||||
To turn on JBIG2 lossy mode, add the argument ``--jbig2-lossy``. ``--optimize {1,2,3}`` are necessary for the argument to take effect also required. Also, a JBIG2 encoder must be installed as described in the previous section.
|
||||
To turn on JBIG2 lossy mode, add the argument ``--jbig2-lossy``.
|
||||
One of ``--optimize {1,2,3}`` is also required for the argument to take
|
||||
effect. Also, a JBIG2 encoder must be installed as described in
|
||||
the previous section.
|
||||
|
||||
*Due to an oversight, ocrmypdf v7.0 and v7.1 used lossy mode by default.*
|
||||
*Due to an oversight, ocrmypdf v7.0 and v7.1 used lossy mode by
|
||||
default.*
|
||||
|
||||
@@ -1,16 +1,29 @@
|
||||
.. _lang-packs:
|
||||
|
||||
====================================
|
||||
Installing additional language packs
|
||||
====================================
|
||||
|
||||
OCRmyPDF uses Tesseract for OCR, and relies on its language packs for languages other than English.
|
||||
OCRmyPDF uses Tesseract for OCR, and relies on its language packs for all languages.
|
||||
On most platforms, English is installed with Tesseract by default, but not always.
|
||||
|
||||
Tesseract supports `most languages <https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages>`_.
|
||||
Tesseract supports `most
|
||||
languages <https://github.com/tesseract-ocr/tesseract/blob/master/doc/tesseract.1.asc#languages>`__.
|
||||
Languages are identified by standardized three-letter codes (called ISO 639-2 Alpha-3).
|
||||
Tesseract's documentation also lists the three-letter code for your language.
|
||||
Some are anglicized, e.g. Spanish is ``spa`` rather than ``esp``, while others
|
||||
are not, e.g. German is ``deu`` and French is ``fra``.
|
||||
|
||||
For Linux users, you can often find packages that provide language packs:
|
||||
After you have installed a language pack, you can use it with ``ocrmypdf -l <language>``,
|
||||
for example ``ocrmypdf -l spa``. For multilingual documents, you can specify
|
||||
all languages to be expected, e.g. ``ocrmypdf -l eng+fra`` for English and French.
|
||||
English is assumed by default unless other language(s) are specified.
|
||||
|
||||
For Linux users, you can often find packages that provide language
|
||||
packs:
|
||||
|
||||
Debian and Ubuntu users
|
||||
-----------------------
|
||||
=======================
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -20,11 +33,13 @@ Debian and Ubuntu users
|
||||
# Install Chinese Simplified language pack
|
||||
apt-get install tesseract-ocr-chi-sim
|
||||
|
||||
You can then pass the ``-l LANG`` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple
|
||||
languages can be requested using either ``-l eng+fre`` (English and French) or ``-l eng -l fre``.
|
||||
You can then pass the ``-l LANG`` argument to OCRmyPDF to give a hint as
|
||||
to what languages it should search for. Multiple languages can be
|
||||
requested using either ``-l eng+fra`` (English and French) or
|
||||
``-l eng -l fra``.
|
||||
|
||||
Fedora users
|
||||
------------
|
||||
============
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -34,16 +49,28 @@ Fedora users
|
||||
# Install Chinese Simplified language pack
|
||||
dnf install tesseract-langpack-chi_sim
|
||||
|
||||
You can then pass the ``-l LANG`` argument to OCRmyPDF to give a hint as to
|
||||
what languages it should search for. Multiple languages can be requested using
|
||||
either ``-l eng+fre`` (English and French) or ``-l eng -l fre``.
|
||||
You can then pass the ``-l LANG`` argument to OCRmyPDF to give a hint as
|
||||
to what languages it should search for. Multiple languages can be
|
||||
requested using either ``-l eng+fra`` (English and French) or
|
||||
``-l eng -l fra``.
|
||||
|
||||
macOS users
|
||||
-----------
|
||||
===========
|
||||
|
||||
You can install additional language packs by :ref:`installing Tesseract using Homebrew with all language packs <macos-all-languages>`.
|
||||
You can install additional language packs by
|
||||
:ref:`installing Tesseract using Homebrew with all language packs <macos-all-languages>`.
|
||||
|
||||
Docker users
|
||||
------------
|
||||
============
|
||||
|
||||
Users of the OCRmyPDF Docker image should install language packs into a derived Docker image as :ref:`described in that section <docker-lang-packs>`.
|
||||
Users of the OCRmyPDF Docker image should install language packs into a
|
||||
derived Docker image as
|
||||
:ref:`described in that section <docker-lang-packs>`.
|
||||
|
||||
Windows users
|
||||
=============
|
||||
|
||||
The Tesseract installer provided by Chocolatey currently includes only English language.
|
||||
To install other languages, download the respective language pack (``.traineddata`` file)
|
||||
from https://github.com/tesseract-ocr/tessdata/ and place it in
|
||||
``C:\Program Files\Tesseract-OCR\tessdata`` (or wherever Tesseract OCR is installed).
|
||||
|
||||
75
docs/optimizer.rst
Normal file
75
docs/optimizer.rst
Normal file
@@ -0,0 +1,75 @@
|
||||
================
|
||||
PDF optimization
|
||||
================
|
||||
|
||||
OCRmyPDF includes an image-oriented PDF optimizer. By default, the optimizer
|
||||
runs with safe settings with the goal of improving compression at no loss of
|
||||
quality. At higher optimization levels, lossy optimizations may be applied and
|
||||
tuned. Optimization occurs after OCR, and only if OCR succeeded. It does not
|
||||
perform other possible optimizations such as deduplicating resources,
|
||||
consolidating fonts, simplifying vector drawings, or anything of that nature.
|
||||
|
||||
Optimization ranges from ``-O0`` through ``-O3``, where ``0`` disables
|
||||
optimization and ``3`` implements all options. ``1``, the default, performs only
|
||||
safe and lossless optimizations. (This is similar to GCC's optimization
|
||||
parameter.) The exact type of optimizations performed will vary over time.
|
||||
|
||||
PDF optimization requires third-party, optional tools for certain optimizations.
|
||||
If these are not installed or cannot be found by OCRmyPDF, optimization will not
|
||||
be as good.
|
||||
|
||||
Optimizations that always occur
|
||||
================================
|
||||
|
||||
OCRmyPDF will automatically replace obsolete or inferior compression schemes
|
||||
such as RLE or LZW with superior schemes such as Deflate, and convert
|
||||
monochrome images to CCITT G4. Since this is harmless it always occurs and there
|
||||
is no way to disable it. Other non-image compressed objects are compressed as
|
||||
well.
|
||||
|
||||
Fast web view
|
||||
=============
|
||||
|
||||
OCRmyPDF automatically optimizes PDFs for "fast web view" in Adobe Acrobat's
|
||||
parlance, or equivalently, linearizes PDFs so that the resources they reference
|
||||
are presented in the order a viewer needs them for sequential display. This
|
||||
reduces the latency of viewing a PDF both online and from local storage. This
|
||||
actually slightly increases the file size.
|
||||
|
||||
To disable this optimization and all others, use ``ocrmypdf --optimize 0 ...``
|
||||
or the shorthand ``-O0``.
|
||||
|
||||
Lossless optimizations
|
||||
======================
|
||||
|
||||
At optimization level ``-O1`` (the default), OCRmyPDF will also attempt lossless
|
||||
image optimization.
|
||||
|
||||
If a JBIG2 encoder is available, then monochrome images will be converted to
|
||||
JBIG2, with the potential for huge savings on large black and white images,
|
||||
since JBIG2 is far more efficient than any other monochrome (bi-level)
|
||||
compression. (All known US patents related to JBIG2 have probably expired, but
|
||||
it remains the responsibility of the user to supply a JBIG2 encoder such as
|
||||
`jbig2enc <https://github.com/agl/jbig2enc>`__. OCRmyPDF does not implement
|
||||
JBIG2 encoding on its own.)
|
||||
|
||||
OCRmyPDF currently does not attempt to recompress losslessly compressed objects
|
||||
more aggressively.
|
||||
|
||||
Lossy optimizations
|
||||
===================
|
||||
|
||||
At optimization level ``-O2`` and ``-O3``, OCRmyPDF will attempt some lossy
|
||||
image optimization.
|
||||
|
||||
If ``pngquant`` is installed, OCRmyPDF will use it to quantize paletted
|
||||
images to reduce their size.
|
||||
|
||||
The quality of JPEGs may be lowered, on the assumption that a lower quality
|
||||
image may be suitable for storage after OCR.
|
||||
|
||||
It is not possible to optimize all image types. Uncommon image types may be
|
||||
skipped by the optimizer.
|
||||
|
||||
OCRmyPDF provides :ref:`lossy mode JBIG2 <jbig2-lossy>` as an advanced feature
|
||||
that additionally requires the argument ``--jbig2-lossy``.
|
||||
161
docs/pdfsecurity.rst
Normal file
161
docs/pdfsecurity.rst
Normal file
@@ -0,0 +1,161 @@
|
||||
===================
|
||||
PDF security issues
|
||||
===================
|
||||
|
||||
OCRmyPDF should only be used on PDFs you trust. It is not designed to
|
||||
protect you against malware.
|
||||
|
||||
Recognizing that many users have an interest in handling PDFs and
|
||||
applying OCR to PDFs they did not generate themselves, this article
|
||||
discusses the security implications of PDFs and how users can protect
|
||||
themselves.
|
||||
|
||||
The disclaimer applies: this software has no warranties of any kind.
|
||||
|
||||
PDFs may contain malware
|
||||
========================
|
||||
|
||||
PDF is a rich, complex file format. The official PDF 1.7 specification,
|
||||
ISO 32000:2008, is hundreds of pages long and references several annexes
|
||||
each of which are similar in length. PDFs can contain video, audio, XML,
|
||||
JavaScript and other programming, and forms. In some cases, they can
|
||||
open internet connections to pre-selected URLs. All of these are possible
|
||||
attack vectors.
|
||||
|
||||
In short, PDFs `may contain
|
||||
viruses <https://security.stackexchange.com/questions/64052/can-a-pdf-file-contain-a-virus>`__.
|
||||
|
||||
This
|
||||
`article <https://theinvisiblethings.blogspot.ca/2013/02/converting-untrusted-pdfs-into-trusted.html>`__
|
||||
describes a high-paranoia method which allows potentially hostile PDFs
|
||||
to be viewed and rasterized safely in a disposable virtual machine. A
|
||||
trusted PDF created in this manner is converted to images and loses all
|
||||
information making it searchable and losing all compression. OCRmyPDF
|
||||
could be used to restore searchability.
|
||||
|
||||
How OCRmyPDF processes PDFs
|
||||
===========================
|
||||
|
||||
OCRmyPDF must open and interpret your PDF in order to insert an OCR
|
||||
layer. First, it runs all PDFs through
|
||||
`pikepdf <https://github.com/pikepdf/pikepdf>`__, a library based on
|
||||
`qpdf <https://github.com/qpdf/qpdf>`__, a program that repairs PDFs
|
||||
with syntax errors. This is done because, in the author's experience, a
|
||||
significant number of PDFs in the wild especially those created by
|
||||
scanners are not well-formed files. qpdf makes it more likely that
|
||||
OCRmyPDF will succeed, but offers no security guarantees. qpdf is also
|
||||
used to split the PDF into single page PDFs.
|
||||
|
||||
Finally, OCRmyPDF rasterizes each page of the PDF using
|
||||
`Ghostscript <http://ghostscript.com/>`__ in ``-dSAFER`` mode.
|
||||
|
||||
Depending on the options specified, OCRmyPDF may graft the OCR layer
|
||||
into the existing PDF or it may essentially reconstruct ("re-fry") a
|
||||
visually identical PDF that may be quite different at the binary level.
|
||||
That said, OCRmyPDF is not a tool designed for sanitizing PDFs.
|
||||
|
||||
.. _ocr-service:
|
||||
|
||||
Using OCRmyPDF online or as a service
|
||||
=====================================
|
||||
|
||||
OCRmyPDF is not designed for use as a public web service where a
|
||||
malicious user could upload a chosen PDF. In particular, it is not
|
||||
necessarily secure against PDF malware or PDFs that cause denial of
|
||||
service. OCRmyPDF relies on Ghostscript, and therefore, if deployed
|
||||
online one should be prepared to comply with Ghostscript's Affero GPL
|
||||
license, and any other licenses.
|
||||
|
||||
Setting aside these concerns, a side effect of OCRmyPDF is it may
|
||||
incidentally sanitize PDFs that contain certain types of malware. It
|
||||
repairs the PDF with pikepdf/libqpdf, which could correct malformed PDF
|
||||
structures that are part of an attack. When PDF/A output is selected
|
||||
(the default), the input PDF is partially reconstructed by Ghostscript.
|
||||
When ``--force-ocr`` is used, all pages are rasterized and reconverted
|
||||
to PDF, which could remove malware in embedded images.
|
||||
|
||||
OCRmyPDF should be relatively safe to use in a trusted intranet, with
|
||||
some considerations:
|
||||
|
||||
Limiting CPU usage
|
||||
------------------
|
||||
|
||||
OCRmyPDF will attempt to use all available CPUs and storage, so
|
||||
executing ``nice ocrmypdf`` or limiting the number of jobs with the
|
||||
``-j`` argument may ensure the server remains available. Another option
|
||||
would be to run OCRmyPDF jobs inside a Docker container, a virtual machine,
|
||||
or a cloud instance, which can impose its own limits on CPU usage and be
|
||||
terminated "from orbit" if it fails to complete.
|
||||
|
||||
Temporary storage requirements
|
||||
------------------------------
|
||||
|
||||
OCRmyPDF will use a large amount of temporary storage for its work,
|
||||
proportional to the total number of pixels needed to rasterize the PDF.
|
||||
The raster image of an 8.5×11" color page at 300 DPI takes 25 MB
|
||||
uncompressed; OCRmyPDF saves its intermediates as PNG, but that still
|
||||
means it requires about 9 MB per intermediate based on average
|
||||
compression ratios. Multiple intermediates per page are also required,
|
||||
depending on the command line given. A rule of thumb would be to allow
|
||||
100 MB of temporary storage per page in a file – meaning that small
|
||||
cloud servers or small VM partitions should be provisioned with plenty
|
||||
of extra space, if say, a 500 page file might be sent.
|
||||
|
||||
To check temporary storage usage on actual files, run
|
||||
``ocrmypdf -k ...`` which will preserve and print the path to temporary
|
||||
storage when the job is done.
|
||||
|
||||
To change where temporary files are stored, change the ``TMPDIR``
|
||||
environment variable for ocrmypdf's environment. (Python's
|
||||
``tempfile.gettempdir()`` returns the root directory in which temporary
|
||||
files will be stored.) For example, one could redirect ``TMPDIR`` to a
|
||||
large RAM disk to avoid wear on HDD/SSD and potentially improve
|
||||
performance. On Amazon Web Services, ``TMPDIR`` can be set to `ephemeral
|
||||
storage <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html>`__.
|
||||
|
||||
Timeouts
|
||||
--------
|
||||
|
||||
To prevent excessively long OCR jobs consider setting
|
||||
``--tesseract-timeout`` and/or ``--skip-big`` arguments. ``--skip-big``
|
||||
is particularly helpful if your PDFs include documents such as reports
|
||||
on standard page sizes with large images attached - often large images
|
||||
are not worth OCR'ing anyway.
|
||||
|
||||
Commercial alternatives
|
||||
-----------------------
|
||||
|
||||
The author also provides professional services that include OCR and
|
||||
building databases around PDFs, and is happy to provide consultation.
|
||||
|
||||
Abbyy Cloud OCR is a viable commercial alternative with a web services
|
||||
API.
|
||||
|
||||
Password protection, digital signatures and certification
|
||||
=========================================================
|
||||
|
||||
Password protected PDFs usually have two passwords, an owner and user
|
||||
password. When the user password is set to empty, PDF readers will open
|
||||
the file automatically and mark it as "(SECURED)". While not as
|
||||
reliable as a digital signature, this indicates that whoever set the
|
||||
password approved of the file at that time. When the user password is
|
||||
set, the document cannot be viewed without the password.
|
||||
|
||||
Either way, OCRmyPDF does not remove passwords from PDFs and exits with
|
||||
an error on encountering them.
|
||||
|
||||
``qpdf`` can remove passwords. If the owner and user password are set, a
|
||||
password is required for ``qpdf``. If only the owner password is set, then the
|
||||
password can be stripped, even if one does not have the owner password.
|
||||
|
||||
After OCR is applied, password protection is not permitted on PDF/A
|
||||
documents but the file can be converted to regular PDF.
|
||||
|
||||
Many programs exist which are capable of inserting an image of someone's
|
||||
signature. On its own, this offers no security guarantees. It is trivial
|
||||
to remove the signature image and apply it to other files. This practice
|
||||
offers no real security.
|
||||
|
||||
Important documents can be digitally signed and certified to attest to
|
||||
their authorship. OCRmyPDF cannot do this. Open source tools such as
|
||||
pdfbox (Java) have this capability as does Adobe Acrobat.
|
||||
22
docs/performance.rst
Normal file
22
docs/performance.rst
Normal file
@@ -0,0 +1,22 @@
|
||||
===========
|
||||
Performance
|
||||
===========
|
||||
|
||||
Some users have noticed that current versions of OCRmyPDF do not run as quickly
|
||||
as some older versions (specifically 6.x and older). This is because OCRmyPDF
|
||||
added image optimization as a postprocessing step, and it is enabled by default.
|
||||
|
||||
Speed
|
||||
=====
|
||||
|
||||
If running OCRmyPDF quickly is your main goal, you can use settings such as:
|
||||
|
||||
* ``--optimize 0`` to disable file size optimization
|
||||
* ``--output-type pdf`` to disable PDF/A generation
|
||||
* ``--fast-web-view 0`` to disable fast web view optimization
|
||||
* ``--skip-big`` to skip large images, if some pages have large images
|
||||
|
||||
You can also avoid:
|
||||
|
||||
* ``--force-ocr``
|
||||
* Image preprocessing
|
||||
200
docs/plugins.rst
Normal file
200
docs/plugins.rst
Normal file
@@ -0,0 +1,200 @@
|
||||
=======
|
||||
Plugins
|
||||
=======
|
||||
|
||||
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL
|
||||
NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and
|
||||
"OPTIONAL" in this document are to be interpreted as described in
|
||||
RFC 2119.
|
||||
|
||||
You can use plugins to customize the behavior of OCRmyPDF at certain points of
|
||||
interest.
|
||||
|
||||
Currently, it is possible to:
|
||||
|
||||
- add new command line arguments
|
||||
- override the decision for whether or not to perform OCR on a particular file
|
||||
- modify the image that is about to be sent for OCR
|
||||
- modify the page image before it is converted to PDF
|
||||
- replace the Tesseract OCR with another OCR engine that has similar behavior
|
||||
- replace Ghostscript with another PDF to image converter (rasterizer) or
|
||||
PDF/A generator
|
||||
|
||||
OCRmyPDF plugins are based on the Python ``pluggy`` package and conform to its
|
||||
conventions. Note that plugins installed as setuptools entrypoints are
|
||||
not checked currently, because OCRmyPDF assumes you may not want to enable
|
||||
plugins for all files.
|
||||
|
||||
Script plugins
|
||||
==============
|
||||
|
||||
Script plugins may be called from the command line, by specifying the name of a file.
|
||||
Script plugins may be convenient for informal or "one-off" plugins, when a certain
|
||||
batch of files needs a special processing step for example.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --plugin ocrmypdf_example_plugin.py input.pdf output.pdf
|
||||
|
||||
Multiple plugins may be installed by issuing the ``--plugin`` argument multiple times.
|
||||
|
||||
Packaged plugins
|
||||
================
|
||||
|
||||
Packaged plugins may be installed into the same virtual environment as OCRmyPDF
|
||||
is installed into. They may be invoked using Python standard module naming.
|
||||
If you are intending to distribute a plugin, please package it.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --plugin ocrmypdf_fancypants.pockets.contents input.pdf output.pdf
|
||||
|
||||
OCRmyPDF does not automatically import plugins, because the assumption is that
|
||||
plugins affect different files differently and you may not want them activated
|
||||
all the time. The command line or ``ocrmypdf.ocr(plugin='...')`` must call
|
||||
for them.
|
||||
|
||||
Third parties that wish to distribute packages for ocrmypdf should package them
|
||||
as packaged plugins, and these modules should begin with the name ``ocrmypdf_``
|
||||
similar to ``pytest`` packages such as ``pytest-cov`` (the package) and
|
||||
``pytest_cov`` (the module).
|
||||
|
||||
.. note::
|
||||
|
||||
We strongly recommend plugin authors name their plugins with the prefix
|
||||
``ocrmypdf-`` (for the package name on PyPI) and ``ocrmypdf_`` (for the
|
||||
module), just like pytest plugins.
|
||||
|
||||
Setuptools plugins
|
||||
==================
|
||||
|
||||
You can also create a plugin that OCRmyPDF will always automatically load if both are
|
||||
installed in the same virtual environment, using a setuptools entrypoint.
|
||||
|
||||
Your package's ``setup.py`` would need to contain the following, for a plugin
|
||||
named ``ocrmypdf-exampleplugin``:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
# sample ./setup.py file
|
||||
from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="ocrmypdf-exampleplugin",
|
||||
packages=["exampleplugin"],
|
||||
# the following makes a plugin available to ocrmypdf
|
||||
entry_points={"ocrmypdf": ["exampleplugin = exampleplugin.pluginmodule"]},
|
||||
)
|
||||
|
||||
Plugin requirements
|
||||
===================
|
||||
|
||||
OCRmyPDF generally uses multiple worker processes. When a new worker is started,
|
||||
Python will import all plugins again, including all plugins that were imported earlier.
|
||||
This means that the global state of a plugin in one worker will not be shared with
|
||||
other workers. As such, plugin hook implementations should be stateless, relying
|
||||
only on their inputs. Hook implementations may use their input parameters to
|
||||
obtain a reference to shared state prepared by another hook implementation.
|
||||
Plugins must expect that other instances of the plugin will be running
|
||||
simultaneously.
|
||||
|
||||
The ``context`` object that is passed to many hooks can be used to share information
|
||||
about a file being worked on. Plugins must write private, plugin-specific data to
|
||||
a subfolder named ``{options.work_folder}/ocrmypdf-plugin-name``. Plugins MAY
|
||||
read and write files in ``options.work_folder``, but should be aware that their
|
||||
semantics are subject to change.
|
||||
|
||||
OCRmyPDF will delete ``options.work_folder`` when it has finished OCRing
|
||||
a file, unless invoked with ``--keep-temporary-files``.
|
||||
|
||||
The documentation for some plugin hooks contains a detailed description of the
|
||||
execution context in which they will be called.
|
||||
|
||||
Plugins should be prepared to work whether executed in worker threads or worker
|
||||
processes. Generally, OCRmyPDF uses processes, but has a semi-hidden threaded
|
||||
argument that simplifies debugging.
|
||||
|
||||
|
||||
Plugin hooks
|
||||
============
|
||||
|
||||
A plugin may provide the following hooks. Hooks must be decorated with
|
||||
``ocrmypdf.hookimpl``, for example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ocrmypdf import hookimpl
|
||||
|
||||
@hookimpl
|
||||
def add_options(parser):
|
||||
pass
|
||||
|
||||
The following is a complete list of hooks that are available, and when
|
||||
they are called.
|
||||
|
||||
.. _firstresult:
|
||||
|
||||
**Note on firstresult hooks**
|
||||
|
||||
If multiple plugins install implementations for this hook, they will be called in
|
||||
the reverse of the order in which they are installed (i.e., last plugin wins).
|
||||
When each hook implementation is called in order, the first implementation that
|
||||
returns a value other than ``None`` will "win" and prevent execution of all other
|
||||
hooks. As such, you cannot "chain" a series of plugin filters together in this
|
||||
way. Instead, a single hook implementation should be responsible for any such
|
||||
chaining operations.
|
||||
|
||||
Custom command line arguments
|
||||
-----------------------------
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.add_options
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.check_options
|
||||
|
||||
Execution and progress reporting
|
||||
--------------------------------
|
||||
|
||||
.. autoclass:: ocrmypdf.pluginspec.Executor
|
||||
:members:
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.get_logging_console
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.get_executor
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.get_progressbar_class
|
||||
|
||||
Applying special behavior before processing
|
||||
-------------------------------------------
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.validate
|
||||
|
||||
PDF page to image
|
||||
-----------------
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.rasterize_pdf_page
|
||||
|
||||
Modifying intermediate images
|
||||
-----------------------------
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.filter_ocr_image
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.filter_page_image
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.filter_pdf_page
|
||||
|
||||
OCR engine
|
||||
----------
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.get_ocr_engine
|
||||
|
||||
.. autoclass:: ocrmypdf.pluginspec.OcrEngine
|
||||
:members:
|
||||
|
||||
.. automethod:: __str__
|
||||
|
||||
.. autoclass:: ocrmypdf.pluginspec.OrientationConfidence
|
||||
|
||||
PDF/A production
|
||||
----------------
|
||||
|
||||
.. autofunction:: ocrmypdf.pluginspec.generate_pdfa
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,79 +0,0 @@
|
||||
PDF security issues
|
||||
===================
|
||||
|
||||
OCRmyPDF should only be used on PDFs you trust. It is not designed to protect you against malware.
|
||||
|
||||
Recognizing that many users have an interest in handling PDFs and applying OCR to PDFs they did not generate themselves, this article discusses the security implications of PDFs and how users can protect themselves.
|
||||
|
||||
The disclaimer applies: this software has no warranties of any kind.
|
||||
|
||||
PDFs may contain malware
|
||||
------------------------
|
||||
|
||||
PDF is a rich, complex file format. The official PDF 1.7 specification, ISO 32000:2008, is hundreds of pages long and references several annexes each of which are similar in length. PDFs can contain video, audio, XML, JavaScript and other programming, and forms. In some cases, they can open internet connections to pre-selected URLs. All of these possible attack vectors.
|
||||
|
||||
In short, PDFs `may contain viruses <https://security.stackexchange.com/questions/64052/can-a-pdf-file-contain-a-virus>`_.
|
||||
|
||||
This `article <https://theinvisiblethings.blogspot.ca/2013/02/converting-untrusted-pdfs-into-trusted.html>`_ describes a high-paranoia method which allows potentially hostile PDFs to be viewed and rasterized safely in a disposable virtual machine. A trusted PDF created in this manner is converted to images and loses all information making it searchable and losing all compression. OCRmyPDF could be used restore searchability.
|
||||
|
||||
How OCRmyPDF processes PDFs
|
||||
---------------------------
|
||||
|
||||
OCRmyPDF must open and interpret your PDF in order to insert an OCR layer. First, it runs all PDFs through `pikepdf <https://github.com/pikepdf/pikepdf>`_, a library based on `qpdf <https://github.com/qpdf/qpdf>`_, a program that repairs PDFs with syntax errors. This is done because, in the author's experience, a significant number of PDFs in the wild especially those created by scanners are not well-formed files. qpdf makes it more likely that OCRmyPDF will succeed, but offers no security guarantees. qpdf is also used to split the PDF into single page PDFs.
|
||||
|
||||
Finally, OCRmyPDF rasterizes each page of the PDF using `Ghostscript <http://ghostscript.com/>`_ in ``-dSAFER`` mode.
|
||||
|
||||
Depending on the options specified, OCRmyPDF may graft the OCR layer into the existing PDF or it may essentially reconstruct ("re-fry") a visually identical PDF that may be quite different at the binary level. That said, OCRmyPDF is not a tool designed for sanitizing PDFs.
|
||||
|
||||
.. _ocr-service:
|
||||
|
||||
Using OCRmyPDF online or as a service
|
||||
-------------------------------------
|
||||
|
||||
OCRmyPDF is not designed for use as a public web service where a malicious user could upload a chosen PDF. In particular, it is not necessarily secure against PDF malware or PDFs that cause denial of service. OCRmyPDF relies on Ghostscript, and therefore, if deployed online one should be prepared to comply with Ghostscript's Affero GPL license, OCRmyPDF's GPL license, and any other licenses.
|
||||
|
||||
Setting aside these concerns, a side effect of OCRmyPDF is it may incidentally sanitize PDFs that contain certain types of malware. It runs ``qpdf`` to repair the PDF, which could correct malformed PDF structures that are part of an attack. When PDF/A output is selected (the default), the input PDF is partially reconstructed by Ghostscript. When ``--force-ocr`` is used, all pages are rasterized and reconverted to PDF, which could remove malware in embedded images.
|
||||
|
||||
OCRmyPDF should be relatively safe to use in a trusted intranet, with some considerations:
|
||||
|
||||
Limiting CPU usage
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
OCRmyPDF will attempt to use all available CPUs and storage, so executing ``nice ocrmypdf`` or limiting the number of jobs with the ``-j`` argument may ensure the server remains available. Another option would be run OCRmyPDF jobs inside a Docker container, a virtual machine, or a cloud instance, which can impose its own limits on CPU usage and be terminated "from orbit" if it fails to complete.
|
||||
|
||||
Temporary storage requirements
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
OCRmyPDF will use a large amount of temporary storage for its work, proportional to the total number of pixels needed to rasterize the PDF. The raster image of an 8.5×11" color page at 300 DPI takes 25 MB uncompressed; OCRmyPDF saves its intermediates as PNG, but that still means it requires about 9 MB per intermediate based on average compression ratios. Multiple intermediates per page are also required, depending on the command line given. A rule of thumb would be to allow 100 MB of temporary storage per page in a file – meaning that small cloud servers or small VM partitions should be provisioned with plenty of extra space if, say, a 500 page file might be sent.
|
||||
|
||||
To check temporary storage usage on actual files, run ``ocrmypdf -k ...`` which will preserve and print the path to temporary storage when the job is done.
|
||||
|
||||
To change where temporary files are stored, change the ``TMPDIR`` environment variable for ocrmypdf's environment. (Python's ``tempfile.gettempdir()`` returns the root directory in which temporary files will be stored.) For example, one could redirect ``TMPDIR`` to a large RAM disk to avoid wear on HDD/SSD and potentially improve performance. On Amazon Web Services, ``TMPDIR`` can be set to `ephemeral storage <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html>`_.
|
||||
|
||||
Timeouts
|
||||
^^^^^^^^
|
||||
|
||||
To prevent excessively long OCR jobs consider setting ``--tesseract-timeout`` and/or ``--skip-big`` arguments. ``--skip-big`` is particularly helpful if your PDFs include documents such as reports on standard page sizes with large images attached - often large images are not worth OCR'ing anyway.
|
||||
|
||||
Commercial alternatives
|
||||
^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The author also provides professional services that include OCR and building databases around PDFs, and is happy to provide consultation.
|
||||
|
||||
Abbyy Cloud OCR is a viable commercial alternative with a web services API.
|
||||
|
||||
|
||||
Password protection, digital signatures and certification
|
||||
---------------------------------------------------------
|
||||
|
||||
Password protected PDFs usually have two passwords, an owner password and a user password. When the user password is set to empty, PDF readers will open the file automatically and mark it as "(SECURED)". While not as reliable as a digital signature, this indicates that whoever set the password approved of the file at that time. When the user password is set, the document cannot be viewed without the password.
|
||||
|
||||
Either way, OCRmyPDF does not remove passwords from PDFs and exits with an error on encountering them.
|
||||
|
||||
``qpdf``, one of OCRmyPDF's dependencies, can remove passwords. If the owner and user password are set, a password is required for ``qpdf``. If only the owner password is set, then the password can be stripped, even if one does not have the owner password.
|
||||
|
||||
After OCR is applied, password protection is not permitted on PDF/A documents but the file can be converted to regular PDF.
|
||||
|
||||
Many programs exist which are capable of inserting an image of someone's signature. On its own, this offers no security guarantees. It is trivial to remove the signature image and apply it to other files. This practice offers no real security.
|
||||
|
||||
Important documents can be digitally signed and certified to attest to their authorship. OCRmyPDF cannot do this. Open source tools such as pdfbox (Java) have this capability as does Adobe Acrobat.
|
||||
68
misc/batch.py
Normal file
68
misc/batch.py
Normal file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2016 findingorder: https://github.com/findingorder
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
# This script must be edited to meet your needs.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
import ocrmypdf
|
||||
|
||||
# pylint: disable=logging-format-interpolation
|
||||
# pylint: disable=logging-not-lazy
|
||||
|
||||
# Directory that holds this script; the default log file is written next to it.
script_dir = os.path.dirname(os.path.realpath(__file__))
print(script_dir + '/batch.py: Start')

# Optional positional arguments: [start_dir] [log_file].
start_dir = sys.argv[1] if len(sys.argv) > 1 else '.'
log_file = sys.argv[2] if len(sys.argv) > 2 else script_dir + '/ocr-tree.log'

# The log file is truncated on every run (filemode 'w').
logging.basicConfig(
    filename=log_file,
    filemode='w',
    format='%(asctime)s %(message)s',
    level=logging.INFO,
)

ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)
|
||||
|
||||
# Walk the tree below start_dir and OCR every PDF in place.
#
# The working directory is intentionally left alone. os.walk() documents that
# the caller must not change the current directory between iterations when a
# relative start path is used, so the previous os.chdir(dir_name) made every
# directory after the first resolve against the wrong location.
for dir_name, subdirs, file_list in os.walk(start_dir):
    logging.info(dir_name + '\n')
    for filename in file_list:
        file_ext = os.path.splitext(filename)[1]
        # Compare case-insensitively so that "SCAN.PDF" is processed as well.
        if file_ext.lower() != '.pdf':
            continue
        full_path = os.path.join(dir_name, filename)
        print(full_path)
        # Input and output are the same path: the file is OCRed in place.
        result = ocrmypdf.ocr(full_path, full_path, deskew=True)
        if result == ocrmypdf.ExitCode.already_done_ocr:
            print("Skipped document because it already contained text")
        elif result == ocrmypdf.ExitCode.ok:
            print("OCR complete")
        # Record the raw result for every file, including the unhandled cases.
        logging.info(result)
|
||||
@@ -1,9 +1,58 @@
|
||||
# ocrmypdf completion -*- shell-script -*-
|
||||
|
||||
# Copyright 2019 Frank Pille
|
||||
# Copyright 2020 Alex Willner
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
set -o errexit
|
||||
|
||||
_ocrmypdf()
|
||||
{
|
||||
local cur prev cword words split
|
||||
_init_completion -s || return
|
||||
|
||||
# Homebrew on Macs have version 1.3 of bash-completion which doesn't include - see #502
|
||||
if declare -F _init_completions >/dev/null 2>&1; then
|
||||
_init_completion -s || return
|
||||
else
|
||||
COMPREPLY=()
|
||||
_get_comp_words_by_ref cur prev words cword
|
||||
fi
|
||||
|
||||
if [[ $cur == -* ]]; then
|
||||
COMPREPLY=( $( compgen -W '--language --image-dpi --output-type
|
||||
--sidecar --version --jobs --quiet --verbose --title --author
|
||||
--subject --keywords --rotate-pages --remove-background --deskew
|
||||
--clean --clean-final --unpaper-args --oversample --remove-vectors
|
||||
--threshold --force-ocr --skip-text --redo-ocr
|
||||
--skip-big --jpeg-quality --png-quality --jbig2-lossy
|
||||
--max-image-mpixels --tesseract-config --tesseract-pagesegmode
|
||||
--help --tesseract-oem --pdf-renderer --tesseract-timeout
|
||||
--rotate-pages-threshold --pdfa-image-compression --user-words
|
||||
--user-patterns --keep-temporary-files --output-type
|
||||
--no-progress-bar --pages --fast-web-view' \
|
||||
-- "$cur" ) )
|
||||
return
|
||||
else
|
||||
_filedir
|
||||
return
|
||||
fi
|
||||
|
||||
case $prev in
|
||||
--version|-h|--help)
|
||||
@@ -49,39 +98,23 @@ _ocrmypdf()
|
||||
return
|
||||
;;
|
||||
-v|--verbose)
|
||||
COMPREPLY=( $( compgen -W '{1..9}' -- "$cur" ) ) # max level ?
|
||||
COMPREPLY=( $( compgen -W '{0..2}' -- "$cur" ) ) # max level ?
|
||||
return
|
||||
;;
|
||||
--tesseract-pagesegmode)
|
||||
COMPREPLY=( $( compgen -W '{1..13}' -- "$cur" ) )
|
||||
return
|
||||
;;
|
||||
--sidecar|--title|--author|--subject|--keywords|--unpaper-args)
|
||||
--sidecar|--title|--author|--subject|--keywords|--unpaper-args|--pages|--fast-web-view)
|
||||
# argument required but no completions available
|
||||
return
|
||||
;;
|
||||
esac
|
||||
|
||||
$split && return
|
||||
|
||||
if [[ $cur == -* ]]; then
|
||||
COMPREPLY=( $( compgen -W '--language --image-dpi --output-type
|
||||
--sidecar --version --jobs --quiet --verbose --title --author
|
||||
--subject --keywords --rotate-pages --remove-background --deskew
|
||||
--clean --clean-final --unpaper-args --oversample --remove-vectors
|
||||
--mask-barcodes --threshold --force-ocr --skip-text --redo-ocr
|
||||
--skip-big --jpeg-quality --png-quality --jbig2-lossy
|
||||
--max-image-mpixels --tesseract-config --tesseract-pagesegmode
|
||||
--help --tesseract-oem --pdf-renderer --tesseract-timeout
|
||||
--rotate-pages-threshold --pdfa-image-compression --user-words
|
||||
--user-patterns --keep-temporary-files --flowchart --output-type' \
|
||||
-- "$cur" ) )
|
||||
return
|
||||
else
|
||||
_filedir
|
||||
return
|
||||
fi
|
||||
} &&
|
||||
complete -F _ocrmypdf ocrmypdf
|
||||
|
||||
set +o errexit
|
||||
|
||||
# ex: filetype=sh
|
||||
|
||||
@@ -1,15 +1,34 @@
|
||||
complete -c ocrmypdf -l version
|
||||
complete -c ocrmypdf -l help
|
||||
# Copyright 2020 James R. Barlow
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
complete -c ocrmypdf -l sidecar -r -d "write OCR to text file"
|
||||
complete -c ocrmypdf -s q -l quiet
|
||||
complete -c ocrmypdf -x -n '__fish_is_first_arg' -l version
|
||||
complete -c ocrmypdf -x -n '__fish_is_first_arg' -s h -s "?" -l help
|
||||
|
||||
complete -c ocrmypdf -r -l sidecar -d "write OCR to text file"
|
||||
complete -c ocrmypdf -x -s q -l quiet
|
||||
|
||||
complete -c ocrmypdf -s r -l rotate-pages -d "rotate pages to correct orientation"
|
||||
complete -c ocrmypdf -s d -l deskew -d "fix small horizontal alignment skew"
|
||||
complete -c ocrmypdf -s c -l clean -d "clean document images before OCR"
|
||||
complete -c ocrmypdf -s i -l clean-final -d "clean document images and keep result"
|
||||
complete -c ocrmypdf -l remove-vectors -d "don't send vector objects to OCR"
|
||||
complete -c ocrmypdf -l mask-barcodes -d "mask barcodes from OCR"
|
||||
complete -c ocrmypdf -l threshold -d "threshold images before OCR"
|
||||
|
||||
complete -c ocrmypdf -s f -l force-ocr -d "OCR documents that already have printable text"
|
||||
@@ -18,8 +37,14 @@ complete -c ocrmypdf -l redo-ocr -d "redo OCR on any pages that seem to have OCR
|
||||
|
||||
complete -c ocrmypdf -s k -l keep-temporary-files -d "keep temporary files (debug)"
|
||||
|
||||
complete -c ocrmypdf -x -s l -l language -d 'language'
|
||||
complete -c ocrmypdf -x -s l -l language -a '(tesseract --list-langs)'
|
||||
function __fish_ocrmypdf_languages
    # Print the language codes reported by `tesseract --list-langs`, one per
    # line, for use as completion candidates.
    #
    # stderr is discarded with `2>`; the older `^` form is deprecated in fish
    # and is passed to the command as a literal argument when the
    # stderr-nocaret feature is active.
    set -l langs (tesseract --list-langs 2>/dev/null)
    # Command substitution already yields one list element per output line, so
    # no further splitting is needed. The first element is skipped, as before
    # (presumably tesseract's header line - confirm against tesseract output).
    for lang in $langs[2..-1]
        echo $lang
    end
end
|
||||
complete -c ocrmypdf -x -s l -l language -a '(__fish_ocrmypdf_languages)' -d "language"
|
||||
|
||||
complete -c ocrmypdf -x -l image-dpi -d "assume this DPI if input image DPI is unknown"
|
||||
|
||||
@@ -34,10 +59,11 @@ complete -c ocrmypdf -x -l output-type -a '(__fish_ocrmypdf_output_type)' -d "se
|
||||
|
||||
function __fish_ocrmypdf_pdf_renderer
|
||||
echo -e "auto\t"(_ "auto select PDF renderer")
|
||||
echo -e "hocr\t"(_ "use hocr renderer")
|
||||
echo -e "hocr\t"(_ "use hOCR renderer")
|
||||
echo -e "hocrdebug\t"(_ "uses hOCR renderer in debug mode, showing recognized text")
|
||||
echo -e "sandwich\t"(_ "use sandwich renderer")
|
||||
end
|
||||
complete -c ocrmypdf -x -l pdf-render -a '(__fish_ocrmypdf_pdf_renderer)' -d "select PDF renderer options"
|
||||
complete -c ocrmypdf -x -l pdf-renderer -a '(__fish_ocrmypdf_pdf_renderer)' -d "select PDF renderer options"
|
||||
|
||||
function __fish_ocrmypdf_optimize
|
||||
echo -e "0\t"(_ "do not optimize")
|
||||
@@ -47,8 +73,23 @@ function __fish_ocrmypdf_optimize
|
||||
end
|
||||
complete -c ocrmypdf -x -s O -l optimize -a '(__fish_ocrmypdf_optimize)' -d "select optimization level"
|
||||
|
||||
function __fish_ocrmypdf_verbose
|
||||
echo -e "0\t"(_ "standard output messages")
|
||||
echo -e "1\t"(_ "troubleshooting output messages")
|
||||
echo -e "2\t"(_ "debugging output messages")
|
||||
end
|
||||
complete -c ocrmypdf -x -s v -l verbose -a '(__fish_ocrmypdf_verbose)' -d "set verbosity level"
|
||||
|
||||
complete -c ocrmypdf -x -l no-progress-bar -d "disable the progress bar"
|
||||
|
||||
function __fish_ocrmypdf_pdfa_compression
|
||||
echo -e "auto\t"(_ "let Ghostscript decide how to compress images")
|
||||
echo -e "jpeg\t"(_ "convert color and grayscale images to JPEG")
|
||||
echo -e "lossless\t"(_ "convert color and grayscale images to lossless (PNG)")
|
||||
end
|
||||
complete -c ocrmypdf -x -l pdfa-image-compression -a '(__fish_ocrmypdf_pdfa_compression)' -d "set PDF/A image compression options"
|
||||
|
||||
complete -c ocrmypdf -x -s j -l jobs -d "how many worker processes to use"
|
||||
complete -c ocrmypdf -x -s v -a '(seq 1 9)'
|
||||
complete -c ocrmypdf -x -l title -d "set metadata"
|
||||
complete -c ocrmypdf -x -l author -d "set metadata"
|
||||
complete -c ocrmypdf -x -l subject -d "set metadata"
|
||||
@@ -60,11 +101,39 @@ complete -c ocrmypdf -x -l jpeg-quality -d "JPEG quality [0..100]"
|
||||
complete -c ocrmypdf -x -l png-quality -d "PNG quality [0..100]"
|
||||
complete -c ocrmypdf -x -l jbig2-lossy -d "enable lossy JBIG2 (see docs)"
|
||||
complete -c ocrmypdf -x -l max-image-mpixels -d "image decompression bomb threshold"
|
||||
complete -c ocrmypdf -x -l pages -d "apply OCR to only the specified pages"
|
||||
complete -c ocrmypdf -x -l tesseract-config -d "set custom tesseract config file"
|
||||
complete -c ocrmypdf -x -l tesseract-pagesegmode -d "set tesseract --psm"
|
||||
complete -c ocrmypdf -x -l tesseract-oem -d "set tesseract --oem"
|
||||
|
||||
function __fish_ocrmypdf_tesseract_pagesegmode
|
||||
echo -e "0\t"(_ "orientation and script detection (OSD) only")
|
||||
echo -e "1\t"(_ "automatic page segmentation with OSD")
|
||||
echo -e "2\t"(_ "automatic page segmentation, but no OSD, or OCR")
|
||||
echo -e "3\t"(_ "fully automatic page segmentation, but no OSD (default)")
|
||||
echo -e "4\t"(_ "assume a single column of text of variable sizes")
|
||||
echo -e "5\t"(_ "assume a single uniform block of vertically aligned text")
|
||||
echo -e "6\t"(_ "assume a single uniform block of text")
|
||||
echo -e "7\t"(_ "treat the image as a single text line")
|
||||
echo -e "8\t"(_ "treat the image as a single word")
|
||||
echo -e "9\t"(_ "treat the image as a single word in a circle")
|
||||
echo -e "10\t"(_ "treat the image as a single character")
|
||||
echo -e "11\t"(_ "sparse text - find as much text as possible in no particular order")
|
||||
echo -e "12\t"(_ "sparse text with OSD")
|
||||
echo -e "13\t"(_ "raw line - treat the image as a single text line")
|
||||
end
|
||||
complete -c ocrmypdf -x -l tesseract-pagesegmode -a '(__fish_ocrmypdf_tesseract_pagesegmode)' -d "set tesseract --psm"
|
||||
|
||||
function __fish_ocrmypdf_tesseract_oem
|
||||
echo -e "0\t"(_ "legacy engine only")
|
||||
echo -e "1\t"(_ "neural nets LSTM engine only")
|
||||
echo -e "2\t"(_ "legacy + LSTM engines")
|
||||
echo -e "3\t"(_ "default, based on what is available")
|
||||
end
|
||||
complete -c ocrmypdf -x -l tesseract-oem -a '(__fish_ocrmypdf_tesseract_oem)' -d "set tesseract --oem"
|
||||
complete -c ocrmypdf -x -l tesseract-timeout -d "maximum number of seconds to wait for OCR"
|
||||
complete -c ocrmypdf -x -l rotate-pages-threshold -d "page rotation confidence"
|
||||
complete -c ocrmypdf -x -l pdfa-image-compression -a 'auto jpeg lossless' -d "set PDF/A image compression options"
|
||||
|
||||
complete -c ocrmypdf -x -a "(__fish_complete_suffix .pdf)"
|
||||
complete -c ocrmypdf -r -l user-words -d "specify location of user words file"
|
||||
complete -c ocrmypdf -r -l user-patterns -d "specify location of user patterns file"
|
||||
# --fast-web-view takes a size threshold in MB (description typo "if ... if" fixed).
complete -c ocrmypdf -x -l fast-web-view -d "if file size is above this amount in MB, linearize PDF"
|
||||
|
||||
complete -c ocrmypdf -x -a "(__fish_complete_suffix .pdf; __fish_complete_suffix .PDF; __fish_complete_suffix .jpg; __fish_complete_suffix .png)"
|
||||
|
||||
15
misc/docker-compose.example.yml
Normal file
15
misc/docker-compose.example.yml
Normal file
@@ -0,0 +1,15 @@
|
||||
---
|
||||
version: "3.3"
|
||||
services:
|
||||
ocrmypdf:
|
||||
restart: always
|
||||
container_name: ocrmypdf
|
||||
image: jbarlow83/ocrmypdf
|
||||
volumes:
|
||||
- "/media/scan:/input"
|
||||
- "/mnt/scan:/output"
|
||||
environment:
|
||||
- OCR_OUTPUT_DIRECTORY_YEAR_MONTH=0
|
||||
user: "<SET TO YOUR USER ID>:<SET TO YOUR GROUP ID>"
|
||||
entrypoint: python3
|
||||
command: watcher.py
|
||||
84
misc/example_plugin.py
Normal file
84
misc/example_plugin.py
Normal file
@@ -0,0 +1,84 @@
|
||||
# © 2020 James R Barlow: https://github.com/jbarlow83
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
"""
|
||||
An example of an OCRmyPDF plugin.
|
||||
|
||||
This plugin adds two new command line arguments
|
||||
--grayscale-ocr: converts the image to grayscale before performing OCR on it
|
||||
(This is occasionally useful for images whose color confounds OCR. It only
|
||||
affects the image shown to OCR. The image is not saved.)
|
||||
--mono-page: converts all pages in the output file to black and white
|
||||
|
||||
To use this from the command line:
|
||||
ocrmypdf --plugin path/to/example_plugin.py --mono-page input.pdf output.pdf
|
||||
|
||||
To use this as an API:
|
||||
import ocrmypdf
|
||||
ocrmypdf.ocr('input.pdf', 'output.pdf',
|
||||
plugins=['path/to/example_plugin.py'], mono_page=True
|
||||
)
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from ocrmypdf import hookimpl
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@hookimpl
def add_options(parser):
    """Register this plugin's two boolean command line switches on ``parser``.

    Both switches are ``store_true`` flags, so they default to ``False``.
    """
    for switch in ('--grayscale-ocr', '--mono-page'):
        parser.add_argument(switch, action='store_true')
|
||||
|
||||
|
||||
@hookimpl
def prepare(options):
    """Hook stub: this example plugin does nothing with ``options`` here."""
|
||||
|
||||
|
||||
@hookimpl
def validate(pdfinfo, options):
    """Hook stub: this example plugin performs no validation."""
|
||||
|
||||
|
||||
@hookimpl
def filter_ocr_image(page, image):
    """Return the image to use for OCR, in grayscale if ``--grayscale-ocr`` is set.

    When the option is off the image is returned unchanged.
    """
    if not page.options.grayscale_ocr:
        return image

    log.info("graying")
    # 'L' is Pillow's 8-bit grayscale mode.
    return image.convert('L')
|
||||
|
||||
|
||||
@hookimpl
def filter_page_image(page, image_filename):
    """Rewrite the page image and return the path of the file to use.

    Without ``--mono-page`` the image is re-saved under the same name with a
    ``.jpg`` suffix and that new path is returned. With ``--mono-page`` the
    image is converted to 1-bit mode, written back over ``image_filename``, and
    the original path is returned.
    """
    if not page.options.mono_page:
        jpeg_path = image_filename.with_suffix('.jpg')
        with Image.open(image_filename) as source:
            source.save(jpeg_path)
        return jpeg_path

    with Image.open(image_filename) as source:
        # '1' is Pillow's 1-bit black and white mode.
        mono = source.convert('1')
        mono.save(image_filename)
    return image_filename
|
||||
92
misc/synology.py
Normal file
92
misc/synology.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/bin/env python3
|
||||
# Copyright 2017 github.com/Enantiomerie
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
# This script must be edited to meet your needs.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
# pylint: disable=logging-format-interpolation
|
||||
# pylint: disable=logging-not-lazy
|
||||
|
||||
# Each run writes a fresh, timestamped log file next to this script.
script_dir = os.path.dirname(os.path.realpath(__file__))
timestamp = time.strftime("%Y-%m-%d-%H%M_")
log_file = f'{script_dir}/{timestamp}ocrmypdf.log'
logging.basicConfig(
    filename=log_file,
    filemode='w',
    format='%(asctime)s %(message)s',
    level=logging.INFO,
)

# First positional argument: directory to scan (defaults to the current one).
start_dir = sys.argv[1] if len(sys.argv) > 1 else '.'
|
||||
|
||||
# Second positional argument: the archive directory. OCR results are moved
# there and the untouched originals go into its "no_ocr" subdirectory.
# Checked up front so a missing argument fails before any OCR work is done.
if len(sys.argv) < 3:
    logging.error('Missing archive directory argument')
    sys.exit('usage: synology.py <start_dir> <archive_dir>')
full_path_ocr_archive = sys.argv[2]
full_path_archive = sys.argv[2] + '/no_ocr'
# Make sure both targets are directories; otherwise shutil.move() would rename
# the first file to "no_ocr" instead of moving it into that directory.
os.makedirs(full_path_archive, exist_ok=True)

# The working directory is intentionally left alone. os.walk() requires that
# the caller does not chdir between iterations when start_dir is relative, so
# every file is addressed through its path joined onto dir_name instead.
for dir_name, subdirs, file_list in os.walk(start_dir):
    logging.info(dir_name)
    for filename in file_list:
        file_stem, file_ext = os.path.splitext(filename)
        if file_ext != '.pdf':
            continue
        full_path = os.path.join(dir_name, filename)
        timestamp_ocr = time.strftime("%Y-%m-%d-%H%M_OCR_")
        filename_ocr = timestamp_ocr + file_stem + '.pdf'
        full_path_ocr = os.path.join(dir_name, filename_ocr)
        # Build the command for PDF processing.
        # The script is run as the root user via cron.
        # The PDF is streamed through the container on stdin/stdout ("-" "-").
        cmd = [
            'docker',
            'run',
            '--rm',
            '-i',
            'jbarlow83/ocrmypdf',
            '--deskew',
            '-',
            '-',
        ]
        logging.info(cmd)
        with open(full_path, 'rb') as input_file, open(
            full_path_ocr, 'wb'
        ) as output_file:
            proc = subprocess.run(
                cmd,
                stdin=input_file,
                stdout=output_file,
                stderr=subprocess.PIPE,
                check=False,
                text=True,
                errors='ignore',
            )
        logging.info(proc.stderr)
        # check=False keeps the batch going; record failures so they can be
        # found in the log instead of passing silently.
        if proc.returncode != 0:
            logging.warning(
                'ocrmypdf exited with status %d for %s', proc.returncode, full_path
            )
        os.chmod(full_path_ocr, 0o664)
        os.chmod(full_path, 0o664)
        shutil.move(full_path_ocr, full_path_ocr_archive)
        shutil.move(full_path, full_path_archive)
logging.info('Finished.\n')
|
||||
166
misc/watcher.py
Normal file
166
misc/watcher.py
Normal file
@@ -0,0 +1,166 @@
|
||||
# Copyright (C) 2019 Ian Alexander: https://github.com/ianalexander
|
||||
# Copyright (C) 2020 James R Barlow: https://github.com/jbarlow83
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import pikepdf
|
||||
from watchdog.events import PatternMatchingEventHandler
|
||||
from watchdog.observers import Observer
|
||||
from watchdog.observers.polling import PollingObserver
|
||||
|
||||
import ocrmypdf
|
||||
|
||||
# pylint: disable=logging-format-interpolation
|
||||
|
||||
def getenv_bool(name, default=''):
    """Read environment variable ``name`` as a boolean flag.

    An unset or empty variable and the explicit negatives ``0``, ``false``,
    ``no``, ``n`` and ``off`` (any case, surrounding whitespace ignored) are
    False. Every other value is True, which keeps earlier configurations that
    used arbitrary non-empty strings working.

    Plain ``bool(os.getenv(...))`` is not used because it treats "0" and
    "false" as True.
    """
    value = os.getenv(name, default).strip().lower()
    return value not in ('', '0', 'false', 'no', 'n', 'off')


# All settings are read once, at import time, from OCR_* environment variables.
INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')
OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')
OUTPUT_DIRECTORY_YEAR_MONTH = getenv_bool('OCR_OUTPUT_DIRECTORY_YEAR_MONTH')
ON_SUCCESS_DELETE = getenv_bool('OCR_ON_SUCCESS_DELETE')
DESKEW = getenv_bool('OCR_DESKEW')
# Extra keyword arguments for ocrmypdf.ocr(), given as a JSON object.
OCR_JSON_SETTINGS = json.loads(os.getenv('OCR_JSON_SETTINGS', '{}'))
POLL_NEW_FILE_SECONDS = int(os.getenv('OCR_POLL_NEW_FILE_SECONDS', '1'))
USE_POLLING = getenv_bool('OCR_USE_POLLING')
LOGLEVEL = os.getenv('OCR_LOGLEVEL', 'INFO')
PATTERNS = ['*.pdf', '*.PDF']

log = logging.getLogger('ocrmypdf-watcher')
|
||||
|
||||
|
||||
def get_output_dir(root, basename):
    """Return the path under ``root`` where the output file should be written.

    When ``OUTPUT_DIRECTORY_YEAR_MONTH`` is enabled the file goes into
    ``root/<year>/<month>``, using today's date, and that directory is created
    if necessary. Otherwise the file goes directly into ``root``.

    Args:
        root: Base output directory.
        basename: File name of the output file.

    Returns:
        A ``pathlib.Path`` for the output file.
    """
    # Honor the root argument in both modes. The flat mode used to ignore it
    # and read the OUTPUT_DIRECTORY global instead.
    target_dir = Path(root)
    if OUTPUT_DIRECTORY_YEAR_MONTH:
        today = datetime.today()
        target_dir = target_dir / str(today.year) / f'{today.month:02d}'
        # exist_ok makes a separate exists() check unnecessary.
        target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir / basename
|
||||
|
||||
|
||||
def wait_for_file_ready(file_path):
    """Wait until ``file_path`` can be opened by pikepdf.

    The file-created event can arrive (Docker in particular may publish it
    early) before the file is fully written to disk, which makes pikepdf fail.
    Up to five attempts are made, sleeping ``POLL_NEW_FILE_SECONDS`` after each
    failed one.

    Returns:
        True once the file opens successfully, False if every attempt failed.
    """
    for _attempt in range(5):
        try:
            pdf = pikepdf.open(file_path)
        except (FileNotFoundError, pikepdf.PdfError) as e:
            log.info(f"File {file_path} is not ready yet")
            log.debug("Exception was", exc_info=e)
            time.sleep(POLL_NEW_FILE_SECONDS)
            continue
        # Opened fine: only readability was being checked, so close it again.
        pdf.close()
        return True

    return False
|
||||
|
||||
|
||||
def execute_ocrmypdf(file_path):
    """Run OCR on a newly detected file and write the result to the output tree.

    The output location comes from ``get_output_dir``. Processing is abandoned
    if the input never becomes readable. After a successful run the input file
    is deleted when ``ON_SUCCESS_DELETE`` is enabled.

    Args:
        file_path: Path of the input PDF (string or path-like).
    """
    file_path = Path(file_path)
    output_path = get_output_dir(OUTPUT_DIRECTORY, file_path.name)

    log.info("-" * 20)
    log.info(f'New file: {file_path}. Waiting until fully loaded...')
    if not wait_for_file_ready(file_path):
        log.info(f"Gave up waiting for {file_path} to become ready")
        return
    log.info(f'Attempting to OCRmyPDF to: {output_path}')
    exit_code = ocrmypdf.ocr(
        input_file=file_path,
        output_file=output_path,
        deskew=DESKEW,
        **OCR_JSON_SETTINGS,
    )
    # Report failures explicitly. Previously a non-zero exit code was logged
    # as "OCR is done", indistinguishable from success.
    if exit_code != 0:
        log.warning(f'OCR finished with exit code {exit_code}: {file_path}')
        return
    if ON_SUCCESS_DELETE:
        log.info(f'OCR is done. Deleting: {file_path}')
        file_path.unlink()
    else:
        log.info('OCR is done')
|
||||
|
||||
|
||||
class HandleObserverEvent(PatternMatchingEventHandler):
    """Event handler that runs OCR on each newly created matching file."""

    def on_any_event(self, event):
        """Start OCR for 'created' events and ignore every other event type."""
        if event.event_type != 'created':
            return
        execute_ocrmypdf(event.src_path)
|
||||
|
||||
|
||||
def main():
    """Configure logging, validate the settings and watch the input directory.

    Exits with status 1 if ``OCR_JSON_SETTINGS`` tries to set the input or
    output file, since both are supplied per event. Otherwise the observer is
    started and the function idles until interrupted from the keyboard.
    """
    verbosity = (
        ocrmypdf.Verbosity.debug if LOGLEVEL == 'DEBUG' else ocrmypdf.Verbosity.default
    )
    ocrmypdf.configure_logging(verbosity=verbosity, manage_root_logger=True)
    log.setLevel(LOGLEVEL)
    log.info(
        f"Starting OCRmyPDF watcher with config:\n"
        f"Input Directory: {INPUT_DIRECTORY}\n"
        f"Output Directory: {OUTPUT_DIRECTORY}\n"
        f"Output Directory Year & Month: {OUTPUT_DIRECTORY_YEAR_MONTH}"
    )
    log.debug(
        f"INPUT_DIRECTORY: {INPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY: {OUTPUT_DIRECTORY}\n"
        f"OUTPUT_DIRECTORY_YEAR_MONTH: {OUTPUT_DIRECTORY_YEAR_MONTH}\n"
        f"ON_SUCCESS_DELETE: {ON_SUCCESS_DELETE}\n"
        f"DESKEW: {DESKEW}\n"
        f"ARGS: {OCR_JSON_SETTINGS}\n"
        f"POLL_NEW_FILE_SECONDS: {POLL_NEW_FILE_SECONDS}\n"
        f"USE_POLLING: {USE_POLLING}\n"
        f"LOGLEVEL: {LOGLEVEL}"
    )

    # input_file/output_file are passed explicitly to ocrmypdf.ocr() for each
    # event, so they must not also arrive through the JSON settings.
    if any(key in OCR_JSON_SETTINGS for key in ('input_file', 'output_file')):
        log.error('OCR_JSON_SETTINGS should not specify input file or output file')
        sys.exit(1)

    handler = HandleObserverEvent(patterns=PATTERNS)
    observer = PollingObserver() if USE_POLLING else Observer()
    observer.schedule(handler, INPUT_DIRECTORY, recursive=True)
    observer.start()
    try:
        # The observer does the work; this loop only idles until Ctrl-C.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -23,21 +23,22 @@ to emphasize that SaaS deployments should make sure they comply with
|
||||
Ghostscript's license as well as OCRmyPDF's.
|
||||
"""
|
||||
|
||||
import os
|
||||
import shlex
|
||||
from subprocess import PIPE, run
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from flask import (
|
||||
Flask,
|
||||
Response,
|
||||
flash,
|
||||
request,
|
||||
redirect,
|
||||
url_for,
|
||||
abort,
|
||||
flash,
|
||||
redirect,
|
||||
request,
|
||||
send_from_directory,
|
||||
url_for,
|
||||
)
|
||||
from subprocess import run, PIPE
|
||||
from tempfile import TemporaryDirectory
|
||||
from werkzeug.utils import secure_filename
|
||||
import os
|
||||
import shlex
|
||||
|
||||
app = Flask(__name__)
|
||||
app.secret_key = "secret"
|
||||
@@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[tool.black]
|
||||
line-length = 88
|
||||
py36 = true
|
||||
target-version = ["py36", "py37", "py38"]
|
||||
skip-string-normalization = true
|
||||
include = '\.pyi?$'
|
||||
exclude = '''
|
||||
@@ -28,5 +28,6 @@ exclude = '''
|
||||
| docs
|
||||
| misc
|
||||
| \.egg-info
|
||||
| src/ocrmypdf/lib/_leptonica.py
|
||||
)/
|
||||
'''
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
check-manifest >= 0.35
|
||||
twine >= 1.8.1
|
||||
coverage >= 4.5
|
||||
GitPython == 2.1.3
|
||||
@@ -1,13 +1,12 @@
|
||||
# requirements.txt can be used to replicate the developer's build environment
|
||||
# setup.py lists a separate set of requirements that are looser to simplify
|
||||
# installation
|
||||
chardet == 3.0.4
|
||||
cffi == 1.12.2
|
||||
img2pdf == 0.3.3
|
||||
pdfminer.six == 20181108
|
||||
pikepdf == 1.3.0
|
||||
Pillow >= 5.0.0, != 5.1.0 ; sys_platform == "darwin"
|
||||
pycparser == 2.19
|
||||
python-xmp-toolkit == 2.0.1
|
||||
reportlab == 3.5.13
|
||||
ruffus == 2.8.1
|
||||
cffi == 1.14.5
|
||||
coloredlogs == 15.0 # technically optional
|
||||
img2pdf == 0.4.0
|
||||
pdfminer.six == 20201018
|
||||
pikepdf == 2.10.0
|
||||
pluggy == 0.13.1
|
||||
Pillow == 8.1.2
|
||||
reportlab == 3.5.66
|
||||
tqdm == 4.59.0
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
pytest >= 4.4.1, < 5
|
||||
pytest-helpers-namespace >= 2019.1.8
|
||||
pytest-xdist == 1.28.0
|
||||
pytest-cov >= 2.6.1
|
||||
python-xmp-toolkit # requires apt-get install libexempi3
|
||||
pytest >= 6.0.0
|
||||
pytest-xdist >= 2.2.0
|
||||
pytest-cov >= 2.11.1
|
||||
python-xmp-toolkit == 2.0.1 # requires apt-get install libexempi3
|
||||
# or brew install exempi
|
||||
PyPDF2 >= 1.26.0
|
||||
#PyMuPDF == 1.13.4 # optional
|
||||
|
||||
1
requirements/watcher.txt
Normal file
1
requirements/watcher.txt
Normal file
@@ -0,0 +1 @@
|
||||
watchdog == 1.0.2
|
||||
1
requirements/webservice.txt
Normal file
1
requirements/webservice.txt
Normal file
@@ -0,0 +1 @@
|
||||
Flask >= 1, < 2
|
||||
33
setup.cfg
33
setup.cfg
@@ -1,5 +1,5 @@
|
||||
[bdist_wheel]
|
||||
python-tag = py35
|
||||
python-tag = py36
|
||||
|
||||
[aliases]
|
||||
test=pytest
|
||||
@@ -13,6 +13,10 @@ norecursedirs = lib .pc .git output cache resources
|
||||
testpaths = tests
|
||||
filterwarnings =
|
||||
ignore:.*XMLParser.*:DeprecationWarning
|
||||
markers =
|
||||
slow
|
||||
addopts =
|
||||
-n auto
|
||||
|
||||
[isort]
|
||||
multi_line_output=3
|
||||
@@ -20,6 +24,33 @@ include_trailing_comma=True
|
||||
force_grid_wrap=0
|
||||
use_parentheses=True
|
||||
line_length=88
|
||||
known_first_party = ocrmypdf
|
||||
known_third_party = PIL,_cffi_backend,cffi,flask,img2pdf,pdfminer,pikepdf,pkg_resources,pluggy,pytest,reportlab,setuptools,sphinx_rtd_theme,tqdm,watchdog,werkzeug
|
||||
|
||||
[metadata]
|
||||
license_file = LICENSE
|
||||
|
||||
[coverage:paths]
|
||||
source =
|
||||
src/ocrmypdf
|
||||
|
||||
[coverage:run]
|
||||
branch = true
|
||||
parallel = true
|
||||
concurrency = multiprocessing
|
||||
|
||||
[coverage:report]
|
||||
# Regexes for lines to exclude from consideration
|
||||
exclude_lines =
|
||||
# Have to re-enable the standard pragma
|
||||
pragma: no cover
|
||||
|
||||
# Don't complain if tests don't hit defensive assertion code:
|
||||
raise AssertionError
|
||||
raise NotImplementedError
|
||||
|
||||
# Don't complain if non-runnable code isn't run:
|
||||
if 0:
|
||||
if False:
|
||||
if __name__ == .__main__.:
|
||||
if TYPE_CHECKING:
|
||||
|
||||
75
setup.py
75
setup.py
@@ -2,53 +2,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# © 2015 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import sys
|
||||
|
||||
from setuptools import find_packages, setup
|
||||
|
||||
if sys.version_info < (3, 6):
|
||||
print("Python 3.6 or newer is required", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
from setuptools import setup, find_packages
|
||||
from subprocess import STDOUT, check_output, CalledProcessError
|
||||
from collections.abc import Mapping
|
||||
import re
|
||||
|
||||
# pylint: disable=w0613
|
||||
|
||||
|
||||
command = next((arg for arg in sys.argv[1:] if not arg.startswith('-')), '')
|
||||
if command.startswith('install') or command in [
|
||||
'check',
|
||||
'test',
|
||||
'nosetests',
|
||||
'easy_install',
|
||||
]:
|
||||
forced = '--force' in sys.argv
|
||||
if forced:
|
||||
print("The argument --force is deprecated. Please discontinue use.")
|
||||
|
||||
|
||||
if 'upload' in sys.argv[1:]:
|
||||
print('Use twine to upload the package - setup.py upload is insecure')
|
||||
sys.exit(1)
|
||||
|
||||
tests_require = open('requirements/test.txt', encoding='utf-8').read().splitlines()
|
||||
|
||||
|
||||
@@ -64,20 +32,23 @@ setup(
|
||||
long_description_content_type='text/markdown',
|
||||
url='https://github.com/jbarlow83/OCRmyPDF',
|
||||
author='James R. Barlow',
|
||||
author_email='jim@purplerock.ca',
|
||||
author_email='james@purplerock.ca',
|
||||
packages=find_packages('src', exclude=["tests", "tests.*"]),
|
||||
package_dir={'': 'src'},
|
||||
keywords=['PDF', 'OCR', 'optical character recognition', 'PDF/A', 'scanning'],
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: Console",
|
||||
"Intended Audience :: End Users/Desktop",
|
||||
"Intended Audience :: Science/Research",
|
||||
"Intended Audience :: System Administrators",
|
||||
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
||||
"License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
|
||||
"Operating System :: MacOS :: MacOS X",
|
||||
"Operating System :: Microsoft :: Windows :: Windows 10",
|
||||
"Operating System :: POSIX",
|
||||
"Operating System :: POSIX :: BSD",
|
||||
"Operating System :: POSIX :: Linux",
|
||||
@@ -88,28 +59,26 @@ setup(
|
||||
python_requires=' >= 3.6',
|
||||
setup_requires=[ # can be removed whenever we can drop pip 9 support
|
||||
'cffi >= 1.9.1', # to build the leptonica module
|
||||
'pytest-runner', # to enable python setup.py test
|
||||
'setuptools_scm', # so that version will work
|
||||
'setuptools_scm_git_archive', # enable version from github tarballs
|
||||
],
|
||||
use_scm_version={'version_scheme': 'post-release'},
|
||||
cffi_modules=['src/ocrmypdf/lib/compile_leptonica.py:ffibuilder'],
|
||||
install_requires=[
|
||||
'chardet >= 3.0.4, < 4', # unlisted requirement of pdfminer.six 20181108
|
||||
'cffi >= 1.9.1', # must be a setup and install requirement
|
||||
'img2pdf >= 0.3.0, < 0.4', # pure Python, so track HEAD closely
|
||||
'pdfminer.six == 20181108 ; sys_platform != "darwin"',
|
||||
'pikepdf >= 1.3.0, < 2',
|
||||
'Pillow >= 4.0.0, != 5.1.0 ; sys_platform == "darwin"',
|
||||
# Pillow < 4 has BytesIO/TIFF bug w/img2pdf 0.2.3
|
||||
# block 5.1.0, broken wheels
|
||||
'reportlab >= 3.3.0', # oldest released version with sane image handling
|
||||
'ruffus >= 2.7.0',
|
||||
'coloredlogs >= 14.0', # strictly optional
|
||||
'img2pdf >= 0.3.0, < 0.5', # pure Python, so track HEAD closely
|
||||
'pdfminer.six >= 20191110, != 20200720, <= 20201018',
|
||||
"pikepdf >= 2.10.0",
|
||||
'Pillow >= 8.1.2',
|
||||
'pluggy >= 0.13.0, < 1.0',
|
||||
'reportlab >= 3.5.66',
|
||||
'setuptools',
|
||||
'tqdm >= 4',
|
||||
],
|
||||
extras_require={'pdfminer': ['pdfminer.six == 20181108']},
|
||||
tests_require=tests_require,
|
||||
entry_points={'console_scripts': ['ocrmypdf = ocrmypdf.__main__:run_pipeline']},
|
||||
package_data={'ocrmypdf': ['data/sRGB.icc']},
|
||||
entry_points={'console_scripts': ['ocrmypdf = ocrmypdf.__main__:run']},
|
||||
package_data={'ocrmypdf': ['data/sRGB.icc', 'py.typed']},
|
||||
include_package_data=True,
|
||||
zip_safe=False,
|
||||
project_urls={
|
||||
|
||||
35
src/ocrmypdf/RELEASE.md
Normal file
35
src/ocrmypdf/RELEASE.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Release checklist
|
||||
|
||||
## Patch release
|
||||
|
||||
- Check `pytest`
|
||||
|
||||
- Update release notes
|
||||
|
||||
## Minor release
|
||||
|
||||
## Major release
|
||||
|
||||
- Run `pre-commit autoupdate`
|
||||
|
||||
- Check README.md
|
||||
|
||||
- Check setup.py
|
||||
|
||||
- Are classifiers up to date?
|
||||
- Is `python_requires` correct?
|
||||
- Python 3.6 reaches EOL in December 2021 (2021-12). Could drop support then.
|
||||
- Can we tighten any `install_requires` dependencies?
|
||||
|
||||
- Search for old version shims we can remove
|
||||
|
||||
- "shim"
|
||||
- ` pikepdf.__version__`
|
||||
|
||||
- Search for deprecation: search all files for deprec*, etc.
|
||||
|
||||
- Check requirements/*
|
||||
|
||||
- Delete `tests/cache`, do `pytest --runslow`, and update cache.
|
||||
|
||||
- Do `pytest --cov-report html`
|
||||
@@ -1,46 +1,32 @@
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
import pkg_resources
|
||||
|
||||
PROGRAM_NAME = 'ocrmypdf'
|
||||
from pluggy import HookimplMarker as _HookimplMarker
|
||||
|
||||
# Official PEP 396
|
||||
__version__ = pkg_resources.get_distribution('ocrmypdf').version
|
||||
|
||||
VERSION = __version__
|
||||
|
||||
from .exceptions import (
|
||||
ExitCode,
|
||||
from ocrmypdf import helpers, hocrtransform, leptonica, pdfa, pdfinfo
|
||||
from ocrmypdf._concurrent import Executor
|
||||
from ocrmypdf._jobcontext import PageContext, PdfContext
|
||||
from ocrmypdf._version import PROGRAM_NAME, __version__
|
||||
from ocrmypdf.api import Verbosity, configure_logging, ocr
|
||||
from ocrmypdf.exceptions import (
|
||||
BadArgsError,
|
||||
PdfMergeFailedError,
|
||||
MissingDependencyError,
|
||||
UnsupportedImageFormatError,
|
||||
DpiError,
|
||||
OutputFileAccessError,
|
||||
PriorOcrFoundError,
|
||||
InputFileError,
|
||||
SubprocessOutputError,
|
||||
EncryptedPdfError,
|
||||
ExitCode,
|
||||
ExitCodeException,
|
||||
InputFileError,
|
||||
MissingDependencyError,
|
||||
OutputFileAccessError,
|
||||
PdfMergeFailedError,
|
||||
PriorOcrFoundError,
|
||||
SubprocessOutputError,
|
||||
TesseractConfigError,
|
||||
UnsupportedImageFormatError,
|
||||
)
|
||||
from ocrmypdf.pluginspec import OcrEngine, OrientationConfidence
|
||||
|
||||
from . import helpers
|
||||
from . import hocrtransform
|
||||
from . import leptonica
|
||||
from . import pdfa
|
||||
from . import pdfinfo
|
||||
hookimpl = _HookimplMarker('ocrmypdf')
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
133
src/ocrmypdf/_concurrent.py
Normal file
133
src/ocrmypdf/_concurrent.py
Normal file
@@ -0,0 +1,133 @@
|
||||
# © 2020 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
import threading
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Callable, Iterable, Optional
|
||||
|
||||
|
||||
def _task_noop(*_args, **_kwargs):
|
||||
return
|
||||
|
||||
|
||||
class NullProgressBar:
    """Progress bar stand-in that accepts the usual calls but displays nothing."""

    def __init__(self, **kwargs):
        # Keyword arguments are accepted and discarded.
        return

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returning False lets an exception raised in the ``with`` body propagate.
        return False

    def update(self, _arg=None):
        """Ignore a progress update."""
        return None
|
||||
|
||||
|
||||
class Executor(ABC):
    """Abstract base for running a batch of tasks with progress reporting.

    Callers invoke the instance itself; subclasses supply ``_execute``.
    """

    # Class-level lock held around every ``_execute`` call made via ``__call__``.
    pool_lock = threading.Lock()
    # Progress bar factory; overridden per instance when one is given to __init__.
    pbar_class = NullProgressBar

    def __init__(self, *, pbar_class=None):
        if not pbar_class:
            return
        self.pbar_class = pbar_class

    def __call__(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        tqdm_kwargs: dict,
        worker_initializer: Optional[Callable] = None,
        task: Optional[Callable] = None,
        task_arguments: Optional[Iterable] = None,
        task_finished: Optional[Callable] = None,
    ) -> None:
        """
        Set up parallel execution and progress reporting.

        Args:
            use_threads: If ``False``, the workload is the sort that will benefit
                from running in a multiprocessing context (for example, it uses
                Python heavily, and parallelizing it with threads is not expected
                to be performant).
            max_workers: The maximum number of workers that should be run.
            tqdm_kwargs: Arguments to set up the progress bar.
            worker_initializer: Called when a worker is initialized, in the
                worker's execution context. If the child workers are processes,
                it must be possible to marshall/pickle the worker initializer.
                ``functools.partial`` can be used to bind parameters.
            task: Called when the worker starts a new task, in the worker's
                execution context. Must be possible to marshall to the worker.
            task_finished: Called when a worker finishes a task, in the parent's
                context.
            task_arguments: An iterable that generates a group of parameters for
                each task. This runs in the parent's context, but the parameters
                must be marshallable to the worker.
        """

        if not task_arguments:
            return  # Nothing to do!

        # Any callback the caller left out is replaced by a no-op.
        init_fn = worker_initializer or _task_noop
        finished_fn = task_finished or _task_noop
        task_fn = task or _task_noop

        with self.pool_lock:
            self._execute(
                use_threads=use_threads,
                max_workers=max_workers,
                tqdm_kwargs=tqdm_kwargs,
                worker_initializer=init_fn,
                task=task_fn,
                task_arguments=task_arguments,
                task_finished=finished_fn,
            )

    @abstractmethod
    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        tqdm_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):
        """Custom executors should override this method."""
|
||||
|
||||
|
||||
def setup_executor(plugin_manager) -> Executor:
    """Ask the plugin hooks for a progress bar class, then for an executor using it."""
    hooks = plugin_manager.hook
    progress_cls = hooks.get_progressbar_class()
    return hooks.get_executor(progressbar_class=progress_cls)
|
||||
|
||||
|
||||
class SerialExecutor(Executor):
    """Sequential executor that follows the parallel executor protocol.

    All tasks run in order in the calling process/thread, so
    ``worker_initializer`` is accepted but never called.
    """

    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        tqdm_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):  # pylint: disable=unused-argument
        with self.pbar_class(**tqdm_kwargs) as progress:
            for task_args in task_arguments:
                # Run the task, then report its result straight away.
                task_finished(task(task_args), progress)
|
||||
8
src/ocrmypdf/_exec/__init__.py
Normal file
8
src/ocrmypdf/_exec/__init__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
# © 2020 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
"""Manage third party executables"""
|
||||
270
src/ocrmypdf/_exec/ghostscript.py
Normal file
270
src/ocrmypdf/_exec/ghostscript.py
Normal file
@@ -0,0 +1,270 @@
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
"""Interface to Ghostscript executable"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from io import BytesIO
|
||||
from os import fspath
|
||||
from pathlib import Path
|
||||
from shutil import which
|
||||
from subprocess import PIPE, CalledProcessError
|
||||
from typing import Optional
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError
|
||||
from ocrmypdf.helpers import Resolution
|
||||
from ocrmypdf.subprocess import get_version, run, run_polling_stderr
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
# Message used when the Ghostscript executable cannot be located.
missing_gs_error = """
---------------------------------------------------------------------
This error normally occurs when ocrmypdf can't find Ghostscript.
Please ensure Ghostscript is installed and its location is added to
the system PATH environment variable.

For details see:
    https://ocrmypdf.readthedocs.io/en/latest/installation.html
---------------------------------------------------------------------
"""
|
||||
|
||||
# On Windows, look on PATH for gswin64c first and gswin32c second; importing
# this module fails with MissingDependencyError when neither is found.
_gswin = None
if os.name == 'nt':
    _gswin = which('gswin64c')
    if not _gswin:
        _gswin = which('gswin32c')
        if not _gswin:
            raise MissingDependencyError(missing_gs_error)
    # Keep only the bare program name (no directory, no extension).
    _gswin = Path(_gswin).stem

# Name of the Ghostscript executable: the Windows name found above, else 'gs'.
GS = _gswin if _gswin else 'gs'
del _gswin
|
||||
|
||||
|
||||
def version():
    """Return what ``get_version`` reports for the Ghostscript executable."""
    gs_version = get_version(GS)
    return gs_version
|
||||
|
||||
|
||||
def jpeg_passthrough_available() -> bool:
    """Return True if the installed version of Ghostscript supports JPEG passthrough.

    Prior to 9.23, Ghostscript decoded and re-encoded JPEGs internally. In 9.23
    it gained the ability to keep JPEGs unmodified. However, the 9.23
    implementation was buggy and would delete the last two bytes of images in
    some cases, as reported here.
    https://bugs.ghostscript.com/show_bug.cgi?id=699216

    The issue was fixed for 9.24, hence that is the first version we consider
    the feature available. (Ghostscript 9.24 has its own problems and is
    blacklisted.)
    """
    # Compare numerically: as plain strings, '10.0' would sort before '9.24'.
    installed = tuple(int(part) for part in re.findall(r'\d+', str(version())))
    return installed >= (9, 24)
|
||||
|
||||
|
||||
def _gs_error_reported(stream) -> bool:
|
||||
return True if re.search(r'error', stream, flags=re.IGNORECASE) else False
|
||||
|
||||
|
||||
def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int = 1,
    page_dpi: Optional[Resolution] = None,
    rotation: Optional[int] = None,
    filter_vector: bool = False,
):
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units.

    Args:
        input_file: PDF handed to Ghostscript.
        output_file: Where the rendered image is saved.
        raster_device: Ghostscript output device (``-sDEVICE``).
        raster_dpi: Rendering resolution, rounded to 6 places before use.
        pageno: Page to render; used as both first and last page.
        page_dpi: DPI recorded in the saved image. Defaults to ``raster_dpi``.
        rotation: When 90, 180 or 270, the image is transposed accordingly
            before saving; any other value leaves the pixels unchanged.
        filter_vector: When true, pass ``-dFILTERVECTOR`` to Ghostscript.

    Raises:
        SubprocessOutputError: If the Ghostscript call raises
            ``CalledProcessError``.
    """
    raster_dpi = raster_dpi.round(6)
    if not page_dpi:
        page_dpi = raster_dpi

    args_gs = (
        [
            GS,
            '-dQUIET',
            '-dSAFER',
            '-dBATCH',
            '-dNOPAUSE',
            f'-sDEVICE={raster_device}',
            f'-dFirstPage={pageno}',
            f'-dLastPage={pageno}',
            f'-r{raster_dpi.x:f}x{raster_dpi.y:f}',
        ]
        + (['-dFILTERVECTOR'] if filter_vector else [])
        + [
            '-o',
            '-',
            '-sstdout=%stderr',
            '-dAutoRotatePages=/None',  # Probably has no effect on raster
            '-f',
            fspath(input_file),
        ]
    )

    try:
        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        log.error(e.stderr.decode(errors='replace'))
        # Chain the cause so the original failure stays in the traceback.
        raise SubprocessOutputError('Ghostscript rasterizing failed') from e
    else:
        # A zero exit status does not guarantee a clean run, so scan stderr too.
        stderr = p.stderr.decode(errors='replace')
        if _gs_error_reported(stderr):
            log.error(stderr)

    # The image arrives on stdout because of ``-o -``.
    with Image.open(BytesIO(p.stdout)) as im:
        if rotation is not None:
            log.debug("Rotating output by %i", rotation)
            # rotation is a clockwise angle and Image.ROTATE_* is
            # counterclockwise so this cancels out the rotation
            if rotation == 90:
                im = im.transpose(Image.ROTATE_90)
            elif rotation == 180:
                im = im.transpose(Image.ROTATE_180)
            elif rotation == 270:
                im = im.transpose(Image.ROTATE_270)
            # A quarter turn swaps the axes, so swap the x and y DPI as well.
            if rotation % 180 == 90:
                page_dpi = page_dpi.flip_axis()
        im.save(fspath(output_file), dpi=page_dpi)
|
||||
|
||||
|
||||
class GhostscriptFollower:
    """Callback that turns Ghostscript's per-line output into progress updates.

    The progress bar is created when a "Processing pages" line announces the
    page total; after that, each line starting with "Page N" advances it.
    """

    re_process = re.compile(r"Processing pages \d+ through (\d+).")
    re_page = re.compile(r"Page (\d+)")

    def __init__(self, progressbar_class):
        self.count = 0
        self.progressbar_class = progressbar_class
        self.progressbar = None

    def __call__(self, line):
        # No progress bar class means there is nothing to report.
        if not self.progressbar_class:
            return

        if self.progressbar:
            # Bar already exists: each page line advances it by one.
            if self.re_page.match(line.strip()):
                self.progressbar.update()
            return

        # No bar yet: wait for the line that announces the page total.
        announced = self.re_process.match(line.strip())
        if announced:
            self.count = int(announced.group(1))
            self.progressbar = self.progressbar_class(
                total=self.count, desc="PDF/A conversion", unit='page'
            )
|
||||
|
||||
|
||||
def generate_pdfa(
    pdf_pages,
    output_file: os.PathLike,
    *,
    compression: str,
    pdf_version: str = '1.5',
    pdfa_part: str = '2',
    progressbar_class=None,
):
    """Run Ghostscript's ``pdfwrite`` device over the inputs to produce a PDF/A.

    Args:
        pdf_pages: Iterable of input paths, appended to the command line in order.
        output_file: Path that receives Ghostscript's standard output.
        compression: ``'jpeg'`` forces DCT encoding, ``'lossless'`` forces Flate
            encoding; any other value lets Ghostscript choose per image.
        pdf_version: Value for ``-dCompatibilityLevel``.
        pdfa_part: Value for ``-dPDFA``.
        progressbar_class: Progress bar factory passed to ``GhostscriptFollower``;
            ``None`` disables progress reporting.

    Raises:
        SubprocessOutputError: If the Ghostscript call raises
            ``CalledProcessError``.
    """
    # Ghostscript's compression is all or nothing. We can either force all images
    # to JPEG, force all to Flate/PNG, or let it decide how to encode the images.
    # In most cases it's best to let it decide.
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    # Parse the version once and compare numerically; as plain strings,
    # '10.0' would sort before '9.19'.
    gs_version = tuple(int(part) for part in re.findall(r'\d+', str(version())))

    strategy = 'LeaveColorUnchanged'
    # Older versions of Ghostscript expect a leading slash in
    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
    # git commit fe1c025d.
    if gs_version < (9, 19):
        strategy = '/' + strategy

    if gs_version[:2] == (9, 23):
        # 9.23: added JPEG passthrough as a new feature, but with a bug that
        # incorrectly formats some images. Fixed as of 9.24. So we disable this
        # feature for 9.23.
        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
        compression_args.append('-dPassThroughJPEGImages=false')

    # nb no need to specify ProcessColorModel when ColorConversionStrategy
    # is set; see:
    # https://bugs.ghostscript.com/show_bug.cgi?id=699392
    args_gs = (
        [
            GS,
            "-dBATCH",
            "-dNOPAUSE",
            "-dSAFER",
            "-dCompatibilityLevel=" + str(pdf_version),
            "-sDEVICE=pdfwrite",
            "-dAutoRotatePages=/None",
            "-sColorConversionStrategy=" + strategy,
        ]
        + compression_args
        + [
            "-dJPEGQ=95",
            "-dPDFA=" + pdfa_part,
            "-dPDFACompatibilityPolicy=1",
            "-o",
            "-",
            "-sstdout=%stderr",
        ]
    )
    args_gs.extend(fspath(s) for s in pdf_pages)  # Stringify Path objs

    try:
        with Path(output_file).open('wb') as output:
            p = run_polling_stderr(
                args_gs,
                stdout=output,
                stderr=PIPE,
                check=True,
                text=True,
                encoding='utf-8',
                errors='replace',
                callback=GhostscriptFollower(progressbar_class),
            )
    except CalledProcessError as e:
        # Ghostscript does not change return code when it fails to create
        # PDF/A - check PDF/A status elsewhere
        log.error(e.stderr)
        raise SubprocessOutputError('Ghostscript PDF/A rendering failed') from e
    else:
        stderr = p.stderr
        # If there is an error we log the whole stderr, except for filtering
        # duplicates.
        if _gs_error_reported(stderr):
            last_part = None
            repcount = 0
            for part in stderr.split('****'):
                if part != last_part:
                    if repcount > 1:
                        log.error(f"(previous error message repeated {repcount} times)")
                        repcount = 0
                    log.error(part)
                else:
                    repcount += 1
                last_part = part
|
||||
@@ -1,28 +1,18 @@
|
||||
# © 2018 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from functools import lru_cache
|
||||
from subprocess import PIPE, run
|
||||
|
||||
from . import get_version
|
||||
from ..exceptions import MissingDependencyError
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
"""Interface to jbig2 executable"""
|
||||
|
||||
from subprocess import PIPE
|
||||
|
||||
from ocrmypdf.exceptions import MissingDependencyError
|
||||
from ocrmypdf.subprocess import get_version, run
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def version():
|
||||
return get_version('jbig2', regex=r'jbig2enc (\d+(\.\d+)*).*')
|
||||
|
||||
@@ -51,9 +41,17 @@ def convert_group(*, cwd, infiles, out_prefix):
|
||||
return proc
|
||||
|
||||
|
||||
def convert_group_mp(args):
    """Call ``convert_group`` with ``args[0..2]`` as cwd, infiles and out_prefix."""
    cwd, infiles, out_prefix = args[0], args[1], args[2]
    return convert_group(cwd=cwd, infiles=infiles, out_prefix=out_prefix)
|
||||
|
||||
|
||||
def convert_single(*, cwd, infile, outfile):
    """Run ``jbig2 -p`` on *infile* from *cwd*, writing its stdout to *outfile*.

    Returns the completed process object; ``check_returncode`` is called on it
    first, so a failing exit status raises instead of returning.
    """
    command = ['jbig2', '-p', infile]
    with open(outfile, 'wb') as sink:
        completed = run(command, cwd=cwd, stdout=sink, stderr=PIPE)
    completed.check_returncode()
    return completed
|
||||
|
||||
|
||||
def convert_single_mp(args):
    """Call ``convert_single`` with ``args[0..2]`` as cwd, infile and outfile."""
    cwd, infile, outfile = args[0], args[1], args[2]
    return convert_single(cwd=cwd, infile=infile, outfile=outfile)
|
||||
65
src/ocrmypdf/_exec/pngquant.py
Normal file
65
src/ocrmypdf/_exec/pngquant.py
Normal file
@@ -0,0 +1,65 @@
|
||||
# © 2018 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
"""Interface to pngquant executable"""
|
||||
|
||||
from contextlib import contextmanager
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from subprocess import PIPE
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from ocrmypdf.exceptions import MissingDependencyError
|
||||
from ocrmypdf.subprocess import get_version, run
|
||||
|
||||
|
||||
def version():
    """Return what ``get_version`` reports for the ``pngquant`` executable."""
    version_pattern = r'(\d+(\.\d+)*).*'
    return get_version('pngquant', regex=version_pattern)
|
||||
|
||||
|
||||
def available():
    """Return True if ``version()`` succeeds, False on MissingDependencyError."""
    try:
        version()
    except MissingDependencyError:
        return False
    else:
        return True
|
||||
|
||||
|
||||
@contextmanager
def input_as_png(input_file: Path):
    """Yield a binary stream of PNG data for *input_file*.

    A file whose name ends in ``.png`` is opened and yielded as-is. Any other
    file is opened as an image and re-saved as PNG into an in-memory buffer,
    which is yielded rewound to the start.
    """
    if input_file.name.endswith('.png'):
        with open(input_file, 'rb') as png_stream:
            yield png_stream
        return

    with Image.open(input_file) as image:
        buffer = BytesIO()
        image.save(buffer, format='png')
        buffer.seek(0)
        yield buffer
|
||||
|
||||
|
||||
def quantize(input_file: Path, output_file: Path, quality_min: int, quality_max: int):
    """Stream *input_file* through ``pngquant`` and save the result on success.

    *output_file* is written only when pngquant exits with status 0; otherwise
    nothing is written.
    """
    command = [
        'pngquant',
        '--force',
        '--skip-if-larger',
        '--quality',
        f'{quality_min}-{quality_max}',
        '--',  # pngquant: stop processing arguments
        '-',  # pngquant: stream input and output
    ]
    with input_as_png(input_file) as input_stream:
        outcome = run(command, stdin=input_stream, stdout=PIPE, stderr=PIPE, check=False)

    if outcome.returncode != 0:
        return
    # input_file could be the same as output_file, so we defer the write
    output_file.write_bytes(outcome.stdout)
|
||||
|
||||
|
||||
def quantize_mp(args):
    """Call ``quantize`` with the items of *args* as positional arguments."""
    outcome = quantize(*args)
    return outcome
|
||||
342
src/ocrmypdf/_exec/tesseract.py
Normal file
342
src/ocrmypdf/_exec/tesseract.py
Normal file
@@ -0,0 +1,342 @@
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
"""Interface to Tesseract executable"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from collections import namedtuple
|
||||
from distutils.version import StrictVersion
|
||||
from os import fspath
|
||||
from pathlib import Path
|
||||
from subprocess import PIPE, STDOUT, CalledProcessError, TimeoutExpired
|
||||
from typing import List, Optional
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from ocrmypdf.exceptions import (
|
||||
MissingDependencyError,
|
||||
SubprocessOutputError,
|
||||
TesseractConfigError,
|
||||
)
|
||||
from ocrmypdf.subprocess import get_version, run
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
OrientationConfidence = namedtuple('OrientationConfidence', ('angle', 'confidence'))
|
||||
|
||||
HOCR_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||
<head>
|
||||
<title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
||||
<meta name='ocr-system' content='tesseract 4.0.0' />
|
||||
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
|
||||
</head>
|
||||
<body>
|
||||
<div class='ocr_page' id='page_1' title='image "_blank.png"; bbox 0 0 {0} {1}; ppageno 0'>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
class TesseractLoggerAdapter(logging.LoggerAdapter):
    """Logger adapter that prefixes every message with ``[tesseract]``."""

    def process(self, msg, kwargs):
        """Attach the adapter's ``extra`` and return the prefixed message."""
        kwargs['extra'] = self.extra
        prefixed = '[tesseract] %s' % (msg)
        return prefixed, kwargs
|
||||
|
||||
|
||||
class TesseractVersion(StrictVersion):
    """``StrictVersion`` with a pattern relaxed for Tesseract version strings.

    The pattern accepts an optional hyphen before the prerelease tag, the
    tags ``alpha``/``beta``/``rc``/``dev`` with an optional number, and an
    optional ``-N-g<hex>`` suffix for untagged git builds.
    """

    version_re = re.compile(
        r'''
        ^(\d+) \. (\d+) (\. (\d+))?             # groups: 1/major, 2/minor, 3/[skip], 4/patch
        [-]?                                    # optional hyphen separator
        (?:(alpha|beta|rc|dev)[.\-\ ]?(\d+)?)?  # 5/prerelease, 6/prerelease_num
        (?:-(\d+)-g[0-9a-f]+)?                  # untagged git version
        $
        ''',
        re.VERBOSE | re.ASCII,
    )

    def parse(self, vstring):
        """Parse ``vstring``, treating a missing prerelease number as 0.

        The base parser calls ``int()`` on the prerelease number, which fails
        with TypeError when the tag has no number (e.g. ``4.0.0-beta``). In
        that case the string is re-parsed with ``0`` appended.

        Raises:
            ValueError: if ``vstring`` does not match ``version_re``.
            TypeError: for any other type error (e.g. ``vstring`` is not a
                string); this used to be swallowed silently, leaving the
                object without a parsed version.
        """
        try:
            super().parse(vstring)
        except TypeError as e:
            if 'int() argument must be a string' not in str(e):
                raise
            super().parse(vstring + '0')
|
||||
|
||||
|
||||
def version():
    """Return the installed Tesseract version as reported by ``get_version``."""
    version_pattern = r'tesseract\s(.+)'
    return get_version('tesseract', regex=version_pattern)
|
||||
|
||||
|
||||
def has_user_words():
    """Does Tesseract have --user-words capability?

    Not available in 4.0, but available in 4.1. Also available in 3.x, but
    we no longer support 3.x.

    Versions are compared numerically through ``TesseractVersion``. A plain
    string comparison orders them lexicographically, so ``'10.0'`` would sort
    before ``'4.1'``. If the reported version cannot be parsed, the previous
    string comparison is used as a best-effort fallback.
    """
    current = version()
    try:
        return TesseractVersion(current) >= TesseractVersion('4.1')
    except (ValueError, TypeError):
        # Unrecognised version format: keep the historical behaviour
        return current >= '4.1'
|
||||
|
||||
|
||||
def get_languages():
    """Return the set of language names printed by ``tesseract --list-langs``.

    The first output line is treated as a header and discarded; every other
    line is stripped and returned as a language name.

    Raises:
        MissingDependencyError: if Tesseract exits with an error, prints a
            line starting with ``Error``, or prints nothing at all.
    """

    def lang_error(output):
        header = (
            "Tesseract failed to report available languages.\n"
            "Output from Tesseract:\n"
            "-----------\n"
        )
        return header + (output or '')

    args_tess = ['tesseract', '--list-langs']
    try:
        proc = run(
            args_tess,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            logs_errors_to_stdout=True,
            check=True,
        )
    except CalledProcessError as e:
        raise MissingDependencyError(lang_error(e.output)) from e

    output = proc.stdout
    lines = output.splitlines()
    # Empty output previously crashed with ValueError when unpacking the
    # header; report it the same way as any other failure to list languages.
    if not lines or any(line.startswith('Error') for line in lines):
        raise MissingDependencyError(lang_error(output))

    _header, *rest = lines
    return {lang.strip() for lang in rest}
|
||||
|
||||
|
||||
def tess_base_args(langs: List[str], engine_mode: Optional[int]) -> List[str]:
    """Build the leading part of a Tesseract command line.

    Adds ``-l`` with the languages joined by ``+`` when ``langs`` is
    non-empty, and ``--oem`` when ``engine_mode`` is not None.
    """
    language_opts = ['-l', '+'.join(langs)] if langs else []
    engine_opts = ['--oem', str(engine_mode)] if engine_mode is not None else []
    return ['tesseract'] + language_opts + engine_opts
|
||||
|
||||
|
||||
def get_orientation(input_file: Path, engine_mode: Optional[int], timeout: float):
    """Run Tesseract with ``-l osd --psm 0`` and return an OrientationConfidence.

    Returns a zero angle with zero confidence when Tesseract times out, or
    when it fails with a "Too few characters" or "Image too large" message.

    Raises:
        SubprocessOutputError: for any other Tesseract failure.
    """
    args_tesseract = tess_base_args(['osd'], engine_mode)
    args_tesseract += ['--psm', '0', fspath(input_file), 'stdout']

    try:
        p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        too_few = b'Too few characters. Skipping this page' in e.output
        if too_few or b'Image too large' in e.output:
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e

    # Collect "key: value" lines from the report; lines that do not split
    # into exactly two parts are ignored.
    osd = {}
    for raw_line in p.stdout.decode().splitlines():
        parts = raw_line.strip().split(':', maxsplit=2)
        if len(parts) != 2:
            continue
        key, value = parts
        osd[key.strip()] = value.strip()

    angle = int(osd.get('Orientation in degrees', 0))
    confidence = float(osd.get('Orientation confidence', 0))
    return OrientationConfidence(angle=angle, confidence=confidence)
|
||||
|
||||
|
||||
def tesseract_log_output(stream):
    """Decode Tesseract's output bytes and forward each line to the log.

    Known-noisy lines are dropped, a few are rewritten into short warnings,
    and the rest are logged as error, warning or info depending on keywords.

    Raises:
        TesseractConfigError: when a line reports ``parameter not found: ``.
    """
    tlog = TesseractLoggerAdapter(
        log, extra=log.extra if hasattr(log, 'extra') else None
    )

    if not stream:
        return
    try:
        text = stream.decode()
    except UnicodeDecodeError:
        text = stream.decode('utf-8', 'ignore')

    for line in text.splitlines():
        lowered = line.lower()
        message = line.strip()

        if line.startswith("Tesseract Open Source"):
            continue
        if line.startswith("Warning in pixReadMem"):
            continue

        if 'diacritics' in line:
            tlog.warning("lots of diacritics - possibly poor OCR")
        elif line.startswith('OSD: Weak margin'):
            tlog.warning("unsure about page orientation")
        elif 'Error in pixScanForForeground' in line:
            continue  # Appears to be spurious/problem with nonwhite borders
        elif 'Error in boxClipToRectangle' in line:
            continue  # Always appears with pixScanForForeground message
        elif 'parameter not found: ' in lowered:
            tlog.error(message)
            problem = line.split('found: ')[1]
            raise TesseractConfigError(problem)
        elif 'error' in lowered or 'exception' in lowered:
            tlog.error(message)
        elif 'warning' in lowered:
            tlog.warning(message)
        elif 'read_params_file' in lowered:
            tlog.error(message)
        else:
            tlog.info(message)
|
||||
|
||||
|
||||
def page_timedout(timeout):
    """Log a warning that OCR was skipped, unless ``timeout`` is 0."""
    if timeout != 0:
        log.warning("[tesseract] took too long to OCR - skipping")
|
||||
|
||||
|
||||
def _generate_null_hocr(output_hocr, output_text, image):
    """Write an hOCR file with no text, sized to match ``image``.

    Also writes ``[skipped page]`` to ``output_text``.
    """
    with Image.open(image) as im:
        width, height = im.size

    blank_hocr = HOCR_TEMPLATE.format(width, height)
    output_hocr.write_text(blank_hocr, encoding='utf-8')
    output_text.write_text('[skipped page]', encoding='utf-8')
|
||||
|
||||
|
||||
def generate_hocr(
    *,
    input_file: Path,
    output_hocr: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Use Tesseract to produce an hOCR file and a sidecar text file.

    On timeout, or when Tesseract reports "Image too large", an empty hOCR
    file and a ``[skipped page]`` text file are written instead.

    Raises:
        SubprocessOutputError: for any other Tesseract failure.
    """
    # Tesseract appends its own suffixes to this prefix
    prefix = output_hocr.with_suffix('')

    args_tesseract = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        args_tesseract += ['--psm', str(pagesegmode)]
    if user_words:
        args_tesseract += ['--user-words', user_words]
    if user_patterns:
        args_tesseract += ['--user-patterns', user_patterns]

    # Reminder: test suite tesseract test plugins will break after any changes
    # to the number or order of parameters here
    args_tesseract += [input_file, prefix, 'hocr', 'txt'] + tessconfig

    try:
        p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(timeout)
        _generate_null_hocr(output_hocr, output_text, input_file)
        return
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' not in e.output:
            raise SubprocessOutputError() from e
        _generate_null_hocr(output_hocr, output_text, input_file)
        return

    tesseract_log_output(p.stdout)
    # The sidecar text file will get the suffix .txt; rename it to
    # whatever caller wants it named
    sidecar = prefix.with_suffix('.txt')
    if sidecar.exists():
        shutil.move(sidecar, output_text)
|
||||
|
||||
|
||||
def use_skip_page(output_pdf, output_text):
    """Mark a page as skipped.

    Writes ``[skipped page]`` to ``output_text`` and leaves ``output_pdf`` as
    a zero-byte file, which signals the skip.
    """
    placeholder = '[skipped page]'
    output_text.write_text(placeholder, encoding='utf-8')

    # A 0 byte file to the output to indicate a skip
    output_pdf.write_bytes(bytes())
|
||||
|
||||
|
||||
def generate_pdf(
    *,
    input_file: Path,
    output_pdf: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Use Tesseract to render a text-only PDF and a sidecar text file.

    input_file -- image to analyze
    output_pdf -- file to generate
    output_text -- OCR text file
    languages -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- page segmentation mode, passed as --psm when not None
    user_words -- passed as --user-words when truthy
    user_patterns -- passed as --user-patterns when truthy

    On timeout, or when Tesseract reports "Image too large", a skip page is
    written instead. Any other Tesseract failure raises SubprocessOutputError.
    """
    args_tesseract = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        args_tesseract += ['--psm', str(pagesegmode)]
    args_tesseract += ['-c', 'textonly_pdf=1']
    if user_words:
        args_tesseract += ['--user-words', user_words]
    if user_patterns:
        args_tesseract += ['--user-patterns', user_patterns]

    prefix = os.path.splitext(output_pdf)[0]  # Tesseract appends suffixes
    sidecar = prefix + '.txt'

    # Reminder: test suite tesseract test plugins might break after any changes
    # to the number or order of parameters here
    args_tesseract += [input_file, prefix, 'pdf', 'txt'] + tessconfig

    try:
        p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
        if os.path.exists(sidecar):
            shutil.move(sidecar, output_text)
    except TimeoutExpired:
        page_timedout(timeout)
        use_skip_page(output_pdf, output_text)
        return
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' not in e.output:
            raise SubprocessOutputError() from e
        use_skip_page(output_pdf, output_text)
        return

    tesseract_log_output(p.stdout)
|
||||
134
src/ocrmypdf/_exec/unpaper.py
Normal file
134
src/ocrmypdf/_exec/unpaper.py
Normal file
@@ -0,0 +1,134 @@
|
||||
# © 2015 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
# unpaper documentation:
|
||||
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
|
||||
|
||||
"""Interface to unpaper executable"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shlex
|
||||
from decimal import Decimal
|
||||
from pathlib import Path
|
||||
from subprocess import PIPE, STDOUT
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError
|
||||
from ocrmypdf.subprocess import get_version
|
||||
from ocrmypdf.subprocess import run as external_run
|
||||
|
||||
DecFloat = Union[Decimal, float]
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def version() -> str:
    """Return the installed unpaper version as reported by ``get_version``."""
    program = 'unpaper'
    return get_version(program)
|
||||
|
||||
|
||||
def _setup_unpaper_io(tmpdir: Path, input_file: Path) -> Tuple[Path, Path]:
    """Choose the input and output PNM paths for an unpaper run.

    Images whose mode is not ``1``, ``L`` or ``RGB`` are converted first.
    The image is saved to ``tmpdir/input.pnm`` unless it was left unmodified
    and ``input_file`` already has a ``.pnm`` suffix. The output path's suffix
    follows the (possibly converted) image mode.

    Raises:
        MissingDependencyError: if the image cannot be converted.
    """
    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}
    with Image.open(input_file) as im:
        im_modified = False
        if im.mode not in SUFFIXES:
            log.info("Converting image to other colorspace")
            try:
                two_color_palette = im.mode == 'P' and len(im.getcolors()) == 2
                im = im.convert(mode='1' if two_color_palette else 'RGB')
            except IOError as e:
                raise MissingDependencyError(
                    "Could not convert image with type " + im.mode
                ) from e
            im_modified = True

        suffix = SUFFIXES.get(im.mode)
        if suffix is None:
            raise MissingDependencyError(
                "Failed to convert image to a supported format."
            ) from None

        if not im_modified and input_file.suffix == '.pnm':
            # No changes and already a .pnm file, so use the file we have
            input_pnm = input_file
        else:
            input_pnm = tmpdir / 'input.pnm'
            im.save(input_pnm, format='PPM')
        output_pnm = tmpdir / f'output{suffix}'
    return input_pnm, output_pnm
|
||||
|
||||
|
||||
def run(
    input_file: Path, output_file: Path, *, dpi: DecFloat, mode_args: List[str]
) -> None:
    """Run unpaper on ``input_file`` and save the result to ``output_file``.

    unpaper works on PNM files inside a temporary directory; its output is
    then re-saved to ``output_file`` with the given ``dpi``.

    Raises:
        SubprocessOutputError: if the expected output file cannot be opened
            or saved.
    """
    args_unpaper = ['unpaper', '-v', '--dpi', str(round(dpi, 6))] + mode_args

    with TemporaryDirectory() as tmpdir:
        input_pnm, output_pnm = _setup_unpaper_io(Path(tmpdir), input_file)

        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        args_unpaper.append(os.fspath(input_pnm))
        args_unpaper.append(os.fspath(output_pnm))
        external_run(
            args_unpaper,
            close_fds=True,
            check=True,
            stderr=STDOUT,  # unpaper writes logging output to stdout and stderr
            stdout=PIPE,  # and cannot send file output to stdout
            cwd=tmpdir,
            logs_errors_to_stdout=True,
        )
        try:
            with Image.open(output_pnm) as cleaned:
                cleaned.save(output_file, dpi=(dpi, dpi))
        except OSError:  # includes FileNotFoundError
            raise SubprocessOutputError(
                "unpaper: failed to produce the expected output file. "
                f" Called with: {args_unpaper}"
            ) from None
|
||||
|
||||
|
||||
def validate_custom_args(args: str) -> List[str]:
    """Split ``args`` shell-style and reject anything that looks like a path.

    Raises:
        ValueError: if any argument contains ``/`` or is ``.`` or ``..``.
    """
    unpaper_args = shlex.split(args)
    for candidate in unpaper_args:
        if '/' in candidate or candidate in ('.', '..'):
            raise ValueError('No filenames allowed in --unpaper-args')
    return unpaper_args
|
||||
|
||||
|
||||
def clean(
    input_file: Path,
    output_file: Path,
    *,
    dpi: DecFloat,
    unpaper_args: Optional[List[str]] = None,
):
    """Run unpaper with the caller's arguments, or with the defaults below.

    The defaults are used when ``unpaper_args`` is None or empty.
    """
    default_args = [
        '--layout',
        'none',
        '--mask-scan-size',
        '100',  # don't blank out narrow columns
        '--no-border-align',  # don't align visible content to borders
        '--no-mask-center',  # don't center visible content within page
        '--no-grayfilter',  # don't remove light gray areas
        '--no-blackfilter',  # don't remove solid black areas
        '--no-deskew',  # don't deskew
    ]
    run(input_file, output_file, dpi=dpi, mode_args=unpaper_args or default_args)
|
||||
314
src/ocrmypdf/_graft.py
Normal file
314
src/ocrmypdf/_graft.py
Normal file
@@ -0,0 +1,314 @@
|
||||
# © 2018 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import logging
|
||||
import uuid
|
||||
from contextlib import suppress
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pikepdf
|
||||
from pikepdf.objects import Dictionary, Name
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
MAX_REPLACE_PAGES = 100
|
||||
|
||||
|
||||
def _ensure_dictionary(obj, name):
|
||||
if name not in obj:
|
||||
obj[name] = Dictionary({})
|
||||
return obj[name]
|
||||
|
||||
|
||||
def _update_resources(*, obj, font, font_key, procset):
    """Make sure ``obj``'s resources reference the given font and ProcSet.

    ``obj`` can be a page or Form XObject. The font is added under
    ``font_key`` only when a key is given and not already present.
    """
    resources = _ensure_dictionary(obj, Name.Resources)
    font_table = _ensure_dictionary(resources, Name.Font)

    skip_font = font_key is None or font_key in font_table
    if not skip_font:
        font_table[font_key] = font

    # Reassign /ProcSet to one that just lists everything - ProcSet is
    # obsolete and doesn't matter but recommended for old viewer support
    if not procset:
        return
    resources['/ProcSet'] = procset
|
||||
|
||||
|
||||
def strip_invisible_text(pdf, page):
    """Rewrite ``page``'s content stream without render-mode-3 text objects.

    The content stream is parsed into (operands, operator) pairs. Operators
    outside a BT...ET text object are always kept. A text object is kept
    only if the last ``Tr`` operand seen inside it is not 3; the render mode
    is reset to 0 at every ``BT``. The surviving operators are serialized
    into a new stream that replaces ``page.Contents``.

    pdf -- the pikepdf document that will own the new content stream
    page -- the page to modify in place
    """
    stream = []  # operators that will be written back
    in_text_obj = False
    render_mode = 0
    text_objects = []  # operators of the text object currently being read

    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        if not in_text_obj:
            if operator == pikepdf.Operator('BT'):
                in_text_obj = True
                render_mode = 0
                text_objects.append((operands, operator))
            else:
                stream.append((operands, operator))
        else:
            if operator == pikepdf.Operator('Tr'):
                render_mode = operands[0]
            text_objects.append((operands, operator))
            if operator == pikepdf.Operator('ET'):
                # End of the text object: keep it unless its mode was 3
                in_text_obj = False
                if render_mode != 3:
                    stream.extend(text_objects)
                text_objects.clear()

    def convert(op):
        # Operands without an unparse() method are serialized via str()
        try:
            return op.unparse()
        except AttributeError:
            return str(op).encode('ascii')

    lines = []

    for operands, operator in stream:
        if operator == pikepdf.Operator('INLINE IMAGE'):
            # The inline image's own unparse() output is used as the whole line
            iim = operands[0]
            line = iim.unparse()
        else:
            line = b' '.join(convert(op) for op in operands) + b' ' + operator.unparse()
        lines.append(line)

    content_stream = b'\n'.join(lines)
    page.Contents = pikepdf.Stream(pdf, content_stream)
|
||||
|
||||
|
||||
class OcrGrafter:
    """Grafts OCR text layers, and optionally new page images, onto a PDF.

    The base PDF (``context.origin``) is opened once and modified page by
    page through ``graft_page``; ``finalize`` saves the result.
    """

    def __init__(self, context):
        """Open the base PDF named by ``context.origin`` and set up state."""
        self.context = context
        self.path_base = context.origin

        self.pdf_base = pikepdf.open(self.path_base)
        # The font is located lazily from the first text PDF that has one
        self.font, self.font_key = None, None

        self.pdfinfo = context.pdfinfo
        self.output_file = context.get_path('graft_layers.pdf')

        self.procset = self.pdf_base.make_indirect(
            pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
        )

        # Counter used to trigger save_and_reload() every MAX_REPLACE_PAGES
        self.emplacements = 1
        # Index of the intermediate ".working{N}.pdf" file currently open
        self.interim_count = 0

    def graft_page(
        self,
        *,
        pageno: int,
        image: Optional[Path],
        textpdf: Optional[Path],
        autorotate_correction: int,
    ):
        """Update one page of the base PDF with a new image and/or text layer.

        pageno -- zero-based page index into the base PDF
        image -- PDF whose first page replaces the page content, if it is
            given and does not resolve to the base PDF's own path
        textpdf -- PDF whose first page supplies the text layer, if given
        autorotate_correction -- rotation in degrees applied to the page
        """
        if textpdf and not self.font:
            self.font, self.font_key = self._find_font(textpdf)

        emplaced_page = False
        content_rotation = self.pdfinfo[pageno].rotation
        path_image = Path(image).resolve() if image else None
        if path_image is not None and path_image != self.path_base:
            # We are updating the old page with a rasterized PDF of the new
            # page (without changing objgen, to preserve references)
            log.debug("Emplacement update")
            with pikepdf.open(image) as pdf_image:
                self.emplacements += 1
                foreign_image_page = pdf_image.pages[0]
                # Append the foreign page to get a local copy, emplace it over
                # the target page, then remove the temporary appended page
                self.pdf_base.pages.append(foreign_image_page)
                local_image_page = self.pdf_base.pages[-1]
                self.pdf_base.pages[pageno].emplace(local_image_page)
                del self.pdf_base.pages[-1]
            emplaced_page = True

        # Calculate if the text is misaligned compared to the content
        if emplaced_page:
            content_rotation = autorotate_correction
        text_rotation = autorotate_correction
        text_misaligned = (text_rotation - content_rotation) % 360
        log.debug(
            f"Text rotation: (text, autorotate, content) -> text misalignment = "
            f"({text_rotation}, {autorotate_correction}, {content_rotation}) -> {text_misaligned}"
        )

        if textpdf and self.font:
            # Graft the text layer onto this page, whether new or old, possibly
            # rotating the text layer by the amount it is misaligned.
            strip_old = self.context.options.redo_ocr
            self._graft_text_layer(
                page_num=pageno + 1,
                textpdf=textpdf,
                font=self.font,
                font_key=self.font_key,
                text_rotation=text_misaligned,
                procset=self.procset,
                strip_old_text=strip_old,
            )

        # Correct the overall page rotation if needed, now that the text and content
        # are aligned
        page_rotation = (content_rotation - autorotate_correction) % 360
        self.pdf_base.pages[pageno].Rotate = page_rotation
        log.debug(
            f"Page rotation: (content, auto) -> page = "
            f"({content_rotation}, {autorotate_correction}) -> {page_rotation}"
        )
        if self.emplacements % MAX_REPLACE_PAGES == 0:
            self.save_and_reload()

    def save_and_reload(self):
        """Save and reload the Pdf.

        This will keep a lid on our memory usage for very large files. Attach
        the font to page 1 even if page 1 doesn't use it, so we have a way to get it
        back.
        """

        page0 = self.pdf_base.pages[0]
        _update_resources(
            obj=page0, font=self.font, font_key=self.font_key, procset=self.procset
        )

        # We cannot read and write the same file, that will corrupt it
        # but we don't want to keep more copies than we need to. Delete intermediates.
        # {interim_count} is the opened file we were updating
        # {interim_count - 1} can be deleted
        # {interim_count + 1} is the new file we will produce and open
        old_file = self.output_file.with_suffix(f'.working{self.interim_count - 1}.pdf')
        if not self.context.options.keep_temporary_files:
            with suppress(FileNotFoundError):
                old_file.unlink()

        next_file = self.output_file.with_suffix(
            f'.working{self.interim_count + 1}.pdf'
        )
        self.pdf_base.save(next_file)
        self.pdf_base.close()

        self.pdf_base = pikepdf.open(next_file)
        # Objects from the closed Pdf are replaced with ones from the reopened file
        self.procset = self.pdf_base.pages[0].Resources.ProcSet
        self.font, self.font_key = None, None  # Ensure we reacquire this information
        self.interim_count += 1

    def finalize(self):
        """Save the base PDF to ``self.output_file``, close it, return the path."""
        self.pdf_base.save(self.output_file)
        self.pdf_base.close()
        return self.output_file

    def _find_font(self, text):
        """Copy a font from the filename text into pdf_base

        Looks on the first page of ``text`` for a font named ``/f-0-0`` or
        ``/F1``. Returns ``(font, font_key)``, or ``(None, None)`` when the
        file cannot be opened or has no usable page resources.
        """

        font, font_key = None, None
        possible_font_names = ('/f-0-0', '/F1')
        try:
            with pikepdf.open(text) as pdf_text:
                try:
                    pdf_text_fonts = pdf_text.pages[0].Resources.get('/Font', {})
                except (AttributeError, IndexError, KeyError):
                    return None, None
                pdf_text_font = None
                for f in possible_font_names:
                    pdf_text_font = pdf_text_fonts.get(f, None)
                    if pdf_text_font is not None:
                        font_key = f
                        break
                if pdf_text_font:
                    font = self.pdf_base.copy_foreign(pdf_text_font)
                return font, font_key
        except (FileNotFoundError, pikepdf.PdfError):
            # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
            return None, None

    def _graft_text_layer(
        self,
        *,
        page_num: int,
        textpdf: Path,
        font: pikepdf.Object,
        font_key: pikepdf.Object,
        procset: pikepdf.Object,
        text_rotation: int,
        strip_old_text: bool,
    ):
        """Insert the text layer from text page 0 on to pdf_base at page_num

        page_num -- one-based page number in the base PDF
        textpdf -- PDF whose first page holds the text layer; a zero-byte
            file is ignored
        text_rotation -- clockwise angle in degrees to rotate the text by
        strip_old_text -- if true, call strip_invisible_text() on the base
            page before adding the new layer
        """

        log.debug("Grafting")
        if Path(textpdf).stat().st_size == 0:
            return

        with pikepdf.open(textpdf) as pdf_text:
            pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

            # The page in the base file that receives the text layer
            base_page = self.pdf_base.pages.p(page_num)

            # The text page always will be oriented up by this stage but the original
            # content may have a rotation applied. Wrap the text stream with a rotation
            # so it will be oriented the same way as the rest of the page content.
            # (Previous versions OCRmyPDF rotated the content layer to match the text.)
            mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
            wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            # From here on, mediabox refers to the base page, not the text page
            mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
            wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
            untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
            corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
            # -rotation because the input is a clockwise angle and this formula
            # uses CCW
            text_rotation = -text_rotation % 360
            rotate = pikepdf.PdfMatrix().rotated(text_rotation)

            # Because of rounding of DPI, we might get a text layer that is not
            # identically sized to the target page. Scale to adjust. Normally this
            # is within 0.998.
            if text_rotation in (90, 270):
                wt, ht = ht, wt
            scale_x = wp / wt
            scale_y = hp / ht

            # log.debug('%r', scale_x, scale_y)
            scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

            # Translate the text so it is centered at (0, 0), rotate it there, adjust
            # for a size different between initial and text PDF, then untranslate, and
            # finally move the lower left corner to match the mediabox
            ctm = translate @ rotate @ scale @ untranslate @ corner

            # Register the text content as a Form XObject under a unique name
            base_resources = _ensure_dictionary(base_page, Name.Resources)
            base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
            text_xobj_name = Name('/' + str(uuid.uuid4()))
            xobj = self.pdf_base.make_stream(pdf_text_contents)
            base_xobjs[text_xobj_name] = xobj
            xobj.Type = Name.XObject
            xobj.Subtype = Name.Form
            xobj.FormType = 1
            xobj.BBox = mediabox
            _update_resources(
                obj=xobj, font=font, font_key=font_key, procset=[Name.PDF]
            )

            # Content stream fragment that draws the XObject under the CTM
            pdf_draw_xobj = (
                (b'q %s cm\n' % ctm.encode()) + (b'%s Do\n' % text_xobj_name) + b'\nQ\n'
            )
            new_text_layer = pikepdf.Stream(self.pdf_base, pdf_draw_xobj)

            if strip_old_text:
                strip_invisible_text(self.pdf_base, base_page)

            base_page.page_contents_add(new_text_layer, prepend=True)

            _update_resources(
                obj=base_page, font=font, font_key=font_key, procset=procset
            )
|
||||
@@ -1,83 +1,103 @@
|
||||
# © 2018 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from contextlib import suppress
|
||||
from multiprocessing.managers import SyncManager
|
||||
from argparse import Namespace
|
||||
from copy import copy
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
from .pdfinfo import PdfInfo
|
||||
from pluggy import PluginManager
|
||||
|
||||
from ocrmypdf.pdfinfo import PdfInfo
|
||||
from ocrmypdf.pdfinfo.info import PageInfo
|
||||
|
||||
|
||||
class JobContext:
|
||||
"""Holds our context for a particular run of the pipeline
|
||||
class PdfContext:
|
||||
"""Holds the context for a particular run of the pipeline."""
|
||||
|
||||
A multiprocessing manager effectively creates a separate process
|
||||
that keeps the master job context object. Other threads access
|
||||
job context via multiprocessing proxy objects.
|
||||
options: Namespace #: The specified options for processing this PDF.
|
||||
origin: Path #: The filename of the original input file.
|
||||
pdfinfo: PdfInfo #: Detailed data for this PDF.
|
||||
plugin_manager: PluginManager #: PluginManager for processing the current PDF.
|
||||
|
||||
While this would naturally lend itself @property's it seems to make
|
||||
a little more sense to use functions to make it explicitly that the
|
||||
invocation requires marshalling data across a process boundary.
|
||||
def __init__(
|
||||
self,
|
||||
options: Namespace,
|
||||
work_folder: Path,
|
||||
origin: Path,
|
||||
pdfinfo: PdfInfo,
|
||||
plugin_manager,
|
||||
):
|
||||
self.options = options
|
||||
self.work_folder = work_folder
|
||||
self.origin = origin
|
||||
self.pdfinfo = pdfinfo
|
||||
self.plugin_manager = plugin_manager
|
||||
|
||||
def get_path(self, name: str) -> Path:
|
||||
"""Generate a ``Path`` for an intermediate file involved in processing.
|
||||
|
||||
The path will be in a temporary folder that is common for all processing
|
||||
of this particular PDF.
|
||||
"""
|
||||
return self.work_folder / name
|
||||
|
||||
def get_page_contexts(self) -> Iterator['PageContext']:
|
||||
"""Get all ``PageContext`` for this PDF."""
|
||||
npages = len(self.pdfinfo)
|
||||
for n in range(npages):
|
||||
yield PageContext(self, n)
|
||||
|
||||
|
||||
class PageContext:
    """Holds our context for a page.

    Must be picklable, so stores only intrinsic/simple data elements or those
    capable of serializing themselves via ``__getstate__``.
    """

    options: Namespace  #: The specified options for processing this PDF.
    origin: Path  #: The filename of the original input file.
    pageno: int  #: This page number (zero-based).
    pageinfo: PageInfo  #: Information on this page.
    plugin_manager: PluginManager  #: PluginManager for processing the current PDF.

    def __init__(self, pdf_context: PdfContext, pageno):
        """Copy the shared document state from *pdf_context* and select one page.

        Args:
            pdf_context: The document-level context this page belongs to.
            pageno: Zero-based page index; also used to look up
                ``pdf_context.pdfinfo[pageno]``.
        """
        self.work_folder = pdf_context.work_folder
        self.origin = pdf_context.origin
        self.options = pdf_context.options
        self.pageno = pageno
        self.pageinfo = pdf_context.pdfinfo[pageno]
        self.plugin_manager = pdf_context.plugin_manager

    def get_path(self, name: str) -> Path:
        """Generate a ``Path`` for a file that is part of processing this page.

        The path will be based in a common temporary folder and have a prefix based
        on the page number.
        """
        # The prefix is the one-based page number, zero-padded to six digits.
        return self.work_folder / ("%06d_%s" % (self.pageno + 1, name))

    def __getstate__(self):
        """Return the state to pickle, with stream-valued file options replaced.

        ``options.input_file`` / ``options.output_file`` that are not a ``str``,
        ``bytes`` or ``os.PathLike`` are replaced by the placeholder ``'stream'``
        in a shallow copy of the options, so ``self.options`` is left untouched.
        """
        state = self.__dict__.copy()

        state['options'] = copy(self.options)
        if not isinstance(state['options'].input_file, (str, bytes, os.PathLike)):
            state['options'].input_file = 'stream'
        if not isinstance(state['options'].output_file, (str, bytes, os.PathLike)):
            state['options'].output_file = 'stream'
        return state
|
||||
|
||||
|
||||
class JobContextManager(SyncManager):
    """A ``SyncManager`` subclass that adds no behavior of its own."""
|
||||
|
||||
|
||||
def cleanup_working_files(work_folder: Path, options: Namespace):
    """Delete the temporary working folder unless the user asked to keep it.

    Args:
        work_folder: The temporary folder holding intermediate files.
        options: Only ``options.keep_temporary_files`` is consulted.

    When the files are kept, their location is printed to stderr. Otherwise the
    folder is removed best-effort: errors, including the folder already being
    gone, are ignored.
    """
    if options.keep_temporary_files:
        print(f"Temporary working files retained at:\n{work_folder}", file=sys.stderr)
    else:
        shutil.rmtree(work_folder, ignore_errors=True)
|
||||
|
||||
50
src/ocrmypdf/_logging.py
Normal file
50
src/ocrmypdf/_logging.py
Normal file
@@ -0,0 +1,50 @@
|
||||
# © 2020 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from contextlib import suppress
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
class PageNumberFilter(logging.Filter):
    """Logging filter that turns ``record.pageno`` into text ready for formatting.

    An integer page number becomes a right-aligned, five-character field followed
    by a space; a missing or ``None`` page number becomes the empty string. Any
    other value is left as it is. No record is ever rejected.
    """

    def filter(self, record):
        page = getattr(record, 'pageno', None)
        if page is None:
            record.pageno = ''
        elif isinstance(page, int):
            record.pageno = f'{page:5d} '
        return True
|
||||
|
||||
|
||||
class TqdmConsole:
    """File-like wrapper that sends log output through ``tqdm.write``.

    Routing messages through tqdm lets it print them above an active progress
    bar and then redraw the bar, instead of the message overwriting the bar.

    On Python 3.6 blank messages are dropped, because that version emits extra
    empty messages from time to time.
    """

    def __init__(self, file):
        self.file = file
        self.py36 = sys.version_info[:2] == (3, 6)

    def write(self, msg):
        # When no progress bar is active, tqdm.write() routes to print()
        if self.py36 and msg.strip() == '':
            return
        tqdm.write(msg.rstrip(), end='\n', file=self.file)

    def flush(self):
        # The wrapped object is not required to provide flush()
        try:
            self.file.flush()
        except AttributeError:
            pass
|
||||
File diff suppressed because it is too large
Load Diff
122
src/ocrmypdf/_plugin_manager.py
Normal file
122
src/ocrmypdf/_plugin_manager.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# © 2020 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import argparse
|
||||
import importlib
|
||||
import importlib.util
|
||||
import pkgutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Union
|
||||
|
||||
import pluggy
|
||||
|
||||
import ocrmypdf.builtin_plugins
|
||||
from ocrmypdf import pluginspec
|
||||
from ocrmypdf.cli import get_parser, plugins_only_parser
|
||||
|
||||
|
||||
class OcrmypdfPluginManager(pluggy.PluginManager):
    """pluggy.PluginManager that can fork.

    Capable of reconstructing itself in child workers: pickling captures only
    the constructor arguments, and unpickling re-runs ``__init__`` with them,
    which registers all plugins again.

    Arguments:
        plugins: Additional plugins to register; each entry is either a dotted
            module name or the path of a ``.py`` file.
        builtins: If true, register every module found in
            ``ocrmypdf.builtin_plugins``.

    Any other positional and keyword arguments are passed through to
    ``pluggy.PluginManager``.
    """

    def __init__(
        self,
        *args,
        plugins: List[Union[str, Path]],
        builtins: bool = True,
        **kwargs,
    ):
        # Keep the constructor arguments so __getstate__ can hand them to a
        # child process, where __setstate__ replays them.
        self.__init_args = args
        self.__init_kwargs = kwargs
        self.__plugins = plugins
        self.__builtins = builtins
        super().__init__(*args, **kwargs)
        self.setup_plugins()

    def __getstate__(self):
        """Return only the constructor arguments as the picklable state."""
        state = dict(
            init_args=self.__init_args,
            plugins=self.__plugins,
            builtins=self.__builtins,
            init_kwargs=self.__init_kwargs,
        )
        return state

    def __setstate__(self, state):
        """Rebuild the manager by re-running ``__init__`` with the saved arguments."""
        self.__init__(
            *state['init_args'],
            plugins=state['plugins'],
            builtins=state['builtins'],
            **state['init_kwargs'],
        )

    def setup_plugins(self):
        """Register the hook specifications and then every plugin source, in order."""
        self.add_hookspecs(pluginspec)

        # 1. Register builtins
        if self.__builtins:
            # Sort on the module name explicitly: sorting whole ModuleInfo tuples
            # compares their finder objects first, which only works while every
            # entry comes from the same finder.
            for module_info in sorted(
                pkgutil.iter_modules(ocrmypdf.builtin_plugins.__path__),
                key=lambda info: info.name,
            ):
                name = f'ocrmypdf.builtin_plugins.{module_info.name}'
                module = importlib.import_module(name)
                self.register(module)

        # 2. Install semfree if needed
        try:
            # pylint: disable=import-outside-toplevel
            from multiprocessing.synchronize import SemLock

            del SemLock
        except ImportError:
            # SemLock cannot be imported on this platform, so fall back to the
            # semfree plugin.
            self.register(importlib.import_module('ocrmypdf.extra_plugins.semfree'))

        # 3. Register setuptools plugins
        self.load_setuptools_entrypoints('ocrmypdf')

        # 4. Register plugins specified on command line
        for name in self.__plugins:
            if isinstance(name, Path) or name.endswith('.py'):
                # Import by filename
                module_name = Path(name).stem
                spec = importlib.util.spec_from_file_location(module_name, name)
                module = importlib.util.module_from_spec(spec)
                # Make the module importable by name before executing it
                sys.modules[module_name] = module
                spec.loader.exec_module(module)
            else:
                # Import by dotted module name
                module = importlib.import_module(name)
            self.register(module)
|
||||
|
||||
|
||||
def get_plugin_manager(plugins: List[Union[str, Path]], builtins=True):
    """Build an ``OcrmypdfPluginManager`` for the ``ocrmypdf`` project.

    Args:
        plugins: Extra plugins to register, forwarded unchanged.
        builtins: Forwarded unchanged; controls registration of built-in plugins.
    """
    return OcrmypdfPluginManager(
        project_name='ocrmypdf', plugins=plugins, builtins=builtins
    )
|
||||
|
||||
|
||||
def get_parser_options_plugins(
    args,
) -> Tuple[argparse.ArgumentParser, argparse.Namespace, pluggy.PluginManager]:
    """Parse *args* in two passes so that plugins can contribute options.

    The first pass uses ``plugins_only_parser`` and only looks for the plugin
    list. The resulting plugin manager then adds its options to the full parser,
    which performs the real parse.

    Returns:
        The full parser, the parsed options, and the plugin manager.
    """
    early_options, _ignored = plugins_only_parser.parse_known_args(args=args)
    manager = get_plugin_manager(early_options.plugins)

    full_parser = get_parser()
    manager.hook.add_options(parser=full_parser)  # pylint: disable=no-member

    parsed = full_parser.parse_args(args=args)
    return full_parser, parsed, manager
|
||||
426
src/ocrmypdf/_sync.py
Normal file
426
src/ocrmypdf/_sync.py
Normal file
@@ -0,0 +1,426 @@
|
||||
# © 2016 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import logging
|
||||
import logging.handlers
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from tempfile import mkdtemp
|
||||
from typing import List, NamedTuple, Optional, Tuple
|
||||
|
||||
import PIL
|
||||
|
||||
from ocrmypdf._concurrent import Executor, setup_executor
|
||||
from ocrmypdf._graft import OcrGrafter
|
||||
from ocrmypdf._jobcontext import PageContext, PdfContext, cleanup_working_files
|
||||
from ocrmypdf._logging import PageNumberFilter
|
||||
from ocrmypdf._pipeline import (
|
||||
convert_to_pdfa,
|
||||
copy_final,
|
||||
create_ocr_image,
|
||||
create_pdf_page_from_image,
|
||||
create_visible_page_jpg,
|
||||
generate_postscript_stub,
|
||||
get_orientation_correction,
|
||||
get_pdfinfo,
|
||||
is_ocr_required,
|
||||
merge_sidecars,
|
||||
metadata_fixup,
|
||||
ocr_engine_hocr,
|
||||
ocr_engine_textonly_pdf,
|
||||
optimize_pdf,
|
||||
preprocess_clean,
|
||||
preprocess_deskew,
|
||||
preprocess_remove_background,
|
||||
rasterize,
|
||||
rasterize_preview,
|
||||
render_hocr_page,
|
||||
should_visible_page_image_use_jpg,
|
||||
triage,
|
||||
validate_pdfinfo_options,
|
||||
)
|
||||
from ocrmypdf._plugin_manager import get_plugin_manager
|
||||
from ocrmypdf._validation import (
|
||||
check_requested_output_file,
|
||||
create_input_file,
|
||||
report_output_file_size,
|
||||
)
|
||||
from ocrmypdf.exceptions import ExitCode, ExitCodeException
|
||||
from ocrmypdf.helpers import (
|
||||
NeverRaise,
|
||||
available_cpu_count,
|
||||
check_pdf,
|
||||
pikepdf_enable_mmap,
|
||||
samefile,
|
||||
)
|
||||
from ocrmypdf.pdfa import file_claims_pdfa
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PageResult(NamedTuple):  # pylint: disable=inherit-non-class
    """Outcome of processing a single page, as returned by ``exec_page_sync``."""

    # Zero-based page number
    pageno: int
    # PDF page built from the page image, if one was produced
    pdf_page_from_image: Optional[Path]
    # OCR output for the page, if any
    ocr: Optional[Path]
    # Sidecar text for the page, if any
    text: Optional[Path]
    # Orientation correction determined for the page (0 when none)
    orientation_correction: int
|
||||
|
||||
|
||||
# Thread-local holder for the page number currently being processed. The value
# assigned here exists only on the thread that imports this module.
tls = threading.local()
tls.pageno = None

# The factory that was installed before ours; we delegate to it.
old_factory = logging.getLogRecordFactory()


def record_factory(*args, **kwargs):
    """Create a log record and tag it with this thread's page number.

    The record is built by the previously installed factory. If the current
    thread has a ``tls.pageno`` attribute, it is copied to ``record.pageno``;
    otherwise the record is returned without that attribute.
    """
    record = old_factory(*args, **kwargs)
    try:
        record.pageno = tls.pageno
    except AttributeError:
        # This thread never assigned tls.pageno
        pass
    return record


logging.setLogRecordFactory(record_factory)
|
||||
|
||||
|
||||
def preprocess(
    page_context: PageContext,
    image: Path,
    remove_background: bool,
    deskew: bool,
    clean: bool,
) -> Path:
    """Run the enabled preprocessing steps on *image*, each feeding the next.

    The steps run in a fixed order: background removal, deskew, clean. A step
    whose flag is false is skipped.

    Returns:
        The output of the last step that ran, or *image* itself if none ran.
    """
    steps = (
        (remove_background, preprocess_remove_background),
        (deskew, preprocess_deskew),
        (clean, preprocess_clean),
    )
    for enabled, step in steps:
        if enabled:
            image = step(image, page_context)
    return image
|
||||
|
||||
|
||||
def make_intermediate_images(
    page_context: PageContext, orientation_correction: int
) -> Tuple[Path, Optional[Path]]:
    """Produce the image to OCR and, when needed, the image to show on the page.

    Returns:
        A pair ``(ocr_image, preprocess_out)``. ``preprocess_out`` is the
        preprocessed visible image; it is ``None`` when cleaning or vector
        removal is requested together with ``options.lossless_reconstruction``.
    """
    options = page_context.options

    def run_preprocess(source: Path, clean: bool) -> Path:
        # Every preprocess call here shares the same background/deskew settings
        return preprocess(
            page_context,
            source,
            options.remove_background,
            options.deskew,
            clean=clean,
        )

    visible_raster = rasterize(
        page_context.origin,
        page_context,
        correction=orientation_correction,
        remove_vectors=False,
    )

    if not any((options.clean, options.clean_final, options.remove_vectors)):
        # Simple case: one preprocessed image serves both purposes
        shared = run_preprocess(visible_raster, False)
        return shared, shared

    visible_image = None
    if not options.lossless_reconstruction:
        visible_image = run_preprocess(visible_raster, options.clean_final)

    if options.remove_vectors:
        ocr_raster = rasterize(
            page_context.origin,
            page_context,
            correction=orientation_correction,
            remove_vectors=True,
            output_tag='_ocr',
        )
    else:
        ocr_raster = visible_raster

    same_as_visible = (
        visible_image
        and ocr_raster == visible_raster
        and options.clean == options.clean_final
    )
    if same_as_visible:
        # Optimization: image for OCR is identical to presentation image
        ocr_image = visible_image
    else:
        ocr_image = run_preprocess(ocr_raster, options.clean)
    return ocr_image, visible_image
|
||||
|
||||
|
||||
def exec_page_sync(page_context: PageContext):
    """Process one page from start to finish and return its ``PageResult``.

    If ``is_ocr_required`` says the page needs no OCR, an empty result is
    returned immediately. Otherwise the page is optionally checked for rotation,
    rasterized and preprocessed, optionally turned into a replacement page
    image, and passed to the OCR engine selected by ``options.pdf_renderer``.

    Raises:
        NotImplementedError: if ``options.pdf_renderer`` is neither ``'sandwich'``
            nor a value starting with ``'hocr'``.
    """
    options = page_context.options
    # Log records created on this thread carry the one-based page number
    tls.pageno = page_context.pageno + 1

    if not is_ocr_required(page_context):
        return PageResult(
            pageno=page_context.pageno,
            pdf_page_from_image=None,
            ocr=None,
            text=None,
            orientation_correction=0,
        )

    if options.rotate_pages:
        # Rasterize
        preview = rasterize_preview(page_context.origin, page_context)
        orientation_correction = get_orientation_correction(preview, page_context)
    else:
        orientation_correction = 0

    ocr_source, visible_source = make_intermediate_images(
        page_context, orientation_correction
    )
    ocr_input = create_ocr_image(ocr_source, page_context)

    if options.lossless_reconstruction:
        page_image_pdf = None
    else:
        assert visible_source
        visible_image = visible_source
        if should_visible_page_image_use_jpg(page_context.pageinfo):
            visible_image = create_visible_page_jpg(visible_image, page_context)
        # A plugin may substitute its own version of the visible image
        replacement = page_context.plugin_manager.hook.filter_page_image(
            page=page_context, image_filename=visible_image
        )
        if replacement:
            visible_image = replacement
        page_image_pdf = create_pdf_page_from_image(
            visible_image, page_context, orientation_correction
        )

    renderer = options.pdf_renderer
    if renderer.startswith('hocr'):
        hocr, text_out = ocr_engine_hocr(ocr_input, page_context)
        ocr_out = render_hocr_page(hocr, page_context)
    elif renderer == 'sandwich':
        ocr_out, text_out = ocr_engine_textonly_pdf(ocr_input, page_context)
    else:
        raise NotImplementedError(f"pdf_renderer {options.pdf_renderer}")

    return PageResult(
        pageno=page_context.pageno,
        pdf_page_from_image=page_image_pdf,
        ocr=ocr_out,
        text=text_out,
        orientation_correction=orientation_correction,
    )
|
||||
|
||||
|
||||
def post_process(pdf_file, context: PdfContext, executor: Executor):
    """Apply the whole-document finishing steps and return the optimized PDF.

    When ``context.options.output_type`` starts with ``'pdfa'`` the file is first
    converted to PDF/A using a generated PostScript stub. Metadata fixup and
    optimization always run, in that order.
    """
    wants_pdfa = context.options.output_type.startswith('pdfa')
    if wants_pdfa:
        pdf_file = convert_to_pdfa(
            pdf_file, generate_postscript_stub(context), context
        )

    fixed = metadata_fixup(pdf_file, context)
    return optimize_pdf(fixed, context, executor)
|
||||
|
||||
|
||||
def worker_init(max_pixels: int):
    """Initializer run in each worker before it processes pages.

    Args:
        max_pixels: Value to assign to ``PIL.Image.MAX_IMAGE_PIXELS``.
    """
    # On Windows a child process does not inherit the parent's change to this
    # setting, so every worker applies it again. With threads this is redundant
    # but harmless.
    PIL.Image.MAX_IMAGE_PIXELS = max_pixels
    pikepdf_enable_mmap()
|
||||
|
||||
|
||||
def exec_concurrent(context: PdfContext, executor: Executor):
    """Execute the pipeline concurrently

    Runs ``exec_page_sync`` on every page context through *executor*, grafts each
    finished page into the output, writes the sidecar text if requested, then
    post-processes the merged PDF and copies it to ``options.output_file``.
    """
    options = context.options
    page_count = len(context.pdfinfo)
    max_workers = min(page_count, options.jobs)
    if max_workers > 1:
        log.info("Start processing %d pages concurrently", max_workers)

    sidecars: List[Optional[Path]] = [None] * page_count
    ocrgraft = OcrGrafter(context)

    def update_page(result: PageResult, pbar):
        # Called once per finished page. The bar advances twice per page,
        # matching the doubled total and the 0.5 unit scale configured below.
        try:
            tls.pageno = result.pageno + 1
            sidecars[result.pageno] = result.text
            pbar.update()
            ocrgraft.graft_page(
                pageno=result.pageno,
                image=result.pdf_page_from_image,
                textpdf=result.ocr,
                autorotate_correction=result.orientation_correction,
            )
            pbar.update()
        finally:
            tls.pageno = None

    progress_settings = {
        'total': 2 * page_count,
        'desc': 'OCR' if options.tesseract_timeout > 0 else 'Image processing',
        'unit': 'page',
        'unit_scale': 0.5,
        'disable': not options.progress_bar,
    }

    # Run exec_page_sync on every page context
    executor(
        use_threads=options.use_threads,
        max_workers=max_workers,
        tqdm_kwargs=progress_settings,
        worker_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=exec_page_sync,
        task_arguments=context.get_page_contexts(),
        task_finished=update_page,
    )

    # Output sidecar text
    if options.sidecar:
        sidecar_text = merge_sidecars(sidecars, context)
        # Copy text file to destination
        copy_final(sidecar_text, options.sidecar, context)

    # Merge layers to one single pdf
    merged_pdf = ocrgraft.finalize()

    # PDF/A and metadata
    log.info("Postprocessing...")
    final_pdf = post_process(merged_pdf, context, executor)

    # Copy PDF file to destination
    copy_final(final_pdf, options.output_file, context)
|
||||
|
||||
|
||||
def configure_debug_logging(log_filename: Path, prefix: str = ''):
    """
    Create a debug log file at a specified location.

    Arguments:
        log_filename: Where to put the log file.
        prefix: The logging domain prefix that should be sent to the log.

    Returns:
        The file handler that was attached to the logger named *prefix*.
    """
    handler = logging.FileHandler(log_filename, delay=True)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter(
            '[%(asctime)s] - %(name)s - %(levelname)7s -%(pageno)s %(message)s'
        )
    )
    # The format string uses %(pageno)s, so every record needs that field
    handler.addFilter(PageNumberFilter())

    target_logger = logging.getLogger(prefix)
    target_logger.addHandler(handler)
    return handler
|
||||
|
||||
|
||||
def run_pipeline(options, *, plugin_manager, api=False):
    """Run the complete OCR pipeline for one input and return an exit code.

    Args:
        options: Parsed options. ``options.jobs`` is filled in with the available
            CPU count when it is unset.
        plugin_manager: Plugin manager to use; when falsy, one is created from
            ``options.plugins``.
        api: When true, the ``except`` clauses below catch ``NeverRaise`` instead
            of the real exception types, so exceptions propagate to the caller.
            The debug log file is also not created in that mode.

    Returns:
        ``ExitCode.ok`` on success, otherwise the exit code describing the failure.

    The temporary work folder is always passed to ``cleanup_working_files`` on
    the way out.
    """
    # Any changes to options will not take effect for options that are already
    # bound to function parameters in the pipeline. (For example
    # options.input_file, options.pdf_renderer are already bound.)
    if not options.jobs:
        options.jobs = available_cpu_count()
    if not plugin_manager:
        plugin_manager = get_plugin_manager(options.plugins)

    work_folder = Path(mkdtemp(prefix="ocrmypdf.io."))
    debug_log_handler = None
    if (
        (options.keep_temporary_files or options.verbose >= 1)
        and not os.environ.get('PYTEST_CURRENT_TEST', '')
        and not api
    ):
        # Debug log for command line interface only with verbose output
        # See https://github.com/pytest-dev/pytest/issues/5502 for why we skip this
        # when pytest is running
        debug_log_handler = configure_debug_logging(
            work_folder / "debug.log"
        )  # pragma: no cover

    pikepdf_enable_mmap()

    executor = setup_executor(plugin_manager)
    try:
        check_requested_output_file(options)
        start_input_file, original_filename = create_input_file(options, work_folder)

        # Triage image or pdf
        origin_pdf = triage(
            original_filename, start_input_file, work_folder / 'origin.pdf', options
        )

        # Gather pdfinfo and create context
        pdfinfo = get_pdfinfo(
            origin_pdf,
            executor=executor,
            detailed_analysis=options.redo_ocr,
            progbar=options.progress_bar,
            max_workers=options.jobs if not options.use_threads else 1,  # To help debug
            check_pages=options.pages,
        )

        context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)

        # Validate options are okay for this pdf
        validate_pdfinfo_options(context)

        # Execute the pipeline
        exec_concurrent(context, executor)

        if options.output_file == '-':
            log.info("Output sent to stdout")
        elif (
            hasattr(options.output_file, 'writable') and options.output_file.writable()
        ):
            log.info("Output written to stream")
        elif samefile(options.output_file, os.devnull):
            pass  # Say nothing when sending to dev null
        else:
            if options.output_type.startswith('pdfa'):
                pdfa_info = file_claims_pdfa(options.output_file)
                if pdfa_info['pass']:
                    log.info(
                        "Output file is a %s (as expected)", pdfa_info['conformance']
                    )
                else:
                    log.warning(
                        "Output file is okay but is not PDF/A (seems to be %s)",
                        pdfa_info['conformance'],
                    )
                    return ExitCode.pdfa_conversion_failed
            if not check_pdf(options.output_file):
                log.warning('Output file: The generated PDF is INVALID')
                return ExitCode.invalid_output_pdf
            report_output_file_size(options, start_input_file, options.output_file)

    except KeyboardInterrupt if not api else NeverRaise:
        if options.verbose >= 1:
            log.exception("KeyboardInterrupt")
        else:
            log.error("KeyboardInterrupt")
        return ExitCode.ctrl_c
    except (ExitCodeException if not api else NeverRaise) as e:
        if str(e):
            log.error("%s: %s", type(e).__name__, str(e))
        else:
            log.error(type(e).__name__)
        return e.exit_code
    except Exception if not api else NeverRaise:  # pylint: disable=broad-except
        log.exception("An exception occurred while executing the pipeline")
        return ExitCode.other_error
    finally:
        if debug_log_handler:
            try:
                # configure_debug_logging() was called with its default prefix,
                # so the handler sits on the root logger, not on this module's
                # logger. Detach it there, or it would stay registered and keep
                # pointing at a log file inside the work folder.
                logging.getLogger().removeHandler(debug_log_handler)
                debug_log_handler.close()
            except OSError as e:
                print(e, file=sys.stderr)
        cleanup_working_files(work_folder, options)

    return ExitCode.ok
|
||||
431
src/ocrmypdf/_validation.py
Normal file
431
src/ocrmypdf/_validation.py
Normal file
@@ -0,0 +1,431 @@
|
||||
#!/usr/bin/env python3
|
||||
# © 2015-17 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import locale
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
from shutil import copyfileobj
|
||||
from typing import List, Set, Tuple, Union
|
||||
|
||||
import pikepdf
|
||||
import PIL
|
||||
|
||||
from ocrmypdf._exec import jbig2enc, pngquant, unpaper
|
||||
from ocrmypdf._unicodefun import verify_python3_env
|
||||
from ocrmypdf.exceptions import (
|
||||
BadArgsError,
|
||||
InputFileError,
|
||||
MissingDependencyError,
|
||||
OutputFileAccessError,
|
||||
)
|
||||
from ocrmypdf.helpers import (
|
||||
is_file_writable,
|
||||
is_iterable_notstr,
|
||||
monotonic,
|
||||
safe_symlink,
|
||||
)
|
||||
from ocrmypdf.subprocess import check_external_program
|
||||
|
||||
# -------------
|
||||
# External dependencies
|
||||
|
||||
# Languages for which check_options_output() does not warn about the 'hocr'
# PDF renderer.
HOCR_OK_LANGS = frozenset(['eng', 'deu', 'spa', 'ita', 'por'])
# Language assumed by check_options_languages() when none was requested.
DEFAULT_LANGUAGE = 'eng'  # Enforce English hegemony

log = logging.getLogger(__name__)

# --------
# Critical environment tests
# NOTE(review): this runs at import time, as a module-level side effect.
verify_python3_env()
|
||||
|
||||
|
||||
def check_platform():
    """Log an error when running in a 32-bit Python interpreter on Windows.

    This only reports the problem; it neither raises nor returns a value.
    """
    if os.name == 'nt' and sys.maxsize <= 2 ** 32:  # pragma: no cover
        # 32-bit interpreter on Windows
        log.error(
            "You are running OCRmyPDF in a 32-bit (x86) Python interpreter. "
            "Please use a 64-bit (x86-64) version of Python."
        )
|
||||
|
||||
|
||||
def check_options_languages(options, ocr_engine_languages):
    """Default the OCR language and verify the engine supports what was requested.

    Args:
        options: ``options.languages`` is set to ``{DEFAULT_LANGUAGE}`` when empty.
        ocr_engine_languages: Languages the OCR engine has data for. When empty,
            no availability check is made.

    Raises:
        MissingDependencyError: if a requested language is not available.
    """
    if not options.languages:
        options.languages = {DEFAULT_LANGUAGE}
        system_lang = locale.getlocale()[0]
        # Only mention the assumption when the system locale is not English
        if system_lang and not system_lang.startswith('en'):
            log.debug("No language specified; assuming --language %s", DEFAULT_LANGUAGE)

    if not ocr_engine_languages:
        return
    if options.languages.issubset(ocr_engine_languages):
        return

    missing = options.languages - ocr_engine_languages
    header = (
        "OCR engine does not have language data for the following "
        "requested languages: \n"
    )
    raise MissingDependencyError(header + ''.join(lang + '\n' for lang in missing))
|
||||
|
||||
|
||||
def check_options_output(options):
    """Warn about risky renderer/language combinations and set lossless mode.

    Sets ``options.lossless_reconstruction`` to true exactly when none of
    ``deskew``, ``clean_final``, ``force_ocr`` and ``remove_background`` is set.

    Raises:
        BadArgsError: if ``--redo-ocr`` is combined with an option that rules out
            lossless reconstruction.
    """
    is_latin = options.languages.issubset(HOCR_OK_LANGS)

    if options.pdf_renderer.startswith('hocr') and not is_latin:
        log.warning(
            "The 'hocr' PDF renderer is known to cause problems with one "
            "or more of the languages in your document. Use "
            "--pdf-renderer auto (the default) to avoid this issue."
        )

    # Any of these options rules out lossless reconstruction
    blocking_options = (
        options.deskew,
        options.clean_final,
        options.force_ocr,
        options.remove_background,
    )
    options.lossless_reconstruction = not any(blocking_options)

    if options.redo_ocr and not options.lossless_reconstruction:
        raise BadArgsError(
            "--redo-ocr is not currently compatible with --deskew, "
            "--clean-final, and --remove-background"
        )
|
||||
|
||||
|
||||
def check_options_sidecar(options):
    """Resolve the default sidecar filename and reject conflicting filenames.

    A sidecar value of ``'\\0'`` means no filename was given; the name then
    becomes the output filename with ``.txt`` appended.

    Raises:
        BadArgsError: if the sidecar name must be derived but output goes to
            stdout, or if the sidecar equals the input or the output file.
    """
    needs_default_name = options.sidecar == '\0'
    if needs_default_name:
        if options.output_file == '-':
            raise BadArgsError(
                "--sidecar filename must be specified when output file is stdout."
            )
        options.sidecar = options.output_file + '.txt'

    sidecar = options.sidecar
    if sidecar == options.input_file or sidecar == options.output_file:
        raise BadArgsError(
            "--sidecar file must be different from the input and output files"
        )
|
||||
|
||||
|
||||
def check_options_preprocessing(options):
    """Validate the cleaning options and check that unpaper is available.

    ``--clean-final`` implies ``--clean``, so ``options.clean`` is switched on in
    that case. When cleaning is enabled, unpaper must be installed, and any custom
    unpaper arguments are validated and replaced by the validated form.

    Raises:
        BadArgsError: if ``--unpaper-args`` is given without cleaning, or the
            custom arguments fail validation.
    """
    if options.clean_final:
        options.clean = True
    if options.unpaper_args and not options.clean:
        raise BadArgsError("--clean is required for --unpaper-args")
    if options.clean:
        check_external_program(
            program='unpaper',
            package='unpaper',
            version_checker=unpaper.version,
            need_version='6.1',
            # A plain string, like the other check_external_program() calls
            required_for='--clean, --clean-final',
        )
        try:
            if options.unpaper_args:
                options.unpaper_args = unpaper.validate_custom_args(
                    options.unpaper_args
                )
        except Exception as e:
            raise BadArgsError("--unpaper-args: " + str(e)) from e
|
||||
|
||||
|
||||
def _pages_from_ranges(ranges: str) -> Set[int]:
    """Convert a page selection such as ``"1, 3-5"`` to zero-based page numbers.

    Args:
        ranges: Comma-separated one-based page numbers and inclusive ``a-b``
            ranges; spaces are ignored. A non-string iterable is accepted too and
            is returned as a set without any further checks.

    Returns:
        The set of zero-based page numbers.

    Raises:
        BadArgsError: if an entry is not a valid number or range, a range is
            empty, nothing was selected, or a page number is less than 1.
    """
    if is_iterable_notstr(ranges):
        return set(ranges)
    pages: List[int] = []
    page_groups = ranges.replace(' ', '').split(',')
    for g in page_groups:
        if not g:
            continue
        try:
            start, end = g.split('-')
        except ValueError:
            # Not of the form "a-b", so it has to be a single page number.
            # Report malformed text the same way as any other bad argument,
            # rather than letting a raw ValueError escape.
            try:
                pages.append(int(g) - 1)
            except ValueError:
                raise BadArgsError(f"invalid page number '{g}'") from None
        else:
            try:
                new_pages = list(range(int(start) - 1, int(end)))
                if not new_pages:
                    raise BadArgsError(f"invalid page subrange '{start}-{end}'")
                pages.extend(new_pages)
            except ValueError:
                raise BadArgsError("invalid page range") from None

    if not pages:
        raise BadArgsError(
            f"The string of page ranges '{ranges}' did not contain any recognizable "
            f"page ranges."
        )

    if not monotonic(pages):
        log.warning(
            "List of pages to process contains duplicate pages, or pages that are "
            "out of order"
        )
    if any(page < 0 for page in pages):
        raise BadArgsError("pages refers to a page number less than 1")

    log.debug("OCRing only these pages: %s", pages)
    return set(pages)
|
||||
|
||||
|
||||
def check_options_ocr_behavior(options):
    """Reject conflicting OCR modes and normalize the page selection.

    ``options.pages``, when set, is replaced by the set of zero-based page
    numbers computed by ``_pages_from_ranges``.

    Raises:
        BadArgsError: if more than one of ``--force-ocr``, ``--skip-text`` and
            ``--redo-ocr`` is given.
    """
    modes = (options.force_ocr, options.skip_text, options.redo_ocr)
    chosen = sum(1 for mode in modes if mode)
    if chosen >= 2:
        raise BadArgsError("Choose only one of --force-ocr, --skip-text, --redo-ocr.")
    if options.pages:
        options.pages = _pages_from_ranges(options.pages)
|
||||
|
||||
|
||||
def check_options_optimizing(options):
    """Check the external programs used for optimization and flag unused options.

    At ``--optimize`` 2 and above, pngquant and then jbig2 are checked; jbig2 is
    passed as merely recommended unless ``--jbig2-lossy`` is set. At
    ``--optimize 0`` a warning is logged if any of the quality options was given,
    because they will be ignored.
    """
    if options.optimize >= 2:
        check_external_program(
            program='pngquant',
            package='pngquant',
            version_checker=pngquant.version,
            need_version='2.0.1',
            required_for='--optimize {2,3}',
        )
        # Although we use JBIG2 for optimize=1, don't nag about it unless the
        # user is asking for more optimization
        check_external_program(
            program='jbig2',
            package='jbig2enc',
            version_checker=jbig2enc.version,
            need_version='0.28',
            required_for='--optimize {2,3} | --jbig2-lossy',
            recommended=not options.jbig2_lossy,
        )

    quality_options = (options.jbig2_lossy, options.png_quality, options.jpeg_quality)
    if options.optimize == 0 and any(quality_options):
        log.warning(
            "The arguments --jbig2-lossy, --png-quality, and --jpeg-quality "
            "will be ignored because --optimize=0."
        )
|
||||
|
||||
|
||||
def check_options_advanced(options):
    """Warn when ``--pdfa-image-compression`` is set but the output is not PDF/A."""
    compression_requested = options.pdfa_image_compression != 'auto'
    if compression_requested and not options.output_type.startswith('pdfa'):
        log.warning(
            "--pdfa-image-compression argument only applies when "
            "--output-type is one of 'pdfa', 'pdfa-1', or 'pdfa-2'"
        )
|
||||
|
||||
|
||||
def check_options_metadata(options):
    """Reject metadata strings that contain unsupported Unicode characters.

    The title, author, keywords and subject are checked; empty or missing values
    are skipped. A character is unsupported if it is in the private-use category
    (``Co``) or lies outside the Basic Multilingual Plane.

    Raises:
        ValueError: naming the first offending character and its code point.
    """
    docinfo = (options.title, options.author, options.keywords, options.subject)
    for text in filter(None, docinfo):
        for char in text:
            codepoint = ord(char)
            if codepoint >= 0x10000 or unicodedata.category(char) == 'Co':
                raise ValueError(
                    "One of the metadata strings contains "
                    f"an unsupported Unicode character: '{char}' (U+{codepoint:X})"
                )
|
||||
|
||||
|
||||
def check_options_pillow(options):
    """Set Pillow's pixel limit from ``options.max_image_mpixels``.

    The option is given in megapixels. A result of zero pixels is stored as
    ``None`` instead.
    """
    pixel_limit = int(options.max_image_mpixels * 1_000_000)
    PIL.Image.MAX_IMAGE_PIXELS = pixel_limit if pixel_limit != 0 else None
|
||||
|
||||
|
||||
def _check_options(options, plugin_manager, ocr_engine_languages):
    """Run every option check in a fixed order, then the plugins' own checks.

    Args:
        options: The options to validate; several checks also update them.
        plugin_manager: Its ``check_options`` hook is called last.
        ocr_engine_languages: Passed to the language check.
    """
    check_platform()
    check_options_languages(options, ocr_engine_languages)

    # The remaining built-in checks all take just the options, in this order
    options_only_checks = (
        check_options_metadata,
        check_options_output,
        check_options_sidecar,
        check_options_preprocessing,
        check_options_ocr_behavior,
        check_options_optimizing,
        check_options_advanced,
        check_options_pillow,
    )
    for check in options_only_checks:
        check(options)

    plugin_manager.hook.check_options(options=options)
|
||||
|
||||
|
||||
def check_options(options, plugin_manager):
    """Validate *options*, using the languages reported by the plugin OCR engine."""
    ocr_engine = plugin_manager.hook.get_ocr_engine()
    _check_options(options, plugin_manager, ocr_engine.languages(options))
|
||||
|
||||
|
||||
def check_closed_streams(options):  # pragma: no cover
    """Work around a Python issue with multiprocessing forking on closed streams.

    https://bugs.python.org/issue28326

    Attempting to fork/exec a new Python process when any of std{in,out,err}
    is closed or not flushable may raise an exception. The workaround is to
    open devnull in place of a handle that seems to be closed. This is done
    globally so that individual fork sites do not have to be tracked.

    The problem seems to be specific to multiprocessing.Process rather than
    to all Python process forkers.

    The error actually occurs when the stream object is not flushable, but
    replacing an open, non-flushable stream with /dev/null would cause a
    silent failure, so only closed handles are replaced.

    Returns:
        True if processing can continue. False if stdin or stdout is closed
        but was requested (``'-'``) as the input or output file.
    """
    if sys.version_info[:3] >= (3, 6, 4):
        return True  # Issue fixed in Python 3.6.4+

    if sys.stderr is None:
        sys.stderr = open(os.devnull, 'w')

    stdin_closed = sys.stdin is None
    if stdin_closed and options.input_file == '-':
        log.error("Trying to read from stdin but stdin seems closed")
        return False
    if stdin_closed:
        sys.stdin = open(os.devnull, 'r')

    stdout_closed = sys.stdout is None
    if stdout_closed and options.output_file == '-':
        # Can't replace stdout if the user is piping.
        # If this case can even happen, it must be some kind of weird stream.
        log.error(
            "Output was set to stdout '-' but the stream attached to "
            "stdout does not support the flush() system call.  This "
            "will fail."
        )
        return False
    if stdout_closed:
        sys.stdout = open(os.devnull, 'w')

    return True
|
||||
|
||||
|
||||
def create_input_file(options, work_folder: Path) -> Tuple[Path, str]:
    """Make the input available as a file inside ``work_folder``.

    Three kinds of ``options.input_file`` are handled:

    * ``'-'``: standard input is copied to ``work_folder / 'stdin'``.
    * an object with a ``readable`` attribute: treated as a stream and copied
      to ``work_folder / 'stream'``.
    * anything else: treated as a path and symlinked to
      ``work_folder / 'origin'``.

    Returns:
        A tuple of the path created inside ``work_folder`` and a description
        of the input: ``"stdin"``, ``"stream"`` or the original path.

    Raises:
        InputFileError: If the stream is not readable or the input file does
            not exist.
    """
    if options.input_file == '-':
        # stdin
        log.info('reading file from standard input')
        target = work_folder / 'stdin'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(sys.stdin.buffer, stream_buffer)
        return target, "stdin"
    elif hasattr(options.input_file, 'readable'):
        if not options.input_file.readable():
            raise InputFileError("Input file stream is not readable")
        log.info('reading file from input stream')
        target = work_folder / 'stream'
        with open(target, 'wb') as stream_buffer:
            copyfileobj(options.input_file, stream_buffer)
        return target, "stream"
    else:
        try:
            target = work_folder / 'origin'
            safe_symlink(options.input_file, target)
            return target, os.fspath(options.input_file)
        except FileNotFoundError:
            msg = f"File not found - {options.input_file}"
            # Inside a Docker container, a missing file is most often a
            # directory that was not shared with the container.
            if Path('/.dockerenv').exists():  # pragma: no cover
                msg += (
                    "\nDocker cannot access your working directory unless you "
                    "explicitly share it with the Docker container and set up "
                    "permissions correctly.\n"
                    "You may find it easier to use stdin/stdout:"
                    "\n"
                    "\tdocker run -i --rm jbarlow83/ocrmypdf - - <input.pdf >output.pdf\n"
                )
            raise InputFileError(msg)
|
||||
|
||||
|
||||
def check_requested_output_file(options):
    """Verify that the requested output destination can be written to.

    Raises:
        BadArgsError: If output goes to stdout (``'-'``) and stdout is a
            terminal.
        OutputFileAccessError: If the output is a stream that is not
            writable, or a file location that is not writable.
    """
    destination = options.output_file

    if destination == '-':
        if sys.stdout.isatty():
            raise BadArgsError(
                "Output was set to stdout '-' but it looks like stdout "
                "is connected to a terminal.  Please redirect stdout to a "
                "file."
            )
        return

    if hasattr(destination, 'writable'):
        # Stream-like object
        if not destination.writable():
            raise OutputFileAccessError("Output stream is not writable")
        return

    if not is_file_writable(destination):
        raise OutputFileAccessError(
            f"Output file location ({options.output_file}) is not a writable file."
        )
|
||||
|
||||
|
||||
def report_output_file_size(options, input_file, output_file):
    """Warn if the output file is much larger than the input file.

    The output size is compared to the input size plus an allowance for the
    data that OCR adds. If the output still exceeds that by a factor of 1.35
    or more, a warning is logged listing the options and missing optional
    dependencies that may explain the growth.

    Nothing is reported when either file cannot be found (for example when
    writing to a stream) or when the input is smaller than 25000 bytes.

    Args:
        options: The options in effect for this run.
        input_file: Path of the original input file.
        output_file: Path of the file that was produced.
    """
    try:
        output_size = Path(output_file).stat().st_size
        input_size = Path(input_file).stat().st_size
    except FileNotFoundError:
        return  # Outputting to stream or something
    with pikepdf.open(output_file) as p:
        # Overhead constants obtained by estimating amount of data added by OCR
        # PDF/A conversion, and possible XMP metadata addition, with compression
        FILE_OVERHEAD = 4000
        OCR_PER_PAGE_OVERHEAD = 3000
        reasonable_overhead = FILE_OVERHEAD + OCR_PER_PAGE_OVERHEAD * len(p.pages)

    # Check for small inputs before dividing: tiny files are never reported,
    # and this also keeps an empty input from raising ZeroDivisionError.
    if input_size < 25000:
        return  # Seems fine
    ratio = output_size / input_size
    reasonable_ratio = output_size / (input_size + reasonable_overhead)
    if reasonable_ratio < 1.35:
        return  # Seems fine

    reasons = []
    # A tuple rather than a set, so the reasons are listed in a stable order.
    image_preproc = (
        'deskew',
        'clean_final',
        'remove_background',
        'oversample',
        'force_ocr',
    )
    for arg in image_preproc:
        if getattr(options, arg, False):
            reasons.append(
                f"The argument --{arg.replace('_', '-')} was issued, causing transcoding."
            )

    if options.optimize == 0:
        reasons.append("Optimization was disabled.")
    else:
        image_optimizers = {
            'jbig2': jbig2enc.available(),
            'pngquant': pngquant.available(),
        }
        for name, available in image_optimizers.items():
            if not available:
                reasons.append(
                    f"The optional dependency '{name}' was not found, so some image "
                    "optimizations could not be attempted."
                )
    if options.output_type.startswith('pdfa'):
        reasons.append("PDF/A conversion was enabled. (Try `--output-type pdf`.)")
    if options.plugins:
        reasons.append("Plugins were used.")

    if reasons:
        explanation = "Possible reasons for this include:\n" + '\n'.join(reasons) + "\n"
    else:
        explanation = "No reason for this increase is known.  Please report this issue."

    log.warning(
        f"The output file size is {ratio:.2f}× larger than the input file.\n"
        f"{explanation}"
    )
|
||||
13
src/ocrmypdf/_version.py
Normal file
13
src/ocrmypdf/_version.py
Normal file
@@ -0,0 +1,13 @@
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import pkg_resources
|
||||
|
||||
# Name of the program.
PROGRAM_NAME = 'ocrmypdf'

# Official PEP 396
# Version string of the installed 'ocrmypdf' distribution, looked up through
# pkg_resources when this module is imported.
__version__ = pkg_resources.get_distribution('ocrmypdf').version
|
||||
@@ -1,332 +0,0 @@
|
||||
# © 2018 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from contextlib import suppress
|
||||
from itertools import groupby
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
import pikepdf
|
||||
|
||||
from .exec import tesseract
|
||||
from .helpers import flatten_groups, page_number
|
||||
|
||||
|
||||
# weave_layers() saves and reopens its working PDF each time its emplacement
# counter reaches a multiple of this value. Defaults to 100; can be overridden
# with the _OCRMYPDF_MAX_REPLACE_PAGES environment variable.
MAX_REPLACE_PAGES = int(os.environ.get('_OCRMYPDF_MAX_REPLACE_PAGES', 100))
|
||||
|
||||
|
||||
def _update_page_resources(*, page, font, font_key, procset):
|
||||
"""Update this page's fonts with a reference to the Glyphless font"""
|
||||
|
||||
if '/Resources' not in page:
|
||||
page['/Resources'] = pikepdf.Dictionary({})
|
||||
resources = page['/Resources']
|
||||
try:
|
||||
fonts = resources['/Font']
|
||||
except KeyError:
|
||||
fonts = pikepdf.Dictionary({})
|
||||
if font_key is not None and font_key not in fonts:
|
||||
fonts[font_key] = font
|
||||
resources['/Font'] = fonts
|
||||
|
||||
# Reassign /ProcSet to one that just lists everything - ProcSet is
|
||||
# obsolete and doesn't matter but recommended for old viewer support
|
||||
resources['/ProcSet'] = procset
|
||||
|
||||
|
||||
def strip_invisible_text(pdf, page, log):
    """Rewrite the page's content stream without its invisible text objects.

    The content stream is parsed into (operands, operator) pairs. Everything
    outside BT...ET text objects is kept. A text object is kept only if the
    last ``Tr`` (text render mode) value seen inside it is not 3; a text
    object with no ``Tr`` is treated as mode 0 and kept. The surviving
    instructions are serialized and assigned to ``page.Contents`` as a new
    stream.

    Args:
        pdf: The pikepdf.Pdf that will own the new content stream.
        page: The page to modify in place.
        log: Logger; not used by this function.
    """
    stream = []  # Instructions to keep, in order
    in_text_obj = False  # True between BT and ET
    render_mode = 0  # Last Tr operand seen in the current text object
    text_objects = []  # Instructions of the text object being read

    # Merge the page's content streams so they can be parsed as one
    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        if not in_text_obj:
            if operator == pikepdf.Operator('BT'):
                in_text_obj = True
                # Render mode is reset for every text object
                render_mode = 0
                text_objects.append((operands, operator))
            else:
                stream.append((operands, operator))
        else:
            if operator == pikepdf.Operator('Tr'):
                render_mode = operands[0]
            text_objects.append((operands, operator))
            if operator == pikepdf.Operator('ET'):
                in_text_obj = False
                # Keep the finished text object unless its mode is 3
                if render_mode != 3:
                    stream.extend(text_objects)
                text_objects.clear()

    def convert(op):
        # Operands with unparse() are serialized with it; anything else
        # falls back to its ASCII string form.
        try:
            return op.unparse()
        except AttributeError:
            return str(op).encode('ascii')

    lines = []

    for operands, operator in stream:
        if operator == pikepdf.Operator('INLINE IMAGE'):
            # An inline image's single operand serializes the whole image
            iim = operands[0]
            line = iim.unparse()
        else:
            line = b' '.join(convert(op) for op in operands) + b' ' + operator.unparse()
        lines.append(line)

    content_stream = b'\n'.join(lines)
    page.Contents = pikepdf.Stream(pdf, content_stream)
|
||||
|
||||
|
||||
def _weave_layers_graft(
|
||||
*, pdf_base, page_num, text, font, font_key, procset, rotation, strip_old_text, log
|
||||
):
|
||||
"""Insert the text layer from text page 0 on to pdf_base at page_num"""
|
||||
|
||||
log.debug("Grafting")
|
||||
if Path(text).stat().st_size == 0:
|
||||
return
|
||||
|
||||
# This is a pointer indicating a specific page in the base file
|
||||
pdf_text = pikepdf.open(text)
|
||||
pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()
|
||||
|
||||
if not tesseract.has_textonly_pdf():
|
||||
# If we don't have textonly_pdf, edit the stream to delete the
|
||||
# instruction to draw the image Tesseract generated, which we do not
|
||||
# use.
|
||||
stream = bytearray(pdf_text_contents)
|
||||
pattern = b'/Im1 Do'
|
||||
idx = stream.find(pattern)
|
||||
stream[idx : (idx + len(pattern))] = b' ' * len(pattern)
|
||||
pdf_text_contents = bytes(stream)
|
||||
|
||||
base_page = pdf_base.pages.p(page_num)
|
||||
|
||||
# The text page always will be oriented up by this stage but the original
|
||||
# content may have a rotation applied. Wrap the text stream with a rotation
|
||||
# so it will be oriented the same way as the rest of the page content.
|
||||
# (Previous versions OCRmyPDF rotated the content layer to match the text.)
|
||||
mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
|
||||
wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]
|
||||
|
||||
mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
|
||||
wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]
|
||||
|
||||
translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
|
||||
untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
|
||||
# -rotation because the input is a clockwise angle and this formula
|
||||
# uses CCW
|
||||
rotation = -rotation % 360
|
||||
rotate = pikepdf.PdfMatrix().rotated(rotation)
|
||||
|
||||
# Because of rounding of DPI, we might get a text layer that is not
|
||||
# identically sized to the target page. Scale to adjust. Normally this
|
||||
# is within 0.998.
|
||||
if rotation in (90, 270):
|
||||
wt, ht = ht, wt
|
||||
scale_x = wp / wt
|
||||
scale_y = hp / ht
|
||||
|
||||
log.debug('%r', (scale_x, scale_y))
|
||||
scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)
|
||||
|
||||
# Translate the text so it is centered at (0, 0), rotate it there, adjust
|
||||
# for a size different between initial and text PDF, then untranslate
|
||||
ctm = translate @ rotate @ scale @ untranslate
|
||||
|
||||
pdf_text_contents = b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'
|
||||
|
||||
new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)
|
||||
|
||||
if strip_old_text:
|
||||
strip_invisible_text(pdf_base, base_page, log)
|
||||
|
||||
base_page.page_contents_add(new_text_layer, prepend=True)
|
||||
|
||||
_update_page_resources(
|
||||
page=base_page, font=font, font_key=font_key, procset=procset
|
||||
)
|
||||
pdf_text.close()
|
||||
|
||||
|
||||
def _find_font(text, pdf_base):
    """Copy a font from the PDF at path ``text`` into ``pdf_base``.

    The first page's /Font resources are searched for the keys ``/f-0-0`` and
    ``/F1``, in that order.

    Returns:
        A ``(font, font_key)`` tuple, where ``font`` is the copy now owned by
        ``pdf_base``. ``(None, None)`` is returned if the file is missing,
        cannot be opened as a PDF, has no usable first page resources, or
        contains neither font key.
    """

    candidate_keys = ('/f-0-0', '/F1')
    copied_font = None
    matched_key = None
    try:
        with pikepdf.open(text) as pdf_text:
            try:
                text_fonts = pdf_text.pages[0].Resources.get('/Font', {})
            except (AttributeError, IndexError, KeyError):
                return (None, None)

            for candidate in candidate_keys:
                source_font = text_fonts.get(candidate, None)
                if source_font is not None:
                    matched_key = candidate
                    break

            if source_font:
                copied_font = pdf_base.copy_foreign(source_font)
            return (copied_font, matched_key)
    except (FileNotFoundError, pikepdf.PdfError):
        # PdfError occurs if a 0-length file is written e.g. due to OCR timeout
        return (None, None)
|
||||
|
||||
|
||||
def weave_layers(infiles, output_file, log, context):
    """Apply text layer and/or image layer changes to baseline file

    This is where the magic happens. infiles will be the main PDF to modify,
    and optional .text.pdf and .image-layer.pdf files, organized however ruffus
    organizes them.

    From .text.pdf, we copy the content stream (which contains the Tesseract
    OCR results), and rotate it into place. The first time we do this, we also
    copy the GlyphlessFont, and then reference that font again.

    For .image-layer.pdf, we check if this is a "pointer" to the original file,
    or a new file. If a new file, we replace the page and remember that we
    replaced this page.

    Every MAX_REPLACE_PAGES (default 100) open files, we save intermediate
    results, to avoid any resource limits, since pikepdf/qpdf need to keep a
    lot of open file handles in the background. When objects are copied from
    one file to another, qpdf doesn't actually copy the data until asked to
    write, so all the resources it may need must remain available.

    For completeness, we set up a /ProcSet on every page, although it's
    unlikely any PDF viewer cares about this anymore.

    Args:
        infiles: Input file names, possibly nested in groups; flattened and
            grouped by page number here.
        output_file: Where the final PDF is saved. Intermediate files are
            named by appending a suffix with ``+``, so presumably a str --
            TODO confirm against callers.
        log: Logger for debug output.
        context: Supplies ``get_pdfinfo()``, ``get_rotation()`` and
            ``get_options()``.
    """

    def input_sorter(key):
        # Files with no parseable page number sort first, as "page" -1
        try:
            return page_number(key)
        except ValueError:
            return -1

    # groupby only groups adjacent items, so sort by the same key first
    flat_inputs = sorted(flatten_groups(infiles), key=input_sorter)
    groups = groupby(flat_inputs, key=input_sorter)

    # Extract first item
    _, basegroup = next(groups)
    base = list(basegroup)[0]
    path_base = Path(base).resolve()
    pdf_base = pikepdf.open(path_base)
    font, font_key, procset = None, None, None
    pdfinfo = context.get_pdfinfo()

    procset = pdf_base.make_indirect(
        pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
    )

    # Starts at 1, so the first intermediate save happens once this counter
    # reaches MAX_REPLACE_PAGES
    emplacements = 1
    interim_count = 0

    # Iterate rest
    for page_num, layers in groups:
        layers = list(layers)
        log.debug(page_num)
        log.debug(layers)

        text = next((ii for ii in layers if ii.endswith('.text.pdf')), None)
        image = next((ii for ii in layers if ii.endswith('.image-layer.pdf')), None)

        # The font is looked up lazily, and again after each save/reopen
        if text and not font:
            font, font_key = _find_font(text, pdf_base)

        emplaced_page = False
        content_rotation = pdfinfo[page_num - 1].rotation

        path_image = Path(image).resolve() if image else None
        if path_image is not None and path_image != path_base:
            # We are updating the old page with a rasterized PDF of the new
            # page (without changing objgen, to preserve references)
            log.debug("Emplacement update")
            with pikepdf.open(image) as pdf_image:
                emplacements += 1
                foreign_image_page = pdf_image.pages[0]
                # Append the foreign page to get a local copy, emplace that
                # copy over the target page, then drop the appended page
                pdf_base.pages.append(foreign_image_page)
                local_image_page = pdf_base.pages[-1]
                pdf_base.pages[page_num - 1].emplace(local_image_page)
                del pdf_base.pages[-1]
            emplaced_page = True

        autorotate_correction = context.get_rotation(page_num - 1)
        if emplaced_page:
            content_rotation = autorotate_correction
        text_rotation = autorotate_correction
        text_misaligned = (text_rotation - content_rotation) % 360
        log.debug(
            '%r',
            [text_rotation, autorotate_correction, text_misaligned, content_rotation],
        )

        if text and font:
            # Graft the text layer onto this page, whether new or old
            strip_old = context.get_options().redo_ocr
            _weave_layers_graft(
                pdf_base=pdf_base,
                page_num=page_num,
                text=text,
                font=font,
                font_key=font_key,
                rotation=text_misaligned,
                procset=procset,
                strip_old_text=strip_old,
                log=log,
            )

        # Correct the rotation if applicable
        pdf_base.pages[page_num - 1].Rotate = (
            content_rotation - autorotate_correction
        ) % 360

        if emplacements % MAX_REPLACE_PAGES == 0:
            # Periodically save and reload the Pdf object. This will keep a
            # lid on our memory usage for very large files. Attach the font to
            # page 1 even if page 1 doesn't use it, so we have a way to get it
            # back.
            # TODO refactor this to outside the loop
            page0 = pdf_base.pages[0]
            _update_page_resources(
                page=page0, font=font, font_key=font_key, procset=procset
            )

            # We cannot read and write the same file, that will corrupt it,
            # but we don't want to keep more copies than we need to. Delete
            # intermediates.
            # {interim_count} is the opened file we were updating
            # {interim_count - 1} can be deleted
            # {interim_count + 1} is the new file we will produce and open
            old_file = output_file + f'_working{interim_count - 1}.pdf'
            if not context.get_options().keep_temporary_files:
                with suppress(FileNotFoundError):
                    os.unlink(old_file)

            next_file = output_file + f'_working{interim_count + 1}.pdf'
            pdf_base.save(next_file)
            pdf_base.close()

            pdf_base = pikepdf.open(next_file)
            # Objects belonging to the closed Pdf must be re-fetched from the
            # reopened file
            procset = pdf_base.pages[0].Resources.ProcSet
            font, font_key = None, None  # Ensure we reacquire this information
            interim_count += 1

    pdf_base.save(output_file)
    pdf_base.close()
|
||||
340
src/ocrmypdf/api.py
Normal file
340
src/ocrmypdf/api.py
Normal file
@@ -0,0 +1,340 @@
|
||||
# © 2019 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
from enum import IntEnum
|
||||
from io import IOBase
|
||||
from pathlib import Path
|
||||
from typing import AnyStr, BinaryIO, Iterable, Optional, Union
|
||||
from warnings import warn
|
||||
|
||||
from ocrmypdf._logging import ( # pylint: disable=unused-import
|
||||
PageNumberFilter,
|
||||
TqdmConsole,
|
||||
)
|
||||
from ocrmypdf._plugin_manager import get_plugin_manager
|
||||
from ocrmypdf._sync import run_pipeline
|
||||
from ocrmypdf._validation import check_options
|
||||
from ocrmypdf.cli import ArgumentParser, get_parser
|
||||
from ocrmypdf.helpers import is_iterable_notstr
|
||||
|
||||
try:
|
||||
import coloredlogs
|
||||
except ModuleNotFoundError:
|
||||
coloredlogs = None
|
||||
|
||||
|
||||
# A file system path: a path-like object, str or bytes.
StrPath = Union[os.PathLike, AnyStr]
# Either an open binary stream or a file system path.
PathOrIO = Union[BinaryIO, StrPath]

# Held by ocr() for the whole run, so that only one ocrmypdf.ocr() call
# executes at a time within this process.
_api_lock = threading.Lock()
|
||||
|
||||
|
||||
class Verbosity(IntEnum):
    """Verbosity level for configure_logging.

    In :func:`configure_logging`, a negative level limits console output to
    errors, levels of 1 or more enable debug output, and level 2 also
    switches to a more detailed log format.
    """

    quiet = -1  #: Suppress most messages
    default = 0  #: Default level of logging
    debug = 1  #: Output ocrmypdf debug messages
    debug_all = 2  #: More detailed debugging from ocrmypdf and dependent modules
|
||||
|
||||
|
||||
def configure_logging(
    verbosity: Verbosity,
    *,
    progress_bar_friendly: bool = True,
    manage_root_logger: bool = False,
    plugin_manager=None,
):
    """Set up logging.

    Call this before :func:`ocrmypdf.ocr()` if you want ocrmypdf's output to
    look like that of the ocrmypdf command line interface. It registers log
    handlers, log filters and formatters, configures color logging to
    standard error, and adjusts the log levels of third party libraries. The
    details are fine-tuned and subject to change. The ``verbosity`` argument
    is equivalent to the ``--verbose`` argument and applies those settings.

    Use this function if you have a wrapper script for ocrmypdf and want it to
    behave much like ocrmypdf itself. If ocrmypdf is part of an application
    that manages its own logging, you probably do not want this function.

    If this function is not called, ocrmypdf does not configure logging, and
    the caller of ``ocrmypdf.ocr()`` is free to set up logging with the Python
    standard library's logging module. If it is called, the caller may still
    make further adjustments afterwards.

    Whether or not this function is called, ocrmypdf does all of its logging
    under the ``"ocrmypdf"`` logging namespace. ocrmypdf also imports
    pdfminer, which logs under ``"pdfminer"``. A library user may wish to
    configure both; note that pdfminer is extremely chatty at the log level
    ``logging.INFO``.

    This function does not set up the ``debug.log`` log file that the command
    line interface creates at certain verbosity levels. Applications should
    configure their own debug logging.

    Args:
        verbosity: Verbosity level.
        progress_bar_friendly: If True (the default), install a custom log handler
            that is compatible with progress bars and colored output.
        manage_root_logger: Configure the process's root logger.
        plugin_manager: The plugin manager, used for obtaining the custom log handler.

    Returns:
        The toplevel logger for ocrmypdf (or the root logger, if we are managing it).
    """

    logger_name = '' if manage_root_logger else 'ocrmypdf'
    log = logging.getLogger(logger_name)
    log.setLevel(logging.DEBUG)

    # Prefer the handler supplied by a plugin; otherwise log to stderr
    handler = None
    if plugin_manager and progress_bar_friendly:
        handler = plugin_manager.hook.get_logging_console()
    if not handler:
        handler = logging.StreamHandler(stream=sys.stderr)

    if verbosity >= 1:
        handler_level = logging.DEBUG
    elif verbosity < 0:
        handler_level = logging.ERROR
    else:
        handler_level = logging.INFO
    handler.setLevel(handler_level)

    handler.addFilter(PageNumberFilter())

    fmt = (
        '%(levelname)7s %(name)s -%(pageno)s %(message)s'
        if verbosity >= 2
        else '%(pageno)s%(message)s'
    )

    # Colors need coloredlogs to be installed and a terminal that supports them
    use_colors = bool(progress_bar_friendly) and bool(coloredlogs)
    if use_colors and os.name == 'nt':
        use_colors = coloredlogs.enable_ansi_support()
    if use_colors:
        use_colors = coloredlogs.terminal_supports_colors()

    formatter_class = coloredlogs.ColoredFormatter if use_colors else logging.Formatter
    handler.setFormatter(formatter_class(fmt=fmt))
    log.addHandler(handler)

    if verbosity <= 1:
        # Quieten third party libraries below the most verbose level
        logging.getLogger('pdfminer').setLevel(logging.ERROR)
        logging.getLogger('PIL').setLevel(logging.INFO)

    if manage_root_logger:
        logging.captureWarnings(True)

    return log
|
||||
|
||||
|
||||
def create_options(
    *, input_file: PathOrIO, output_file: PathOrIO, parser: ArgumentParser, **kwargs
):
    """Build an options object by passing keyword arguments through ``parser``.

    Each keyword argument is converted to its command line form
    (``some_arg`` becomes ``--some-arg``) and parsed together with the input
    and output files:

    * ``None`` values are omitted.
    * ``True`` adds the bare flag; ``False`` adds nothing.
    * Non-string iterables repeat the flag once per element.
    * int, float and Path values are converted with ``str()``.
    * ``progress_bar`` and ``plugins`` bypass the parser and are set directly
      on the result.

    Stream objects given as input or output cannot appear on a command line,
    so a placeholder is parsed and the stream is put back afterwards.

    Returns:
        The options namespace produced by ``parser.parse_args()``.

    Raises:
        TypeError: If a value has a type that cannot be converted.
    """
    bypass_argparse = {'progress_bar', 'plugins'}
    stream_types = (BinaryIO, IOBase)
    argv = []
    set_after_parse = []

    for name, value in kwargs.items():
        if value is None:
            continue

        # These arguments get special handling, for which we bypass argparse
        if name in bypass_argparse:
            set_after_parse.append((name, value))
            continue

        flag = f"--{name.replace('_', '-')}"

        # Booleans are special: add only if True, omit for False
        if isinstance(value, bool):
            if value:
                argv.append(flag)
            continue

        if is_iterable_notstr(value):
            for item in value:
                argv.extend((flag, item))
            continue

        # We have a single parameter
        if isinstance(value, (int, float, Path)):
            argv.extend((flag, str(value)))
        elif isinstance(value, str):
            argv.extend((flag, value))
        else:
            raise TypeError(f"{name}: {value} ({type(value)})")

    argv.append(
        'stream://input_file'
        if isinstance(input_file, stream_types)
        else os.fspath(input_file)
    )
    argv.append(
        'stream://output_file'
        if isinstance(output_file, stream_types)
        else os.fspath(output_file)
    )

    parser._api_mode = True
    options = parser.parse_args(argv)
    for name, value in set_after_parse:
        setattr(options, name, value)

    # Put the real stream objects back in place of their placeholders
    if options.input_file == 'stream://input_file':
        options.input_file = input_file
    if options.output_file == 'stream://output_file':
        options.output_file = output_file

    return options
|
||||
|
||||
|
||||
def ocr(  # pylint: disable=unused-argument
    input_file: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: Optional[StrPath] = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[StrPath] = None,
    plugin_manager=None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.
    A few specific arguments are discussed here:

    Args:
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        input_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the input file. If the object
            appears to be a readable stream (with methods such as ``.read()``
            and ``.seek()``), the object will be read in its entirety and saved to
            a temporary file. If ``input_file`` is  ``"-"``, standard input will be
            read.
        output_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the output file. If the object
            appears to be a writable stream (with methods such as ``.write()`` and
            ``.seek()``), the output will be written to this stream. If
            ``output_file`` is ``"-"``, the output will be written to ``sys.stdout``
            (provided that standard output does not seem to be a terminal device).
            When a stream is used as output, whether via a writable object or
            ``"-"``, some final validation steps are not performed (we do not read
            back the stream after it is written).
        plugins: A single plugin path or an iterable of them. Mutually
            exclusive with ``plugin_manager``.
        plugin_manager: An existing plugin manager to use instead of creating
            one from ``plugins``.
    Raises:
        ValueError: If both ``plugins`` and ``plugin_manager`` are given.
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    if plugins and plugin_manager:
        raise ValueError("plugins= and plugin_manager are mutually exclusive")

    # Normalize plugins to a list
    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, Path)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    # No new variable names should be assigned until these two steps are run,
    # because locals() is used to capture the function's arguments.
    # plugin_manager is excluded: it is not a command line option, and
    # create_options() cannot convert it to one.
    create_options_kwargs = {
        k: v for k, v in locals().items() if k not in ('kwargs', 'plugin_manager')
    }
    create_options_kwargs.update(kwargs)

    parser = get_parser()
    create_options_kwargs['parser'] = parser

    with _api_lock:
        # We can't allow multiple ocrmypdf.ocr() threads to run in parallel, because
        # they might install different plugins, and generally speaking we have areas
        # of code that use global state.

        if not plugin_manager:
            plugin_manager = get_plugin_manager(plugins)
        # Plugins may add their own options to the parser before it is used
        plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

        if 'verbose' in kwargs:
            warn("ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().")

        options = create_options(**create_options_kwargs)
        check_options(options, plugin_manager)
        return run_pipeline(options=options, plugin_manager=plugin_manager, api=True)
|
||||
9
src/ocrmypdf/builtin_plugins/__init__.py
Normal file
9
src/ocrmypdf/builtin_plugins/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
# © 2020 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
# This file exists only to mark builtin_plugins as a package.
|
||||
# The plugin manager will not load it, so anything defined here may not be
|
||||
# processed as a module.
|
||||
172
src/ocrmypdf/builtin_plugins/concurrency.py
Normal file
172
src/ocrmypdf/builtin_plugins/concurrency.py
Normal file
@@ -0,0 +1,172 @@
|
||||
# © 2020 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
# © 2020 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import logging
|
||||
import logging.handlers
|
||||
import multiprocessing
|
||||
import os
|
||||
import queue
|
||||
import signal
|
||||
import sys
|
||||
import threading
|
||||
from contextlib import suppress
|
||||
from multiprocessing import Pool as ProcessPool
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from typing import Callable, Iterable, Union
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from ocrmypdf import Executor, hookimpl
|
||||
from ocrmypdf._logging import TqdmConsole
|
||||
from ocrmypdf.exceptions import InputFileError
|
||||
from ocrmypdf.helpers import remove_all_log_handlers
|
||||
|
||||
# Type alias for the log queue: a multiprocessing queue when workers are
# processes, or a plain thread-safe queue when workers are threads.
Queue = Union[multiprocessing.Queue, queue.Queue]
|
||||
|
||||
|
||||
def log_listener(q: Queue):
    """Replay log records received on *q* through this process's loggers.

    Blocks on the queue until a ``None`` sentinel arrives, then returns.
    Each record is handed to the logger named in ``record.name``. Any
    exception raised while handling a record is reported on stderr and the
    loop keeps going.

    This runs as a thread rather than a process so that only one process
    writes to the final log destination.

    See https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes
    """
    running = True
    while running:
        try:
            record = q.get()
            if record is not None:
                logging.getLogger(record.name).handle(record)
            else:
                # Sentinel: the executor is done with this listener
                running = False
        except Exception:  # pylint: disable=broad-except
            import traceback  # pylint: disable=import-outside-toplevel

            print("Logging problem", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
|
||||
|
||||
|
||||
def process_sigbus(*args):
    """Signal handler: always raise InputFileError.

    The handler arguments are accepted and ignored.
    """
    message = "A worker process lost access to an input file"
    raise InputFileError(message)
|
||||
|
||||
|
||||
def process_init(q: Queue, user_init: Callable[[], None], loglevel):
    """Initialize a process pool worker.

    Args:
        q: queue that receives this worker's log records.
        user_init: zero-argument callable run after logging is set up.
        loglevel: level applied to the worker's root logger.
    """
    # The parent process shuts workers down itself, so SIGINT is ignored here
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    # Route SIGBUS to process_sigbus so the parent can abort somewhat
    # gracefully. Windows and Cygwin do not have SIGBUS; the resulting
    # AttributeError is deliberately ignored.
    try:
        signal.signal(signal.SIGBUS, process_sigbus)
    except AttributeError:
        pass

    # Drop whatever handlers are on the root logger (they belong to the
    # parent process)...
    root_logger = logging.getLogger()
    remove_all_log_handlers(root_logger)

    # ...and install a single handler that forwards records to the parent
    root_logger.setLevel(loglevel)
    forwarder = logging.handlers.QueueHandler(q)
    root_logger.addHandler(forwarder)

    user_init()
|
||||
|
||||
|
||||
def thread_init(_queue: Queue, user_init: Callable[[], None], _loglevel):
    """Initialize a thread pool worker.

    The queue and log level arguments are unused; they exist so the
    signature matches ``process_init``.
    """
    # Block SIGBUS in this thread so the main thread deals with it. Where
    # pthread_sigmask or SIGBUS is missing, the AttributeError is ignored.
    try:
        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGBUS})
    except AttributeError:
        pass

    user_init()
|
||||
|
||||
|
||||
class StandardExecutor(Executor):
    """Executor that runs tasks on a multiprocessing process or thread pool."""

    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        tqdm_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):
        """Run *task* over *task_arguments* on a worker pool.

        Args:
            use_threads: use a ThreadPool if true, otherwise a process Pool.
            max_workers: number of pool workers.
            tqdm_kwargs: keyword arguments for the progress bar class.
            worker_initializer: callable run once in each worker.
            task: callable applied to each item of *task_arguments*.
            task_arguments: iterable of arguments, one per task.
            task_finished: if truthy, called as ``task_finished(result, pbar)``
                for each result (in completion order); otherwise the progress
                bar is simply advanced.

        Raises:
            Whatever the task or callbacks raise, and KeyboardInterrupt.
        """
        if use_threads:
            log_queue = queue.Queue(-1)
            pool_class = ThreadPool
            initializer = thread_init
        else:
            log_queue = multiprocessing.Queue(-1)
            pool_class = ProcessPool
            initializer = process_init

        # Regardless of whether we use_threads for workers, the log listener
        # is always a thread in this process.
        # NOTE(review): an earlier comment here asked for the listener to be
        # created after the worker pool; the code has always started it first
        # and that order is kept.
        listener = threading.Thread(target=log_listener, args=(log_queue,))
        listener.start()

        try:
            with self.pbar_class(**tqdm_kwargs) as pbar:
                pool = pool_class(
                    processes=max_workers,
                    initializer=initializer,
                    initargs=(
                        log_queue,
                        worker_initializer,
                        logging.getLogger("").level,
                    ),
                )
                try:
                    results = pool.imap_unordered(task, task_arguments)
                    for result in results:
                        if task_finished:
                            task_finished(result, pbar)
                        else:
                            pbar.update()
                except KeyboardInterrupt:
                    # Terminate pool so we exit instantly
                    pool.terminate()
                    # Don't try listener.join() here, will deadlock
                    raise
                except Exception:
                    if not os.environ.get("PYTEST_CURRENT_TEST", ""):
                        # Unless inside pytest, exit immediately because no one
                        # wants to wait for child processes to finalize results
                        # that will be thrown away. Inside pytest, we want child
                        # processes to exit cleanly so that they output any
                        # error messages or coverage data we need from them.
                        pool.terminate()
                    raise
                finally:
                    pool.close()
                    pool.join()
        finally:
            # Always release the log listener, even when the progress bar or
            # the pool could not be created; otherwise the listener thread
            # would block on the queue forever. Sent after the pool is joined
            # so records emitted by finishing workers are still consumed.
            log_queue.put_nowait(None)

        listener.join()
|
||||
|
||||
|
||||
@hookimpl
def get_executor(progressbar_class):
    """Return a StandardExecutor that reports progress with *progressbar_class*."""
    executor = StandardExecutor(pbar_class=progressbar_class)
    return executor
|
||||
|
||||
|
||||
@hookimpl
def get_progressbar_class():
    """Return the progress bar class used by default: ``tqdm``."""
    progressbar_class = tqdm
    return progressbar_class
|
||||
|
||||
|
||||
@hookimpl
def get_logging_console():
    """Return a stream handler that writes to stderr through TqdmConsole."""
    console_stream = TqdmConsole(sys.stderr)
    return logging.StreamHandler(stream=console_stream)
|
||||
14
src/ocrmypdf/builtin_plugins/default_filters.py
Normal file
14
src/ocrmypdf/builtin_plugins/default_filters.py
Normal file
@@ -0,0 +1,14 @@
|
||||
# © 2021 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
from ocrmypdf import hookimpl
|
||||
|
||||
|
||||
@hookimpl
def filter_pdf_page(page, image_filename, output_pdf):
    """Default page filter: return *output_pdf* unchanged."""
    # pylint: disable=unused-argument
    return output_pdf
|
||||
99
src/ocrmypdf/builtin_plugins/ghostscript.py
Normal file
99
src/ocrmypdf/builtin_plugins/ghostscript.py
Normal file
@@ -0,0 +1,99 @@
|
||||
# © 2020 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import logging
|
||||
|
||||
from ocrmypdf import hookimpl
|
||||
from ocrmypdf._exec import ghostscript
|
||||
from ocrmypdf._validation import HOCR_OK_LANGS
|
||||
from ocrmypdf.exceptions import MissingDependencyError
|
||||
from ocrmypdf.subprocess import check_external_program
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _gs_version_key(version):
    """Turn a dotted version string into a tuple of ints for comparison.

    Parsing stops at the first component that does not start with an ASCII
    digit, so ``'9.27rc1'`` gives ``(9, 27)`` and ``'10.00.0'`` gives
    ``(10, 0, 0)``.
    """
    key = []
    for piece in str(version).split('.'):
        digits = ''
        for ch in piece:
            if ch not in '0123456789':
                break
            digits += ch
        if not digits:
            break
        key.append(int(digits))
    return tuple(key)


@hookimpl
def check_options(options):
    """Check the installed Ghostscript against the requested options.

    Normalizes ``options.output_type`` from ``'pdfa'`` to ``'pdfa-2'``.

    Raises:
        MissingDependencyError: if Ghostscript is a version with known
            regressions, or is too old for ``--output-type pdfa-3``.
    """
    gs_version = ghostscript.version()
    check_external_program(
        program='gs',
        package='ghostscript',
        version_checker=gs_version,
        need_version='9.15',  # limited by Travis CI / Ubuntu 14.04 backports
    )
    if gs_version in ('9.24', '9.51'):
        raise MissingDependencyError(
            f"Ghostscript {gs_version} contains serious regressions and is not "
            "supported. Please upgrade to a newer version, or downgrade to the "
            "previous version."
        )

    # Compare versions numerically. Comparing the raw strings sorts "10.0"
    # before "9.20", which misclassifies Ghostscript 10 and later as too old.
    gs_key = _gs_version_key(gs_version)

    # We have these constraints to check for.
    # 1. Ghostscript < 9.20 mangles multibyte Unicode
    # 2. hocr doesn't work on non-Latin languages (so don't select it)
    is_latin = options.languages.issubset(HOCR_OK_LANGS)
    if gs_key < (9, 20) and options.output_type != 'pdf' and not is_latin:
        # https://bugs.ghostscript.com/show_bug.cgi?id=696874
        # Ghostscript < 9.20 fails to encode multibyte characters properly
        log.warning(
            f"The installed version of Ghostscript ({gs_version}) does not work "
            "correctly with the OCR languages you specified. Use --output-type pdf or "
            "upgrade to Ghostscript 9.20 or later to avoid this issue."
        )

    if options.output_type == 'pdfa':
        options.output_type = 'pdfa-2'

    if options.output_type == 'pdfa-3' and gs_key < (9, 19):
        raise MissingDependencyError(
            "--output-type pdfa-3 requires Ghostscript 9.19 or later"
        )
|
||||
|
||||
|
||||
@hookimpl
def rasterize_pdf_page(
    input_file,
    output_file,
    raster_device,
    raster_dpi,
    pageno,
    page_dpi,
    rotation,
    filter_vector,
):
    """Rasterize a page with Ghostscript and return *output_file*.

    All arguments are forwarded unchanged to ``ghostscript.rasterize_pdf``.
    """
    render_settings = dict(
        raster_device=raster_device,
        raster_dpi=raster_dpi,
        pageno=pageno,
        page_dpi=page_dpi,
        rotation=rotation,
        filter_vector=filter_vector,
    )
    ghostscript.rasterize_pdf(input_file, output_file, **render_settings)
    return output_file
|
||||
|
||||
|
||||
@hookimpl
def generate_pdfa(
    pdf_pages,
    pdfmark,
    output_file,
    compression,
    pdf_version,
    pdfa_part,
    progressbar_class,
):
    """Generate a PDF/A with Ghostscript and return *output_file*.

    *pdfmark* is appended after the page files in the list of inputs handed
    to ``ghostscript.generate_pdfa``.
    """
    gs_inputs = list(pdf_pages)
    gs_inputs.append(pdfmark)
    ghostscript.generate_pdfa(
        pdf_pages=gs_inputs,
        output_file=output_file,
        compression=compression,
        pdf_version=pdf_version,
        pdfa_part=pdfa_part,
        progressbar_class=progressbar_class,
    )
    return output_file
|
||||
179
src/ocrmypdf/builtin_plugins/tesseract_ocr.py
Normal file
179
src/ocrmypdf/builtin_plugins/tesseract_ocr.py
Normal file
@@ -0,0 +1,179 @@
|
||||
# © 2020 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
from ocrmypdf import hookimpl
|
||||
from ocrmypdf._exec import tesseract
|
||||
from ocrmypdf.cli import numeric
|
||||
from ocrmypdf.exceptions import MissingDependencyError
|
||||
from ocrmypdf.helpers import clamp
|
||||
from ocrmypdf.pluginspec import OcrEngine
|
||||
from ocrmypdf.subprocess import check_external_program
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@hookimpl
def add_options(parser):
    """Register the Tesseract-specific command line options on *parser*."""
    group = parser.add_argument_group(
        "Tesseract", "Advanced control of Tesseract OCR"
    )

    group.add_argument(
        '--tesseract-config',
        metavar='CFG',
        action='append',
        default=[],
        help="Additional Tesseract configuration files -- see documentation",
    )

    group.add_argument(
        '--tesseract-pagesegmode',
        metavar='PSM',
        action='store',
        type=int,
        choices=range(0, 14),
        help="Set Tesseract page segmentation mode (see tesseract --help)",
    )

    oem_help = (
        "Set Tesseract 4.0 OCR engine mode: "
        "0 - original Tesseract only; "
        "1 - neural nets LSTM only; "
        "2 - Tesseract + LSTM; "
        "3 - default."
    )
    group.add_argument(
        '--tesseract-oem',
        metavar='MODE',
        action='store',
        type=int,
        choices=range(0, 4),
        help=oem_help,
    )

    group.add_argument(
        '--tesseract-timeout',
        metavar='SECONDS',
        type=numeric(float, 0),
        default=180.0,
        help='Give up on OCR after the timeout, but copy the preprocessed page '
        'into the final output',
    )

    group.add_argument(
        '--user-words',
        metavar='FILE',
        help="Specify the location of the Tesseract user words file. This is a "
        "list of words Tesseract should consider while performing OCR in "
        "addition to its standard language dictionaries. This can improve "
        "OCR quality especially for specialized and technical documents.",
    )

    group.add_argument(
        '--user-patterns',
        metavar='FILE',
        help="Specify the location of the Tesseract user patterns file.",
    )
|
||||
|
||||
|
||||
@hookimpl
def check_options(options):
    """Check the Tesseract installation and resolve Tesseract-related options.

    Replaces an ``'auto'`` PDF renderer with ``'sandwich'`` and logs warnings
    for option combinations that will not have the intended effect.
    """
    check_external_program(
        program='tesseract',
        package={'linux': 'tesseract-ocr'},
        version_checker=tesseract.version,
        need_version='4.0.0-beta.1',  # using backport for Travis CI
        version_parser=tesseract.TesseractVersion,
    )

    # 'auto' resolves to the sandwich renderer
    if options.pdf_renderer == 'auto':
        options.pdf_renderer = 'sandwich'

    user_data_unsupported = not tesseract.has_user_words()
    if user_data_unsupported and (options.user_words or options.user_patterns):
        log.warning(
            "Tesseract 4.0 ignores --user-words and --user-patterns, so these "
            "arguments have no effect."
        )

    if options.tesseract_pagesegmode in (0, 2):
        log.warning(
            "The --tesseract-pagesegmode argument you select will disable OCR. "
            "This may cause processing to fail."
        )
|
||||
|
||||
|
||||
@hookimpl
def validate(pdfinfo, options):
    """Choose the Tesseract OpenMP thread limit for this run.

    If the environment variable OMP_THREAD_LIMIT does not already hold a
    decimal integer, it is set to ``options.jobs // page_count`` clamped to
    the range 1..3. Otherwise the existing value is left in place.
    """
    # Tesseract 4.x can be multithreaded, and we also run multiple workers. We
    # want to manage how many threads it uses to avoid creating more total
    # threads than cores. Performance testing shows we're better off
    # parallelizing ocrmypdf and forcing Tesseract to be single threaded, which
    # we get by setting the envvar OMP_THREAD_LIMIT to 1. But if the page count
    # of the input file is small, then we allow Tesseract to use threads,
    # subject to the constraint:
    #     (ocrmypdf workers) * (tesseract threads) <= max_workers.
    # As of Tesseract 4.1, 3 threads is the most effective on a 4 core/8 thread
    # system.
    #
    # isdecimal() rather than isnumeric(): the latter accepts characters such
    # as superscript digits that int() cannot parse.
    if not os.environ.get('OMP_THREAD_LIMIT', '').isdecimal():
        # Guard against a zero-page input, which would divide by zero
        page_count = max(len(pdfinfo), 1)
        tess_threads = clamp(options.jobs // page_count, 1, 3)
        os.environ['OMP_THREAD_LIMIT'] = str(tess_threads)
    else:
        tess_threads = int(os.environ['OMP_THREAD_LIMIT'])
    log.debug("Using Tesseract OpenMP thread limit %d", tess_threads)
|
||||
|
||||
|
||||
class TesseractOcrEngine(OcrEngine):
    """OcrEngine implementation that delegates to the tesseract wrapper module."""

    @staticmethod
    def version():
        """Return the Tesseract version as reported by the wrapper."""
        return tesseract.version()

    @staticmethod
    def creator_tag(options):
        """Return the creator string; marks the sandwich renderer with '-PDF'."""
        if options.pdf_renderer == 'sandwich':
            tag = '-PDF'
        else:
            tag = ''
        return f"Tesseract OCR{tag} {TesseractOcrEngine.version()}"

    def __str__(self):
        return f"Tesseract OCR {TesseractOcrEngine.version()}"

    @staticmethod
    def languages(options):
        """Return the languages reported by the tesseract wrapper."""
        return tesseract.get_languages()

    @staticmethod
    def get_orientation(input_file, options):
        """Return the wrapper's orientation result for *input_file*."""
        return tesseract.get_orientation(
            input_file,
            engine_mode=options.tesseract_oem,
            timeout=options.tesseract_timeout,
        )

    @staticmethod
    def _ocr_settings(options):
        """Collect the option-derived arguments shared by both OCR outputs."""
        return dict(
            languages=options.languages,
            engine_mode=options.tesseract_oem,
            tessconfig=options.tesseract_config,
            timeout=options.tesseract_timeout,
            pagesegmode=options.tesseract_pagesegmode,
            user_words=options.user_words,
            user_patterns=options.user_patterns,
        )

    @staticmethod
    def generate_hocr(input_file, output_hocr, output_text, options):
        """Run Tesseract on *input_file*, writing hOCR and text outputs."""
        tesseract.generate_hocr(
            input_file=input_file,
            output_hocr=output_hocr,
            output_text=output_text,
            **TesseractOcrEngine._ocr_settings(options),
        )

    @staticmethod
    def generate_pdf(input_file, output_pdf, output_text, options):
        """Run Tesseract on *input_file*, writing PDF and text outputs."""
        tesseract.generate_pdf(
            input_file=input_file,
            output_pdf=output_pdf,
            output_text=output_text,
            **TesseractOcrEngine._ocr_settings(options),
        )
|
||||
|
||||
|
||||
@hookimpl
def get_ocr_engine():
    """Return a new TesseractOcrEngine instance."""
    engine = TesseractOcrEngine()
    return engine
|
||||
486
src/ocrmypdf/cli.py
Normal file
486
src/ocrmypdf/cli.py
Normal file
@@ -0,0 +1,486 @@
|
||||
# © 2015-19 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import argparse
|
||||
from typing import Optional, Type, TypeVar
|
||||
|
||||
from ocrmypdf._version import PROGRAM_NAME as _PROGRAM_NAME
|
||||
from ocrmypdf._version import __version__ as _VERSION
|
||||
|
||||
# Type variable for the base type accepted and returned by numeric().
T = TypeVar('T')
|
||||
|
||||
|
||||
def numeric(basetype: Type[T], min_: Optional[T] = None, max_: Optional[T] = None):
    """Build an argparse ``type=`` callable that enforces a numeric range.

    Args:
        basetype: type used to convert the argument string (and the bounds).
        min_: inclusive lower bound, or None for no lower bound.
        max_: inclusive upper bound, or None for no upper bound.

    Returns:
        A callable that converts a string to *basetype* and raises
        ``argparse.ArgumentTypeError`` when the value is out of range. Its
        ``__name__`` is set to the name of *basetype*.
    """
    if min_ is not None:
        min_ = basetype(min_)
    if max_ is not None:
        max_ = basetype(max_)

    def _numeric(string):
        value = basetype(string)
        below = min_ is not None and value < min_
        above = max_ is not None and value > max_
        if below or above:
            raise argparse.ArgumentTypeError(
                f"{string!r} not in valid range {(min_, max_)!r}"
            )
        return value

    _numeric.__name__ = basetype.__name__
    return _numeric
|
||||
|
||||
|
||||
class ArgumentParser(argparse.ArgumentParser):
    """Argument parser that can raise instead of calling sys.exit().

    When ``_api_mode`` is true, parse errors are raised as ``ValueError``;
    otherwise the standard argparse behavior applies.

    https://stackoverflow.com/questions/5943249/python-argparse-and-controlling-overriding-the-exit-status-code
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Off by default
        self._api_mode = False

    def error(self, message):
        """Report a parse error according to the current mode."""
        if self._api_mode:
            raise ValueError(message)
        super().error(message)
|
||||
|
||||
|
||||
class LanguageSetAction(argparse.Action):
    """Argparse action that accumulates language codes into a set.

    Each occurrence of the option contributes one code, or several codes
    joined with ``+`` (for example ``eng+deu``).
    """

    def __init__(self, option_strings, dest, default=None, **kwargs):
        if default is None:
            default = set()
        super().__init__(option_strings, dest, default=default, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # argparse stores the action's default object itself on every
        # namespace. Work on a copy so that one parse_args() call cannot leak
        # languages into the default seen by later calls.
        languages = set(getattr(namespace, self.dest, None) or ())
        # split() returns [values] when there is no '+', so this covers both
        # the single-language and the multi-language form.
        languages.update(values.split('+'))
        setattr(namespace, self.dest, languages)
|
||||
|
||||
|
||||
def get_parser():
    """Build and return the argument parser for the built-in options.

    Returns:
        An ``ArgumentParser`` with the positional arguments and the option
        groups for job control, metadata, image preprocessing, OCR,
        optimization, advanced settings and debugging.
    """
    parser = ArgumentParser(
        prog=_PROGRAM_NAME,
        allow_abbrev=True,
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""\
Generates a searchable PDF or PDF/A from a regular PDF.

OCRmyPDF rasterizes each page of the input PDF, optionally corrects page
rotation and performs image processing, runs the Tesseract OCR engine on the
image, and then creates a PDF from the OCR information.
""",
        epilog="""\
OCRmyPDF attempts to keep the output file at about the same size. If a file
contains losslessly compressed images, the images in the output file will be
losslessly compressed as well.

PDF is a page description file that attempts to preserve a layout exactly.
A PDF can contain vector objects (such as text or lines) and raster objects
(images). A page might have multiple images. OCRmyPDF is prepared to deal
with the wide variety of PDFs that exist in the wild.

When a PDF page contains text, OCRmyPDF assumes that the page has already
been OCRed or is a "born digital" page that should not be OCRed. The default
behavior is to exit in this case without producing a file. You can use the
option --skip-text to ignore pages with text, or --force-ocr to rasterize
all objects on the page and produce an image-only PDF as output.

    ocrmypdf --skip-text file_with_some_text_pages.pdf output.pdf

    ocrmypdf --force-ocr word_document.pdf output.pdf

If you are concerned about long-term archiving of PDFs, use the default option
--output-type pdfa which converts the PDF to a standardized PDF/A-2b. This
removes some features from the PDF such as Javascript or forms. If you want to
minimize the number of changes made to your PDF, use --output-type pdf.

If OCRmyPDF is given an image file as input, it will attempt to convert the
image to a PDF before processing. For more control over the conversion of
images to PDF, use the Python package img2pdf or other image to PDF software.

For example, this command uses img2pdf to convert all .png files beginning
with the 'page' prefix to a PDF, fitting each image on A4-sized paper, and
sending the result to OCRmyPDF through a pipe.

    img2pdf --pagesize A4 page*.png | ocrmypdf - myfile.pdf

Online documentation is located at:
    https://ocrmypdf.readthedocs.io/en/latest/introduction.html

""",
    )

    parser.add_argument(
        'input_file',
        metavar="input_pdf_or_image",
        help="PDF file containing the images to be OCRed (or '-' to read from "
        "standard input)",
    )
    parser.add_argument(
        'output_file',
        metavar="output_pdf",
        help="Output searchable PDF file (or '-' to write to standard output). "
        "Existing files will be overwritten. If same as input file, the "
        "input file will be updated only if processing is successful.",
    )
    parser.add_argument(
        '-l',
        '--language',
        dest='languages',
        action=LanguageSetAction,
        help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
        "all language packs installed in your system). Use -l eng+deu for "
        "multiple languages.",
    )
    parser.add_argument(
        '--image-dpi',
        metavar='DPI',
        type=int,
        help="For input image instead of PDF, use this DPI instead of file's.",
    )
    parser.add_argument(
        '--output-type',
        choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
        default='pdfa',
        # The names quoted in the help must match the values in `choices`
        help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
        "long term archiving (default, recommended) but may not be suitable "
        "for users who want their file altered as little as possible. 'pdfa' "
        "also has problems with full Unicode text. 'pdf' attempts to "
        "preserve file contents as much as possible. 'pdfa-1' creates a "
        "PDF/A-1b file. 'pdfa-2' is equivalent to 'pdfa'. 'pdfa-3' creates a "
        "PDF/A-3b file.",
    )

    # Use null string '\0' as sentinel to indicate the user supplied no argument,
    # since that is the only invalid character for filepaths on all platforms
    # bool('\0') is True in Python
    parser.add_argument(
        '--sidecar',
        nargs='?',
        const='\0',
        default=None,
        metavar='FILE',
        help="Generate sidecar text files that contain the same text recognized "
        "by Tesseract. This may be useful for building an OCR text database. "
        "If FILE is omitted, the sidecar file will be named {output_file}.txt; "
        "the next argument must NOT be the name of the input PDF. "
        "If FILE is set to '-', the sidecar is written to stdout (a "
        "convenient way to preview OCR quality). The output file and sidecar "
        "may not both use stdout at the same time.",
    )

    parser.add_argument(
        '--version',
        action='version',
        version=_VERSION,
        help="Print program version and exit",
    )

    jobcontrol = parser.add_argument_group("Job control options")
    jobcontrol.add_argument(
        '-j',
        '--jobs',
        metavar='N',
        type=numeric(int, 0, 256),
        help="Use up to N CPU cores simultaneously (default: use all).",
    )
    jobcontrol.add_argument(
        '-q', '--quiet', action='store_true', help="Suppress INFO messages"
    )
    jobcontrol.add_argument(
        '-v',
        '--verbose',
        type=numeric(int, 0, 2),
        default=0,
        const=1,
        nargs='?',
        help="Print more verbose messages for each additional verbose level. Use "
        "`-v 1` typically for much more detailed logging. Higher numbers "
        "are probably only useful in debugging.",
    )
    jobcontrol.add_argument(
        '--no-progress-bar',
        action='store_false',
        dest='progress_bar',
        help=argparse.SUPPRESS,
    )
    jobcontrol.add_argument(
        '--use-threads', action='store_true', help=argparse.SUPPRESS
    )

    metadata = parser.add_argument_group(
        "Metadata options",
        "Set output PDF/A metadata (default: copy input document's metadata)",
    )
    metadata.add_argument(
        '--title', type=str, help="Set document title (place multiple words in quotes)"
    )
    metadata.add_argument('--author', type=str, help="Set document author")
    metadata.add_argument(
        '--subject', type=str, help="Set document subject description"
    )
    metadata.add_argument('--keywords', type=str, help="Set document keywords")

    preprocessing = parser.add_argument_group(
        "Image preprocessing options",
        "Options to improve the quality of the final PDF and OCR",
    )
    preprocessing.add_argument(
        '-r',
        '--rotate-pages',
        action='store_true',
        help="Automatically rotate pages based on detected text orientation",
    )
    preprocessing.add_argument(
        '--remove-background',
        action='store_true',
        help="Attempt to remove background from gray or color pages, setting it "
        "to white",
    )
    preprocessing.add_argument(
        '-d',
        '--deskew',
        action='store_true',
        help="Deskew each page before performing OCR",
    )
    preprocessing.add_argument(
        '-c',
        '--clean',
        action='store_true',
        help="Clean pages from scanning artifacts before performing OCR, and send "
        "the cleaned page to OCR, but do not include the cleaned page in "
        "the output",
    )
    preprocessing.add_argument(
        '-i',
        '--clean-final',
        action='store_true',
        help="Clean page as above, and incorporate the cleaned image in the final "
        "PDF. Might remove desired content.",
    )
    preprocessing.add_argument(
        '--unpaper-args',
        type=str,
        default=None,
        help="A quoted string of arguments to pass to unpaper. Requires --clean. "
        "Example: --unpaper-args '--layout double'.",
    )
    preprocessing.add_argument(
        '--oversample',
        metavar='DPI',
        type=numeric(int, 0, 5000),
        default=0,
        help="Oversample images to at least the specified DPI, to improve OCR "
        "results slightly",
    )
    preprocessing.add_argument(
        '--remove-vectors',
        action='store_true',
        help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they "
        "will not be included in OCR. This can eliminate false characters.",
    )
    preprocessing.add_argument(
        '--threshold',
        action='store_true',
        help=(
            "EXPERIMENTAL. Threshold image to 1bpp before sending it to Tesseract "
            "for OCR. Can improve OCR quality compared to Tesseract's thresholder."
        ),
    )

    ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied")
    ocrsettings.add_argument(
        '-f',
        '--force-ocr',
        action='store_true',
        help="Rasterize any text or vector objects on each page, apply OCR, and "
        "save the rastered output (this rewrites the PDF)",
    )
    ocrsettings.add_argument(
        '-s',
        '--skip-text',
        action='store_true',
        help="Skip OCR on any pages that already contain text, but include the "
        "page in final output; useful for PDFs that contain a mix of "
        "images, text pages, and/or previously OCRed pages",
    )
    ocrsettings.add_argument(
        '--redo-ocr',
        action='store_true',
        help="Attempt to detect and remove the hidden OCR layer from files that "
        "were previously OCRed with OCRmyPDF or another program. Apply OCR "
        "to text found in raster images. Existing visible text objects will "
        "not be changed. If there is no existing OCR, OCR will be added.",
    )
    ocrsettings.add_argument(
        '--skip-big',
        type=numeric(float, 0, 5000),
        metavar='MPixels',
        help="Skip OCR on pages larger than the specified amount of megapixels, "
        "but include skipped pages in final output",
    )

    optimizing = parser.add_argument_group(
        "Optimization options", "Control how the PDF is optimized after OCR"
    )
    optimizing.add_argument(
        '-O',
        '--optimize',
        type=int,
        choices=range(0, 4),
        default=1,
        help=(
            "Control how PDF is optimized after processing: "
            "0 - do not optimize; "
            "1 - do safe, lossless optimizations (default); "
            "2 - do some lossy optimizations; "
            "3 - do aggressive lossy optimizations (including lossy JBIG2)"
        ),
    )
    optimizing.add_argument(
        '--jpeg-quality',
        type=numeric(int, 0, 100),
        default=0,
        metavar='Q',
        help=(
            "Adjust JPEG quality level for JPEG optimization. "
            "100 is best quality and largest output size; "
            "1 is lowest quality and smallest output; "
            "0 uses the default."
        ),
    )
    optimizing.add_argument(
        '--jpg-quality',
        type=numeric(int, 0, 100),
        default=0,
        metavar='Q',
        dest='jpeg_quality',
        help=argparse.SUPPRESS,  # Alias for --jpeg-quality
    )
    optimizing.add_argument(
        '--png-quality',
        type=numeric(int, 0, 100),
        default=0,
        metavar='Q',
        help=(
            "Adjust PNG quality level to use when quantizing PNGs. "
            "Values have same meaning as with --jpeg-quality"
        ),
    )
    optimizing.add_argument(
        '--jbig2-lossy',
        action='store_true',
        help=(
            "Enable JBIG2 lossy mode (better compression, not suitable for some "
            "use cases - see documentation)."
        ),
    )
    optimizing.add_argument(
        '--jbig2-page-group-size',
        type=numeric(int, 1, 10000),
        default=0,
        metavar='N',
        # Adjust number of pages to consider at once for JBIG2 compression
        help=argparse.SUPPRESS,
    )

    advanced = parser.add_argument_group(
        "Advanced", "Advanced options to control OCRmyPDF"
    )
    advanced.add_argument(
        '--pages',
        type=str,
        help=(
            "Limit OCR to the specified pages (ranges or comma separated), "
            "skipping others"
        ),
    )
    advanced.add_argument(
        '--max-image-mpixels',
        action='store',
        type=numeric(float, 0),
        metavar='MPixels',
        help="Set maximum number of pixels to unpack before treating an image as a "
        "decompression bomb",
        default=128.0,
    )
    advanced.add_argument(
        '--pdf-renderer',
        choices=['auto', 'hocr', 'sandwich', 'hocrdebug'],
        default='auto',
        help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
        "choose. See documentation for discussion.",
    )
    advanced.add_argument(
        '--rotate-pages-threshold',
        default=14.0,
        type=numeric(float, 0, 1000),
        metavar='CONFIDENCE',
        help="Only rotate pages when confidence is above this value (arbitrary "
        "units reported by tesseract)",
    )
    advanced.add_argument(
        '--pdfa-image-compression',
        choices=['auto', 'jpeg', 'lossless'],
        default='auto',
        help="Specify how to compress images in the output PDF/A. 'auto' lets "
        "OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
        "JPEG compression. 'lossless' uses PNG-style lossless compression "
        "for all images. Monochrome images are always compressed using a "
        "lossless codec. Compression settings "
        "are applied to all pages, including those for which OCR was "
        "skipped. Not supported for --output-type=pdf ; that setting "
        "preserves the original compression of all images.",
    )
    advanced.add_argument(
        '--fast-web-view',
        type=numeric(float, 0),
        default=1.0,
        metavar="MEGABYTES",
        help="If the size of file is more than this threshold (in MB), then "
        "linearize the PDF for fast web viewing. This allows the PDF to be "
        "displayed before it is fully downloaded in web browsers, but increases "
        "the space required slightly. By default we skip this for small files "
        "which do not benefit. If the threshold is 0 it will be applied to all "
        "files. Set the threshold very high to disable.",
    )
    advanced.add_argument(
        '--plugin',
        dest='plugins',
        action='append',
        default=[],
        help="Name of plugin to import. Argument may be issued multiple times to "
        "import multiple plugins. Plugins may be specified as module names in "
        "Python syntax, provided they are installed in the same Python (virtual) "
        "environment as ocrmypdf; or you may give the path to the Python file that "
        "contains the plugin. Plugins must conform to the specification in the "
        "OCRmyPDF documentation.",
    )

    debugging = parser.add_argument_group(
        "Debugging", "Arguments to help with troubleshooting and debugging"
    )
    debugging.add_argument(
        '-k',
        '--keep-temporary-files',
        action='store_true',
        help="Keep temporary files (helpful for debugging)",
    )
    return parser
|
||||
|
||||
|
||||
# Minimal parser that recognizes only --plugin. It has no help option and does
# not accept abbreviated option names.
plugins_only_parser = ArgumentParser(
    prog=_PROGRAM_NAME,
    add_help=False,
    allow_abbrev=False,
    fromfile_prefix_chars='@',
)
plugins_only_parser.add_argument(
    '--plugin',
    action='append',
    dest='plugins',
    default=[],
    help="Name of plugin to import.",
)
|
||||
@@ -1,19 +1,8 @@
|
||||
# © 2016 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
from enum import IntEnum
|
||||
|
||||
@@ -1,173 +0,0 @@
|
||||
# © 2016 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
"""Wrappers to manage subprocess calls"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from subprocess import run, STDOUT, PIPE, CalledProcessError
|
||||
from ..exceptions import MissingDependencyError, ExitCode
|
||||
from collections.abc import Mapping
|
||||
|
||||
|
||||
def get_version(program, *, version_arg='--version', regex=r'(\d+(\.\d+)*)'):
    """Get the version of the specified program.

    Runs ``program version_arg`` with stdout and stderr captured together and
    matches *regex* against the start of the stripped output.

    :param program: name or path of the executable to run
    :param version_arg: argument that makes the program print its version
    :param regex: pattern whose first group captures the version string
    :return: the text captured by group 1 of *regex*
    :raises MissingDependencyError: if the program cannot be found, exits
        with a nonzero status, or prints output that *regex* does not match
    """
    args_prog = [program, version_arg]
    try:
        proc = run(
            args_prog,
            close_fds=True,
            universal_newlines=True,
            stdout=PIPE,
            stderr=STDOUT,
            check=True,
        )
    except FileNotFoundError as e:
        raise MissingDependencyError(
            f"Could not find program '{program}' on the PATH"
        ) from e
    except CalledProcessError as e:
        # check=True raises only for a nonzero exit status, so the program was
        # found and did run; report what it printed.
        raise MissingDependencyError(
            f"Ran program '{program}' but it exited with an error:\n{e.output}"
        ) from e

    output = proc.stdout
    # Test the match explicitly instead of catching AttributeError, which
    # could hide unrelated attribute errors.
    match = re.match(regex, output.strip())
    if match is None:
        raise MissingDependencyError(
            f"The program '{program}' did not report its version. "
            f"Message was:\n{output}"
        )
    return match.group(1)
|
||||
|
||||
|
||||
# Message templates used when reporting missing or outdated external
# programs. Each is filled in with str.format(); the placeholders a template
# needs are the {names} that appear in its text.

missing_program = '''
The program '{program}' could not be executed or was not found on your
system PATH.
'''

missing_optional_program = '''
The program '{program}' could not be executed or was not found on your
system PATH. This program is required when you use the
{required_for} arguments. You could try omitting these arguments, or install
the package.
'''

missing_recommend_program = '''
The program '{program}' could not be executed or was not found on your
system PATH. This program is recommended when using the {required_for} arguments,
but not required, so we will proceed. For best results, install the program.
'''

old_version = '''
OCRmyPDF requires '{program}' {need_version} or higher. Your system appears
to have {found_version}. Please update this program.
'''

old_version_required_for = '''
OCRmyPDF requires '{program}' {need_version} or higher when run with the
{required_for} arguments. If you omit these arguments, OCRmyPDF may be able to
proceed. For best results, install the program.
'''

osx_install_advice = '''
If you have homebrew installed, try this command to install the missing
package:
brew install {package}
'''

linux_install_advice = '''
On systems with the APT package manager (Debian, Ubuntu), try these
commands:
sudo apt-get update
sudo apt-get install {package}

On RPM-based systems (Red Hat, Fedora), search for instructions on
installing the RPM for {program}.
'''
|
||||
|
||||
|
||||
def _get_platform():
|
||||
if sys.platform.startswith('freebsd'):
|
||||
return 'freebsd'
|
||||
elif sys.platform.startswith('linux'):
|
||||
return 'linux'
|
||||
return sys.platform
|
||||
|
||||
|
||||
def _error_trailer(log, program, package, **kwargs):
    """Log platform-specific advice on installing a missing package.

    Advice is logged at info level on 'darwin' and 'linux' only; on other
    platforms nothing is logged.

    :param log: logger-like object; only ``log.info`` is called
    :param program: program name substituted into the advice text
    :param package: package name, or a mapping from platform name (as
        returned by ``_get_platform()``) to package name
    :param kwargs: accepted and ignored so callers may pass extra context
    """
    platform = _get_platform()
    if isinstance(package, Mapping):
        # Fall back to the program name when the mapping has no entry for
        # this platform, instead of failing with KeyError while we are in the
        # middle of reporting another error.
        package = package.get(platform, program)

    if platform == 'darwin':
        log.info(osx_install_advice.format(program=program, package=package))
    elif platform == 'linux':
        log.info(linux_install_advice.format(program=program, package=package))
|
||||
|
||||
|
||||
def _error_missing_program(log, program, package, required_for, recommended):
    """Report that an external program is missing, then log install advice.

    The message and log level depend on the arguments: an error naming the
    arguments that need the program when *required_for* is truthy, otherwise
    an info-level note when *recommended* is truthy, otherwise a plain error.
    """
    fields = dict(program=program, required_for=required_for)
    if required_for:
        log.error(missing_optional_program.format(**fields))
    elif recommended:
        log.info(missing_recommend_program.format(**fields))
    else:
        log.error(missing_program.format(**fields))
    _error_trailer(
        log, program, package, required_for=required_for, recommended=recommended
    )
|
||||
|
||||
|
||||
def _error_old_version(
    log, program, package, need_version, found_version, required_for
):
    """Report that an external program is too old, then log install advice.

    When *required_for* is truthy the message names the arguments that need
    the newer version; otherwise the general "too old" message is used. Both
    are logged at error level.
    """
    template = old_version_required_for if required_for else old_version
    log.error(
        template.format(
            program=program,
            need_version=need_version,
            found_version=found_version,
            required_for=required_for,
        )
    )
    _error_trailer(
        log,
        program,
        package,
        need_version=need_version,
        found_version=found_version,
        required_for=required_for,
    )
|
||||
|
||||
|
||||
def _version_key(version):
    """Return the leading numeric components of a version string as a tuple.

    Trailing zero components are dropped so that '4' and '4.0' compare equal.
    An empty tuple means the string does not start with a number.
    """
    numbers = []
    for piece in re.split(r'[.\-_+]', version.strip()):
        digits = re.match(r'\d+', piece)
        if digits is None:
            break
        numbers.append(int(digits.group()))
        if digits.end() != len(piece):
            break  # e.g. '00alpha': keep the number, stop at the suffix
    while numbers and numbers[-1] == 0:
        numbers.pop()
    return tuple(numbers)


def _is_older_version(found_version, need_version):
    """Return True if found_version is older than need_version."""
    if isinstance(found_version, str) and isinstance(need_version, str):
        found_key = _version_key(found_version)
        need_key = _version_key(need_version)
        if found_key and need_key:
            # Compare numerically: as plain strings '9.10' < '9.5' and
            # '10.0' < '9.5', which wrongly rejects newer releases.
            return found_key < need_key
    # Non-string or non-numeric versions: keep the plain comparison.
    return found_version < need_version


def check_external_program(
    *,
    log,
    program,
    package,
    version_checker,
    need_version,
    required_for=None,
    recommended=False,
):
    """Check that an external program is installed and recent enough.

    :param log: logger-like object used for reporting
    :param program: name of the program, used in messages
    :param package: package name (or per-platform mapping) for install advice
    :param version_checker: callable returning the installed version
    :param need_version: minimum acceptable version
    :param required_for: description of the arguments that need the program
    :param recommended: if True, problems are reported but are not fatal

    Exits the process with ``ExitCode.missing_dependency`` when the program
    is missing or too old and *recommended* is false.
    """
    try:
        found_version = version_checker()
    except (CalledProcessError, FileNotFoundError, MissingDependencyError):
        _error_missing_program(log, program, package, required_for, recommended)
        if not recommended:
            sys.exit(ExitCode.missing_dependency)
        return

    if _is_older_version(found_version, need_version):
        _error_old_version(
            log, program, package, need_version, found_version, required_for
        )
        if not recommended:
            sys.exit(ExitCode.missing_dependency)

    log.debug(f'Found {program} {found_version}')
|
||||
@@ -1,291 +0,0 @@
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import re
|
||||
from functools import lru_cache
|
||||
from os import fspath
|
||||
from shutil import copy
|
||||
from subprocess import PIPE, STDOUT, run
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from . import get_version
|
||||
from ..exceptions import SubprocessOutputError
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def version():
    """Return the version string reported by the ``gs`` executable.

    A successful result is cached, so later calls do not run ``gs`` again.
    """
    gs_version = get_version('gs')
    return gs_version
|
||||
|
||||
|
||||
def jpeg_passthrough_available():
    """Return True if the installed Ghostscript supports JPEG passthrough.

    Prior to 9.23, Ghostscript decoded and re-encoded JPEGs internally. In
    9.23 it gained the ability to keep JPEGs unmodified. However, the 9.23
    implementation was buggy and would delete the last two bytes of images
    in some cases, as reported here:
    https://bugs.ghostscript.com/show_bug.cgi?id=699216

    The issue was fixed for 9.24, hence that is the first version for which
    this function reports the feature as available. (However, we don't use
    9.24 at all, so the first version that allows JPEG passthrough is 9.25.)
    """
    # Compare numerically: as plain strings '10.0' sorts before '9.24', which
    # would report the feature as missing on Ghostscript 10 and later.
    found = tuple(int(number) for number in re.findall(r'\d+', version()))
    return found >= (9, 24)
|
||||
|
||||
|
||||
def _gs_error_reported(stream):
|
||||
return re.search(r'error', stream, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
def extract_text(input_file, pageno=1):
    """Run Ghostscript's txtwrite device to get text layout information.

    The device is invoked with -dTextFormat=0; for details on that option see
    https://www.ghostscript.com/doc/current/VectorDevices.htm#TXT

    :param input_file: path of the file handed to Ghostscript
    :param pageno: number of page to extract, or all pages if None
    :return: whatever Ghostscript wrote to stdout, as bytes
    :raises SubprocessOutputError: if Ghostscript exits with a nonzero status
    """
    args_gs = [
        'gs',
        '-dQUIET',
        '-dSAFER',
        '-dBATCH',
        '-dNOPAUSE',
        '-sDEVICE=txtwrite',
        '-dTextFormat=0',
    ]
    if pageno is not None:
        # Restrict Ghostscript to the single requested page
        args_gs.append('-dFirstPage=%i' % pageno)
        args_gs.append('-dLastPage=%i' % pageno)
    args_gs.extend(['-o', '-', fspath(input_file)])

    proc = run(args_gs, stdout=PIPE, stderr=PIPE)
    if proc.returncode == 0:
        return proc.stdout

    raise SubprocessOutputError(
        'Ghostscript text extraction failed\n%s\n%s\n%s'
        % (input_file, proc.stdout.decode(), proc.stderr.decode())
    )
|
||||
|
||||
|
||||
def rasterize_pdf(
    input_file,
    output_file,
    xres,
    yres,
    raster_device,
    log,
    pageno=1,
    page_dpi=None,
    rotation=None,
    filter_vector=False,
):
    """Rasterize one page of a PDF with Ghostscript and save it as an image.

    Ghostscript renders into a temporary file; the result is reopened with
    PIL, optionally rotated, and saved to output_file with its DPI set to
    page_dpi (or to the rendering resolution if page_dpi is not given).

    :param input_file: pathlike, the PDF to render
    :param output_file: pathlike, where the image is saved
    :param xres: horizontal resolution at which to rasterize the page
    :param yres: vertical resolution at which to rasterize the page
    :param raster_device: name of the Ghostscript output device
    :param log: logger-like object
    :param pageno: page number to rasterize (beginning at page 1)
    :param page_dpi: resolution tuple (x, y) overriding output image DPI
    :param rotation: 0, 90, 180, 270: clockwise angle to rotate page
    :param filter_vector: if True, pass -dFILTERVECTOR to Ghostscript
    :raises SubprocessOutputError: if Ghostscript exits with a nonzero status
    """
    xdpi = round(xres, 6)
    ydpi = round(yres, 6)
    if not page_dpi:
        page_dpi = (xdpi, ydpi)

    with NamedTemporaryFile(delete=True) as tmp:
        args_gs = [
            'gs',
            '-dQUIET',
            '-dSAFER',
            '-dBATCH',
            '-dNOPAUSE',
            f'-sDEVICE={raster_device}',
            f'-dFirstPage={pageno}',
            f'-dLastPage={pageno}',
            f'-r{xdpi:f}x{ydpi:f}',
        ]
        if filter_vector:
            args_gs.append('-dFILTERVECTOR')
        args_gs += [
            '-o',
            tmp.name,
            '-dAutoRotatePages=/None',  # Probably has no effect on raster
            '-f',
            fspath(input_file),
        ]

        log.debug(args_gs)
        proc = run(args_gs, stdout=PIPE, stderr=STDOUT, universal_newlines=True)

        # Output that mentions an error is escalated to error level
        report = log.error if _gs_error_reported(proc.stdout) else log.debug
        report(proc.stdout)

        if proc.returncode != 0:
            log.error('Ghostscript rasterizing failed')
            raise SubprocessOutputError()

        tmp.seek(0)
        with Image.open(tmp) as im:
            if rotation is not None:
                log.debug("Rotating output by %i", rotation)
                # rotation is a clockwise angle and Image.ROTATE_* is
                # counterclockwise so this cancels out the rotation
                transposes = (
                    (90, Image.ROTATE_90),
                    (180, Image.ROTATE_180),
                    (270, Image.ROTATE_270),
                )
                for angle, method in transposes:
                    if rotation == angle:
                        im = im.transpose(method)
                        break
                if rotation % 180 == 90:
                    # A quarter turn swaps the axes, so swap the DPI as well
                    page_dpi = page_dpi[1], page_dpi[0]
            im.save(fspath(output_file), dpi=page_dpi)
|
||||
|
||||
|
||||
def generate_pdfa(
    pdf_pages,
    output_file,
    compression,
    log,
    threads=1,
    pdf_version='1.5',
    pdfa_part='2',
):
    """Generate a PDF/A.

    The pdf_pages, a list of files, will be merged into output_file. One or
    more PDF files may be merged. One of the files in this list must be a
    pdfmark file that provides Ghostscript with details on how to perform the
    PDF/A conversion. By default we pick PDF/A-2b, but this works for 1 or 3.

    compression can be 'jpeg', 'lossless', or an empty string. In 'jpeg',
    Ghostscript is instructed to convert color and grayscale images to DCT
    (JPEG encoding). In 'lossless' Ghostscript is told to convert images to
    Flate (lossless/PNG). For any other value Ghostscript is left to make its
    own decisions about how to encode images; it appears to use a heuristic
    to decide how to encode images. As of Ghostscript 9.25, we support
    passthrough JPEG which allows Ghostscript to avoid transcoding images
    entirely. (The feature was added in 9.23 but broken, and the 9.24 release
    of Ghostscript had regressions, so we don't support it until 9.25.)

    :param pdf_pages: iterable of pathlike input files
    :param output_file: pathlike, receives the Ghostscript output on success
    :param compression: 'jpeg', 'lossless', or anything else for automatic
    :param log: logger-like object
    :param threads: not used by this function
    :param pdf_version: value for -dCompatibilityLevel
    :param pdfa_part: value for -dPDFA
    :raises SubprocessOutputError: if Ghostscript exits with a nonzero status
    """
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    # Compare versions numerically: as plain strings '10.0' sorts before
    # '9.19', which would select the legacy '/RGB' form on Ghostscript 10+.
    gs_version = tuple(int(number) for number in re.findall(r'\d+', version()))

    # Older versions of Ghostscript expect a leading slash in
    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
    # git commit fe1c025d.
    strategy = 'RGB' if gs_version >= (9, 19) else '/RGB'

    if gs_version[:2] == (9, 23):
        # 9.23: new feature JPEG passthrough is broken in some cases, best to
        # disable it always
        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
        compression_args.append('-dPassThroughJPEGImages=false')

    with NamedTemporaryFile(delete=True) as gs_pdf:
        # nb no need to specify ProcessColorModel when ColorConversionStrategy
        # is set; see:
        # https://bugs.ghostscript.com/show_bug.cgi?id=699392
        args_gs = (
            [
                "gs",
                "-dQUIET",
                "-dBATCH",
                "-dNOPAUSE",
                "-dCompatibilityLevel=" + str(pdf_version),
                "-sDEVICE=pdfwrite",
                "-dAutoRotatePages=/None",
                "-sColorConversionStrategy=" + strategy,
            ]
            + compression_args
            + [
                "-dJPEGQ=95",
                # str() so an integer part number works like pdf_version does
                "-dPDFA=" + str(pdfa_part),
                "-dPDFACompatibilityPolicy=1",
                "-sOutputFile=" + gs_pdf.name,
            ]
        )
        args_gs.extend(fspath(s) for s in pdf_pages)  # Stringify Path objs
        log.debug(args_gs)
        p = run(args_gs, stdout=PIPE, stderr=STDOUT, universal_newlines=True)

        if _gs_error_reported(p.stdout):
            log.error(p.stdout)
        elif 'overprint mode not set' in p.stdout:
            # Unless someone is going to print PDF/A documents on a
            # magical sRGB printer I can't see the removal of overprinting
            # being a problem....
            log.debug(
                "Ghostscript had to remove PDF 'overprinting' from the "
                "input file to complete PDF/A conversion. "
            )
        else:
            log.debug(p.stdout)

        if p.returncode == 0:
            # Ghostscript does not change return code when it fails to create
            # PDF/A - check PDF/A status elsewhere
            copy(gs_pdf.name, fspath(output_file))
        else:
            log.error('Ghostscript PDF/A rendering failed')
            raise SubprocessOutputError()
|
||||
@@ -1,70 +0,0 @@
|
||||
# © 2018 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from functools import lru_cache
|
||||
from subprocess import run
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from . import get_version
|
||||
from ..exceptions import MissingDependencyError
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def version():
    """Return the version string reported by the ``pngquant`` executable.

    A successful result is cached, so later calls do not run the program
    again.
    """
    version_pattern = r'(\d+(\.\d+)*).*'
    return get_version('pngquant', regex=version_pattern)
|
||||
|
||||
|
||||
def available():
    """Return True if ``version()`` succeeds, False if it raises
    MissingDependencyError."""
    try:
        version()
    except MissingDependencyError:
        found = False
    else:
        found = True
    return found
|
||||
|
||||
|
||||
def quantize(input_file, output_file, quality_min, quality_max):
    """Run pngquant on input_file, writing the result to output_file.

    An input whose name ends with '.jpg' is first converted to a temporary
    PNG, because it is that PNG which is handed to pngquant.

    :param input_file: path of the source image, as a string
    :param output_file: path passed to pngquant's --output option
    :param quality_min: lower bound of pngquant's --quality range
    :param quality_max: upper bound of pngquant's --quality range
    """

    def run_pngquant(source):
        # Single place that builds the command line for both branches
        run(
            [
                'pngquant',
                '--force',
                '--skip-if-larger',
                '--output',
                output_file,
                '--quality',
                f'{quality_min}-{quality_max}',
                '--',
                source,
            ]
        )

    if input_file.endswith('.jpg'):
        # Close the source image when done instead of leaking its file handle
        with Image.open(input_file) as im, NamedTemporaryFile(suffix='.png') as tmp:
            im.save(tmp)
            tmp.flush()  # make sure pngquant sees the complete file
            run_pngquant(tmp.name)
    else:
        run_pngquant(input_file)
|
||||
@@ -1,49 +0,0 @@
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from functools import lru_cache
|
||||
from os import fspath
|
||||
from subprocess import PIPE, STDOUT, CalledProcessError, run
|
||||
|
||||
from . import get_version
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def version():
    """Return the version string reported by the ``qpdf`` executable.

    A successful result is cached, so later calls do not run the program
    again.
    """
    version_pattern = r'qpdf version (.+)'
    return get_version('qpdf', regex=version_pattern)
|
||||
|
||||
|
||||
def check(input_file, log=None):
    """Run ``qpdf --check`` on input_file and report the outcome.

    :param input_file: pathlike, the file to check
    :param log: logger-like object; the ``logging`` module is used if None
    :return: True if qpdf exits with status 0, otherwise False after logging
        qpdf's output (error level for status 2, info for status 3, warning
        for anything else)
    """
    args_qpdf = ['qpdf', '--check', fspath(input_file)]

    if log is None:
        import logging as log

    try:
        run(args_qpdf, stderr=STDOUT, stdout=PIPE, universal_newlines=True, check=True)
    except CalledProcessError as err:
        status = err.returncode
        if status == 2:
            log.error("%s: not a valid PDF, and could not repair it.", input_file)
            log.error("Details:")
            log.error(err.output)
        elif status == 3:
            log.info("qpdf --check returned warnings:")
            log.info(err.output)
        else:
            log.warning(err.output)
        return False
    else:
        return True
|
||||
@@ -1,361 +0,0 @@
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from collections import namedtuple
|
||||
from contextlib import suppress
|
||||
from functools import lru_cache
|
||||
from os import fspath
|
||||
from subprocess import (
|
||||
PIPE,
|
||||
STDOUT,
|
||||
CalledProcessError,
|
||||
TimeoutExpired,
|
||||
check_output,
|
||||
run,
|
||||
)
|
||||
from textwrap import dedent
|
||||
|
||||
from . import get_version
|
||||
from ..exceptions import MissingDependencyError, TesseractConfigError
|
||||
from ..helpers import page_number
|
||||
|
||||
# Result of an orientation check: the detected angle and a confidence value.
OrientationConfidence = namedtuple('OrientationConfidence', ('angle', 'confidence'))

# hOCR document describing a single page with no recognized text. The {0}
# and {1} fields are filled in with the page width and height when the
# template is formatted in _generate_null_hocr().
HOCR_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='tesseract 4.0.0' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "_blank.png"; bbox 0 0 {0} {1}; ppageno 0'>
</div>
</body>
</html>
"""
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def version():
    """Return the version string reported by the ``tesseract`` executable.

    A successful result is cached, so later calls do not run the program
    again.
    """
    version_pattern = r'tesseract\s(.+)'
    return get_version('tesseract', regex=version_pattern)
|
||||
|
||||
|
||||
def v4():
    """Is this Tesseract v4 or later?

    Decided by comparing the version string against '4' as text.
    """
    return '4' <= version()
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def has_textonly_pdf():
    """Does Tesseract have textonly_pdf capability?

    Available in v4.00.00alpha since January 2017. Rather than checking the
    version, this looks for 'textonly_pdf' in the parameter list printed by
    ``tesseract --print-parameters pdf``.

    :raises MissingDependencyError: if that command exits with an error
    """
    args_tess = ['tesseract', '--print-parameters', 'pdf']
    try:
        listing = check_output(args_tess, universal_newlines=True, stderr=STDOUT)
    except CalledProcessError as e:
        print("Could not --print-parameters from tesseract", file=sys.stderr)
        raise MissingDependencyError from e
    return 'textonly_pdf' in listing
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def languages():
    """Return the set of languages listed by ``tesseract --list-langs``.

    The first line of output must be the "List of available languages"
    header; each following line, stripped of whitespace, is one language.

    :raises MissingDependencyError: if Tesseract exits with an error, prints
        nothing, or prints something other than the expected header
    """

    def lang_error(output):
        # Assembled line by line so the message carries no stray indentation
        # from the source file.
        msg = (
            "Tesseract failed to report available languages.\n"
            "Output from Tesseract:\n"
            "-----------\n"
        )
        msg += output
        print(msg, file=sys.stderr)

    args_tess = ['tesseract', '--list-langs']
    try:
        proc = run(
            args_tess, universal_newlines=True, stdout=PIPE, stderr=STDOUT, check=True
        )
    except CalledProcessError as e:
        lang_error(e.output)
        raise MissingDependencyError from e

    output = proc.stdout
    lines = output.splitlines()
    # Empty output has no header line; treat it as a failure to report rather
    # than letting unpacking raise ValueError.
    if not lines or not lines[0].startswith('List of available languages'):
        lang_error(output)
        raise MissingDependencyError
    return {lang.strip() for lang in lines[1:]}
|
||||
|
||||
|
||||
def tess_base_args(langs, engine_mode):
    """Build the leading part of a Tesseract command line.

    :param langs: language codes; if non-empty they are joined with '+' and
        passed with -l
    :param engine_mode: value for --oem, added only when it is not None and
        v4() is true
    :return: list of arguments beginning with 'tesseract'
    """
    lang_args = ['-l', '+'.join(langs)] if langs else []
    oem_args = (
        ['--oem', str(engine_mode)] if engine_mode is not None and v4() else []
    )
    return ['tesseract'] + lang_args + oem_args
|
||||
|
||||
|
||||
def get_orientation(input_file, engine_mode, timeout: float, log):
    """Ask Tesseract (``--psm 0`` with the 'osd' language) for page orientation.

    :param input_file: pathlike, the image to analyze
    :param engine_mode: passed to tess_base_args()
    :param timeout: seconds to wait for Tesseract
    :param log: logger-like object
    :return: OrientationConfidence; angle 0 with confidence 0 if Tesseract
        times out or reports too few characters or an image that is too large
    :raises CalledProcessError: for any other Tesseract failure
    """
    args_tesseract = tess_base_args(['osd'], engine_mode) + [
        '--psm',
        '0',
        fspath(input_file),
        'stdout',
    ]

    try:
        stdout = check_output(args_tesseract, stderr=STDOUT, timeout=timeout)
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        if (
            b'Too few characters. Skipping this page' in e.output
            or b'Image too large' in e.output
        ):
            return OrientationConfidence(0, 0)
        # Bare raise keeps the original traceback without making the
        # exception its own __cause__.
        raise

    # Collect the "key: value" lines of Tesseract's report
    osd = {}
    for line in stdout.decode().splitlines():
        line = line.strip()
        parts = line.split(':', maxsplit=2)
        if len(parts) == 2:
            osd[parts[0].strip()] = parts[1].strip()

    angle = int(osd.get('Orientation in degrees', 0))
    return OrientationConfidence(
        angle=angle, confidence=float(osd.get('Orientation confidence', 0))
    )
|
||||
|
||||
|
||||
def tesseract_log_output(log, stdout, input_file):
    """Relay Tesseract's console output to the log, line by line.

    Each line is classified by its text: some known noise is dropped, some
    lines are replaced by a friendlier warning, and the rest are logged at
    error, warning or info level with a page-number prefix.

    :param log: logger-like object
    :param stdout: raw bytes captured from Tesseract
    :param input_file: file being processed, used for the page-number prefix
    :raises TesseractConfigError: when a line reports "parameter not found"
    """
    prefix = f"{(page_number(input_file)):4d}: [tesseract] "

    try:
        text = stdout.decode()
    except UnicodeDecodeError:
        log.error(
            prefix
            + "command line output was not utf-8. "
            + "This usually means Tesseract's language packs do not match "
            "the installed version of Tesseract."
        )
        text = stdout.decode('utf-8', 'backslashreplace')

    for line in text.splitlines():
        # Banner and leptonica noise: drop silently
        if line.startswith(("Tesseract Open Source", "Warning in pixReadMem")):
            continue
        if 'diacritics' in line:
            log.warning(prefix + "lots of diacritics - possibly poor OCR")
            continue
        if line.startswith('OSD: Weak margin'):
            log.warning(prefix + "unsure about page orientation")
            continue
        if 'Error in pixScanForForeground' in line:
            continue  # Appears to be spurious/problem with nonwhite borders
        if 'Error in boxClipToRectangle' in line:
            continue  # Always appears with pixScanForForeground message

        lowered = line.lower()
        if 'parameter not found: ' in lowered:
            log.error(prefix + line.strip())
            problem = line.split('found: ')[1]
            raise TesseractConfigError(problem)

        if 'error' in lowered or 'exception' in lowered:
            log.error(prefix + line.strip())
        elif 'warning' in lowered:
            log.warning(prefix + line.strip())
        elif 'read_params_file' in lowered:
            log.error(prefix + line.strip())
        else:
            log.info(prefix + line.strip())
|
||||
|
||||
|
||||
def page_timedout(log, input_file, timeout):
    """Warn that OCR of a page took too long; silent when timeout is 0."""
    if timeout != 0:
        page_tag = f"{(page_number(input_file)):4d}: [tesseract] "
        log.warning(page_tag + " took too long to OCR - skipping")
|
||||
|
||||
|
||||
def _generate_null_hocr(output_hocr, output_sidecar, image):
    """Produce a .hocr file that reports no text detected on a page that is
    the same size as the input image.

    :param output_hocr: path of the hOCR file to write
    :param output_sidecar: path of the text file; receives '[skipped page]'
    :param image: image file whose pixel size is written into the hOCR page
    """
    from PIL import Image

    # Only the size is needed; close the image promptly rather than leaving
    # its file handle open.
    with Image.open(image) as im:
        w, h = im.size

    with open(output_hocr, 'w', encoding="utf-8") as f:
        f.write(HOCR_TEMPLATE.format(w, h))
    with open(output_sidecar, 'w', encoding='utf-8') as f:
        f.write('[skipped page]')
|
||||
|
||||
|
||||
def generate_hocr(
    input_file,
    output_files,
    language: list,
    engine_mode,
    tessconfig: list,
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
    log,
):
    """Run Tesseract to produce an hOCR file and a sidecar text file.

    :param input_file: image to analyze
    :param output_files: output paths; the one ending in '.hocr' and the one
        ending in '.txt' are used
    :param language: list of languages to consider
    :param engine_mode: passed to tess_base_args()
    :param tessconfig: extra arguments appended to the Tesseract command
    :param timeout: seconds to wait for Tesseract
    :param pagesegmode: value for --psm, or None to omit it
    :param user_words: value for --user-words, if truthy
    :param user_patterns: value for --user-patterns, if truthy
    :param log: logger-like object
    :raises CalledProcessError: if Tesseract fails for a reason other than
        the image being too large

    On timeout, or when Tesseract reports "Image too large", an hOCR file
    with no text and a placeholder sidecar are written instead.
    """
    output_hocr = next(o for o in output_files if o.endswith('.hocr'))
    output_sidecar = next(o for o in output_files if o.endswith('.txt'))
    prefix = os.path.splitext(output_hocr)[0]

    args_tesseract = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])

    if user_words:
        args_tesseract.extend(['--user-words', user_words])

    if user_patterns:
        args_tesseract.extend(['--user-patterns', user_patterns])

    # Reminder: test suite tesseract spoofers will break after any changes
    # to the number or order of parameters here
    args_tesseract.extend([input_file, prefix, 'hocr', 'txt'] + tessconfig)
    try:
        log.debug(args_tesseract)
        stdout = check_output(args_tesseract, stderr=STDOUT, timeout=timeout)
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(log, input_file, timeout)
        _generate_null_hocr(output_hocr, output_sidecar, input_file)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_file)
        if b'Image too large' in e.output:
            _generate_null_hocr(output_hocr, output_sidecar, input_file)
            return

        # Bare raise keeps the original traceback without making the
        # exception its own __cause__.
        raise
    else:
        tesseract_log_output(log, stdout, input_file)
        # The sidecar text file will get the suffix .txt; rename it to
        # whatever caller wants it named
        if os.path.exists(prefix + '.txt'):
            shutil.move(prefix + '.txt', output_sidecar)
|
||||
|
||||
|
||||
def use_skip_page(text_only, skip_pdf, output_pdf, output_text):
    """Write placeholder outputs for a page that was skipped.

    output_text always receives '[skipped page]'. output_pdf becomes a
    symlink to skip_pdf when skip_pdf is given and text_only is false;
    otherwise it is written as a zero-byte file to indicate a skip.
    """
    with open(output_text, 'w') as sidecar:
        sidecar.write('[skipped page]')

    substitute = skip_pdf and not text_only
    if not substitute:
        # Normally, just write a 0 byte file to the output to indicate a skip
        with open(output_pdf, 'wb') as out:
            out.write(b'')
        return

    # Substitute a "skipped page"
    with suppress(FileNotFoundError):
        os.remove(output_pdf)  # In case it was partially created
    os.symlink(skip_pdf, output_pdf)
|
||||
|
||||
|
||||
def generate_pdf(
    *,
    input_image,
    skip_pdf=None,
    output_pdf,
    output_text,
    language: list,
    engine_mode,
    text_only: bool,
    tessconfig: list,
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
    log,
):
    '''Use Tesseract to render a PDF.

    input_image -- image to analyze
    skip_pdf -- if we time out, use this file as output
    output_pdf -- file to generate
    output_text -- OCR text file
    language -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    text_only -- enable tesseract text only mode?
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- value for --psm; omitted from the command when None
    user_words -- value for --user-words; omitted when falsy
    user_patterns -- value for --user-patterns; omitted when falsy
    log -- logger object

    On timeout, or when Tesseract reports "Image too large", a skip page is
    produced via use_skip_page() instead of raising. Any other Tesseract
    failure is re-raised as CalledProcessError.
    '''
    cmd = tess_base_args(language, engine_mode)

    if pagesegmode is not None:
        cmd += ['--psm', str(pagesegmode)]
    if text_only and has_textonly_pdf():
        cmd += ['-c', 'textonly_pdf=1']
    if user_words:
        cmd += ['--user-words', user_words]
    if user_patterns:
        cmd += ['--user-patterns', user_patterns]

    # Tesseract appends its own suffixes to the output base name
    prefix = os.path.splitext(output_pdf)[0]
    sidecar = prefix + '.txt'

    # Reminder: test suite tesseract spoofers might break after any changes
    # to the number or order of parameters here
    cmd.extend([input_image, prefix, 'pdf', 'txt'] + tessconfig)

    try:
        log.debug(cmd)
        stdout = check_output(cmd, stderr=STDOUT, timeout=timeout)
        # Rename the sidecar text to whatever the caller asked for
        if os.path.exists(sidecar):
            shutil.move(sidecar, output_text)
    except TimeoutExpired:
        page_timedout(log, input_image, timeout)
        use_skip_page(text_only, skip_pdf, output_pdf, output_text)
    except CalledProcessError as e:
        tesseract_log_output(log, e.output, input_image)
        if b'Image too large' not in e.output:
            raise e from e
        use_skip_page(text_only, skip_pdf, output_pdf, output_text)
    else:
        tesseract_log_output(log, stdout, input_image)
|
||||
@@ -1,129 +0,0 @@
|
||||
# © 2015 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
# unpaper documentation:
|
||||
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
|
||||
|
||||
import os
|
||||
import shlex
|
||||
import subprocess
|
||||
import sys
|
||||
from functools import lru_cache
|
||||
from subprocess import PIPE, STDOUT, CalledProcessError
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from . import get_version
|
||||
from ..exceptions import MissingDependencyError, SubprocessOutputError
|
||||
|
||||
try:
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
print("Could not find Python3 imaging library", file=sys.stderr)
|
||||
raise
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def version():
    """Return whatever ``get_version('unpaper')`` reports.

    The result is memoized by ``lru_cache``, so the lookup is performed at
    most once per process.
    """
    return get_version('unpaper')
|
||||
|
||||
|
||||
def run(input_file, output_file, dpi, log, mode_args):
    """Run unpaper on ``input_file`` and save the result to ``output_file``.

    The input is converted to a PNM variant (PBM/PGM/PPM, chosen by image
    mode), unpaper is executed inside a private temporary directory, and its
    output is re-saved to ``output_file`` with the DPI restored.

    Args:
        input_file: image to process, anything ``PIL.Image.open`` accepts.
        output_file: destination for the cleaned image.
        dpi: resolution passed to unpaper and written into the output.
        log: logger object.
        mode_args: list of extra unpaper command line arguments.

    Raises:
        MissingDependencyError: the image could not be converted to a mode
            that can be written as PNM.
        CalledProcessError: unpaper exited with a non-zero status.
        SubprocessOutputError: unpaper did not produce a readable output file.
    """
    args_unpaper = ['unpaper', '-v', '--dpi', str(dpi)] + mode_args

    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    im = Image.open(input_file)
    if im.mode not in SUFFIXES:
        log.info("Converting image to other colorspace")
        source = im
        try:
            if source.mode == 'P' and len(source.getcolors()) == 2:
                im = source.convert(mode='1')
            else:
                im = source.convert(mode='RGB')
        except IOError as e:
            log.error("Could not convert image with type " + source.mode)
            raise MissingDependencyError() from e
        finally:
            # The converted image owns its own data; release the source file
            source.close()

    try:
        suffix = SUFFIXES[im.mode]
    except KeyError as e:
        # Bind the exception here: the name from the earlier handler is
        # deleted when that handler exits, so it cannot be reused.
        log.error("Failed to convert image to a supported format.")
        im.close()
        raise MissingDependencyError() from e

    with TemporaryDirectory() as tmpdir:
        input_pnm = os.path.join(tmpdir, f'input{suffix}')
        output_pnm = os.path.join(tmpdir, f'output{suffix}')
        try:
            im.save(input_pnm, format='PPM')
        finally:
            im.close()

        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        args_unpaper.extend([input_pnm, output_pnm])
        try:
            proc = subprocess.run(
                args_unpaper,
                check=True,
                close_fds=True,
                universal_newlines=True,
                stderr=STDOUT,
                cwd=tmpdir,
                stdout=PIPE,
            )
        except CalledProcessError as e:
            log.debug(e.output)
            raise
        log.debug(proc.stdout)

        # unpaper sets dpi to 72; fix this
        try:
            with Image.open(output_pnm) as cleaned:
                cleaned.save(output_file, dpi=(dpi, dpi))
        except (FileNotFoundError, OSError):
            raise SubprocessOutputError(
                "unpaper: failed to produce the expected output file. Called with: "
                + str(args_unpaper)
            ) from None
|
||||
|
||||
|
||||
def validate_custom_args(args: str):
    """Split a ``--unpaper-args`` string into a list of arguments.

    The string is tokenized with shell-like quoting rules. Any token that
    contains ``/`` is rejected with ``ValueError``, since it could name a path.
    """
    tokens = shlex.split(args)
    for token in tokens:
        if '/' in token:
            raise ValueError('No filenames allowed in --unpaper-args')
    return tokens
|
||||
|
||||
|
||||
def clean(input_file, output_file, dpi, log, unpaper_args=None):
    """Clean an image with unpaper, using conservative defaults.

    When ``unpaper_args`` is empty or None, a default argument set is used
    that disables layout detection, alignment, centering, filtering and
    deskewing. Otherwise the caller's arguments are passed to ``run`` as is.
    """
    if unpaper_args:
        mode_args = unpaper_args
    else:
        mode_args = [
            '--layout',
            'none',
            '--mask-scan-size',
            '100',  # don't blank out narrow columns
            '--no-border-align',  # don't align visible content to borders
            '--no-mask-center',  # don't center visible content within page
            '--no-grayfilter',  # don't remove light gray areas
            '--no-blackfilter',  # don't remove solid black areas
            '--no-deskew',  # don't deskew
        ]
    run(input_file, output_file, dpi, log, mode_args)
|
||||
191
src/ocrmypdf/extra_plugins/semfree.py
Normal file
191
src/ocrmypdf/extra_plugins/semfree.py
Normal file
@@ -0,0 +1,191 @@
|
||||
# © 2021 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
"""Semaphore-free alternate executor.
|
||||
|
||||
There are two popular environments that do not fully support the standard Python
|
||||
multiprocessing module: AWS Lambda, and Termux (a terminal emulator for Android).
|
||||
|
||||
This alternate executor divvies up work among worker processes before processing,
|
||||
rather than having each worker consume work from a shared queue when they finish
|
||||
their task. This means workers have no need to coordinate with each other. Each
|
||||
worker communicates only with the main process.
|
||||
|
||||
This is not without drawbacks. If the tasks are not "even" in size, which cannot
|
||||
be guaranteed, some workers may end up with too much work while others are idle.
|
||||
It is less efficient than the standard implementation, so not the default.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import logging.handlers
|
||||
import signal
|
||||
from contextlib import suppress
|
||||
from enum import Enum, auto
|
||||
from itertools import islice, repeat, takewhile, zip_longest
|
||||
from multiprocessing import Pipe, Process
|
||||
from multiprocessing.connection import Connection, wait
|
||||
from typing import Callable, Iterable, Iterator
|
||||
|
||||
from ocrmypdf import Executor, hookimpl
|
||||
from ocrmypdf._concurrent import NullProgressBar
|
||||
from ocrmypdf.exceptions import InputFileError
|
||||
from ocrmypdf.helpers import remove_all_log_handlers
|
||||
|
||||
|
||||
class MessageType(Enum):
    """Tags for the messages a worker process sends to the main process."""

    exception = 1  # the task raised; payload is the exception
    result = 2  # the task finished; payload is its return value
    complete = 3  # the worker has no more tasks; payload is None
|
||||
|
||||
|
||||
def split_every(n: int, iterable: Iterable) -> Iterator:
    """Split iterable into consecutive groups of at most n items.

    Groups are produced lazily as lists. Every group has exactly ``n`` items
    except possibly the last, which holds the remainder. An empty iterable
    (or ``n == 0``) yields no groups.

    >>> list(split_every(4, range(10)))
    [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]

    https://stackoverflow.com/a/22919323
    """
    iterator = iter(iterable)
    # Keep slicing n items off the shared iterator; the first empty slice
    # means the input is exhausted, which stops takewhile.
    return takewhile(bool, (list(islice(iterator, n)) for _ in repeat(None)))
|
||||
|
||||
|
||||
def process_sigbus(*args):
    """Signal handler that raises ``InputFileError`` unconditionally.

    The arguments passed by the signal machinery are ignored.
    """
    raise InputFileError("A worker process lost access to an input file")
|
||||
|
||||
|
||||
class ConnectionLogHandler(logging.handlers.QueueHandler):
    """Log handler that forwards records over a multiprocessing connection.

    This subclasses ``QueueHandler`` but is constructed without a queue;
    ``enqueue`` is overridden so each record is sent through ``conn`` instead.
    """

    def __init__(self, conn: Connection) -> None:
        # No queue is used; records are delivered through the connection
        super().__init__(None)
        self.conn = conn

    def enqueue(self, record):
        """Send ``record`` over the connection as a ``('log', record)`` tuple."""
        self.conn.send(('log', record))
|
||||
|
||||
|
||||
def process_loop(
    conn: Connection, user_init: Callable[[], None], loglevel, task, task_args
):
    """Run a worker process: set up logging, then execute its share of tasks.

    Each task's return value is sent over ``conn`` as a ``result`` message.
    The first exception is sent as an ``exception`` message and stops further
    work. A ``complete`` message is always sent last, then ``conn`` is closed.
    """
    # Install SIGBUS handler (so our parent process can abort somewhat
    # gracefully). Windows and Cygwin do not have SIGBUS, hence the guard.
    try:
        signal.signal(signal.SIGBUS, process_sigbus)
    except AttributeError:
        pass

    # Route every log message of this process through the connection
    forwarder = ConnectionLogHandler(conn)
    root_logger = logging.getLogger()
    remove_all_log_handlers(root_logger)
    root_logger.setLevel(loglevel)
    root_logger.addHandler(forwarder)

    user_init()

    for item in task_args:
        try:
            outcome = task(item)
        except Exception as exc:
            conn.send((MessageType.exception, exc))
            break
        conn.send((MessageType.result, outcome))

    conn.send((MessageType.complete, None))
    conn.close()
|
||||
|
||||
|
||||
class LambdaExecutor(Executor):
    """Executor that assigns all work to worker processes up front.

    Each worker is a ``Process`` running ``process_loop`` and talks to the
    parent over its own ``Pipe``; workers do not share a queue.
    """

    def _execute(
        self,
        *,
        use_threads: bool,
        max_workers: int,
        tqdm_kwargs: dict,
        worker_initializer: Callable,
        task: Callable,
        task_arguments: Iterable,
        task_finished: Callable,
    ):
        """Run ``task`` over ``task_arguments`` and report each result.

        ``task_finished(result, pbar)`` is called in the parent for every
        result. If a worker reports an exception, all workers are terminated
        and the exception is re-raised here.
        """
        # Single-worker threaded case: run everything inline in this process.
        # worker_initializer is not called on this path.
        if use_threads and max_workers == 1:
            with self.pbar_class(**tqdm_kwargs) as pbar:
                for args in task_arguments:
                    result = task(args)
                    task_finished(result, pbar)
            return

        task_arguments = list(task_arguments)
        # split_every makes rows of max_workers items; zip_longest transposes
        # them, so chunk i holds items i, i + max_workers, ... and is padded
        # with None where the last row is short.
        grouped_args = list(
            zip_longest(*list(split_every(max_workers, task_arguments)))
        )
        if not grouped_args:
            return

        processes = []
        connections = []
        for chunk in grouped_args:
            parent_conn, child_conn = Pipe()

            # Drop zip_longest padding (this also drops any argument that is
            # itself None)
            worker_args = [args for args in chunk if args is not None]
            process = Process(
                target=process_loop,
                args=(
                    child_conn,
                    worker_initializer,
                    logging.getLogger("").level,
                    task,
                    worker_args,
                ),
            )
            process.daemon = True
            processes.append(process)
            connections.append(parent_conn)

        # Start workers only after all of them have been created
        for process in processes:
            process.start()

        with self.pbar_class(**tqdm_kwargs) as pbar:
            # Poll until every connection has been retired
            while connections:
                for r in wait(connections):
                    try:
                        msg_type, msg = r.recv()
                    except EOFError:
                        # Other end closed without a 'complete' message
                        connections.remove(r)
                        continue

                    if msg_type == MessageType.result:
                        if task_finished:
                            task_finished(msg, pbar)
                    elif msg_type == 'log':
                        # Log record forwarded by ConnectionLogHandler;
                        # replay it through this process's loggers
                        record = msg
                        logger = logging.getLogger(record.name)
                        logger.handle(record)
                    elif msg_type == MessageType.complete:
                        connections.remove(r)
                    elif msg_type == MessageType.exception:
                        # Abort all workers and surface the error here
                        for process in processes:
                            process.terminate()
                        raise msg

        for process in processes:
            process.join()
|
||||
|
||||
|
||||
@hookimpl
def get_executor(progressbar_class):
    """Plugin hook: return a ``LambdaExecutor`` that uses ``progressbar_class``."""
    return LambdaExecutor(pbar_class=progressbar_class)
|
||||
|
||||
|
||||
@hookimpl
def get_logging_console():
    """Plugin hook: return a plain ``logging.StreamHandler`` for console output."""
    return logging.StreamHandler()
|
||||
|
||||
|
||||
@hookimpl
def get_progressbar_class():
    """Plugin hook: return ``NullProgressBar`` as the progress bar class."""
    # NOTE(review): presumably NullProgressBar displays nothing - confirm
    # against ocrmypdf._concurrent.
    return NullProgressBar
|
||||
@@ -1,46 +1,88 @@
|
||||
# © 2016 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
import logging
|
||||
import multiprocessing
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import warnings
|
||||
from collections import namedtuple
|
||||
from collections.abc import Iterable
|
||||
from contextlib import suppress
|
||||
from functools import partial, wraps
|
||||
from functools import wraps
|
||||
from io import StringIO
|
||||
from math import isclose, isfinite
|
||||
from pathlib import Path
|
||||
from typing import Any, Sequence
|
||||
|
||||
import pikepdf
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def re_symlink(input_file, soft_link_name, log=None):
|
||||
"""
|
||||
Helper function: relinks soft symbolic link if necessary
|
||||
class Resolution(namedtuple('Resolution', ('x', 'y'))):
    """An (x, y) pair giving pixels per inch along each axis.

    Instances are immutable tuples; every method returns a new Resolution.
    """

    __slots__ = ()

    def round(self, ndigits: int):
        """Return a copy with both axes rounded to ``ndigits`` digits."""
        rounded_x = round(self.x, ndigits)
        rounded_y = round(self.y, ndigits)
        return Resolution(rounded_x, rounded_y)

    def to_int(self):
        """Return a copy with both axes rounded to the nearest integer."""
        whole_x = int(round(self.x))
        whole_y = int(round(self.y))
        return Resolution(whole_x, whole_y)

    @property
    def is_square(self) -> bool:
        """True when x and y agree within a relative tolerance of 1e-3."""
        return isclose(self.x, self.y, rel_tol=1e-3)

    @property
    def is_finite(self) -> bool:
        """True when neither axis is infinite or NaN."""
        if not isfinite(self.x):
            return False
        return isfinite(self.y)

    def take_max(self, vals, yvals=None):
        """Return the per-axis maximum of this resolution and other values.

        With ``yvals`` given, ``vals`` holds x candidates and ``yvals`` holds
        y candidates. Otherwise ``vals`` is an iterable of (x, y) pairs.
        """
        if yvals is None:
            best_x, best_y = self.x, self.y
            for cand_x, cand_y in vals:
                best_x = max(cand_x, best_x)
                best_y = max(cand_y, best_y)
            return Resolution(best_x, best_y)
        return Resolution(max(self.x, *vals), max(self.y, *yvals))

    def flip_axis(self):
        """Return a copy with x and y exchanged."""
        return Resolution(self.y, self.x)

    def __str__(self):
        return f"{self.x:f}x{self.y:f}"

    def __repr__(self):  # pragma: no cover
        return f"Resolution({self.x}x{self.y} dpi)"
|
||||
|
||||
|
||||
class NeverRaise(Exception):
    """An exception that is never raised.

    Usable as a placeholder exception type in an ``except`` clause that
    should match nothing.
    """
|
||||
|
||||
|
||||
def safe_symlink(input_file: os.PathLike, soft_link_name: os.PathLike):
|
||||
"""Create a symbolic link at ``soft_link_name``, which references ``input_file``.
|
||||
|
||||
Think of this as copying ``input_file`` to ``soft_link_name`` with less overhead.
|
||||
|
||||
Use symlinks safely. Self-linking loops are prevented. On Windows, file copy is
|
||||
used since symlinks may require administrator privileges. An existing link at the
|
||||
destination is removed.
|
||||
"""
|
||||
input_file = os.fspath(input_file)
|
||||
soft_link_name = os.fspath(soft_link_name)
|
||||
if log is None:
|
||||
prdebug = partial(print, file=sys.stderr)
|
||||
else:
|
||||
prdebug = log.debug
|
||||
|
||||
# Guard against soft linking to oneself
|
||||
if input_file == soft_link_name:
|
||||
prdebug(
|
||||
"Warning: No symbolic link made. You are using "
|
||||
+ "the original data directory as the working directory."
|
||||
log.warning(
|
||||
"No symbolic link created. You are using the original data directory "
|
||||
"as the working directory."
|
||||
)
|
||||
return
|
||||
|
||||
@@ -48,90 +90,165 @@ def re_symlink(input_file, soft_link_name, log=None):
|
||||
if os.path.lexists(soft_link_name):
|
||||
# do not delete or overwrite real (non-soft link) file
|
||||
if not os.path.islink(soft_link_name):
|
||||
raise FileExistsError("%s exists and is not a link" % soft_link_name)
|
||||
try:
|
||||
os.unlink(soft_link_name)
|
||||
except OSError:
|
||||
prdebug("Can't unlink %s" % (soft_link_name))
|
||||
raise FileExistsError(f"{soft_link_name} exists and is not a link")
|
||||
os.unlink(soft_link_name)
|
||||
|
||||
if not os.path.exists(input_file):
|
||||
raise FileNotFoundError("trying to create a broken symlink to %s" % input_file)
|
||||
raise FileNotFoundError(f"trying to create a broken symlink to {input_file}")
|
||||
|
||||
prdebug("os.symlink(%s, %s)" % (input_file, soft_link_name))
|
||||
if os.name == 'nt':
|
||||
# Don't actually use symlinks on Windows due to permission issues
|
||||
shutil.copyfile(input_file, soft_link_name)
|
||||
return
|
||||
|
||||
log.debug("os.symlink(%s, %s)", input_file, soft_link_name)
|
||||
|
||||
# Create symbolic link using absolute path
|
||||
os.symlink(os.path.abspath(input_file), soft_link_name)
|
||||
|
||||
|
||||
def is_iterable_notstr(thing):
|
||||
def samefile(f1: os.PathLike, f2: os.PathLike):
    """Report whether ``f1`` and ``f2`` refer to the same file.

    On Windows (``os.name == 'nt'``) the two arguments are simply compared
    for equality; elsewhere ``os.path.samefile`` decides.
    """
    if os.name != 'nt':
        return os.path.samefile(f1, f2)
    return f1 == f2
|
||||
|
||||
|
||||
def is_iterable_notstr(thing: Any) -> bool:
    """Is this an iterable type, other than a string?"""
    if not isinstance(thing, Iterable):
        return False
    return not isinstance(thing, str)
|
||||
|
||||
|
||||
def page_number(input_file):
|
||||
def monotonic(L: Sequence) -> bool:
    """Is every element of this sequence strictly greater than the one before?

    Empty and single-element sequences count as monotonic.
    """
    for previous, current in zip(L, L[1:]):
        if not current > previous:
            return False
    return True
|
||||
|
||||
|
||||
def page_number(input_file: os.PathLike) -> int:
    """Return the one-based page number encoded in a filename.

    The first six characters of the file's base name are parsed as an
    integer, e.g. ``000002.pdf`` gives 2.
    """
    filename = os.path.basename(os.fspath(input_file))
    digits = filename[:6]
    return int(digits)
|
||||
|
||||
|
||||
def available_cpu_count():
|
||||
def available_cpu_count() -> int:
|
||||
"""Returns number of CPUs in the system."""
|
||||
try:
|
||||
return multiprocessing.cpu_count()
|
||||
except NotImplementedError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import psutil
|
||||
|
||||
return psutil.cpu_count()
|
||||
except (ImportError, AttributeError):
|
||||
pass
|
||||
|
||||
warnings.warn(
|
||||
"Could not get CPU count. Assuming one (1) CPU." "Use -j N to set manually."
|
||||
)
|
||||
return 1
|
||||
|
||||
|
||||
def is_file_writable(test_file):
|
||||
def is_file_writable(test_file: os.PathLike) -> bool:
|
||||
"""Intentionally racy test if target is writable.
|
||||
|
||||
We intend to write to the output file if and only if we succeed and
|
||||
can replace it atomically. Before doing the OCR work, make sure
|
||||
the location is writable.
|
||||
"""
|
||||
p = Path(test_file)
|
||||
try:
|
||||
p = Path(test_file)
|
||||
if p.is_symlink():
|
||||
p = p.resolve(strict=False)
|
||||
|
||||
if p.is_symlink():
|
||||
p = p.resolve(strict=False)
|
||||
# p.is_file() throws an exception in some cases
|
||||
if p.exists() and p.is_file():
|
||||
return os.access(
|
||||
os.fspath(p),
|
||||
os.W_OK,
|
||||
effective_ids=(os.access in os.supports_effective_ids),
|
||||
)
|
||||
else:
|
||||
try:
|
||||
fp = p.open('wb')
|
||||
except OSError:
|
||||
return False
|
||||
else:
|
||||
fp.close()
|
||||
with suppress(OSError):
|
||||
p.unlink()
|
||||
return True
|
||||
except (EnvironmentError, RuntimeError) as e:
|
||||
log.debug(e)
|
||||
log.error(str(e))
|
||||
return False
|
||||
|
||||
# p.is_file() throws an exception in some cases
|
||||
if p.exists() and p.is_file():
|
||||
return os.access(
|
||||
os.fspath(p),
|
||||
os.W_OK,
|
||||
effective_ids=(os.access in os.supports_effective_ids),
|
||||
)
|
||||
|
||||
def check_pdf(input_file: Path) -> bool:
|
||||
"""Check if a PDF complies with the PDF specification.
|
||||
|
||||
Checks for proper formatting and proper linearization. Uses pikepdf (which in
|
||||
turn, uses QPDF) to perform the checks.
|
||||
"""
|
||||
try:
|
||||
pdf = pikepdf.open(input_file)
|
||||
except pikepdf.PdfError as e:
|
||||
log.error(e)
|
||||
return False
|
||||
else:
|
||||
try:
|
||||
fp = p.open('wb')
|
||||
except OSError:
|
||||
with pdf:
|
||||
messages = pdf.check()
|
||||
for msg in messages:
|
||||
if 'error' in msg.lower():
|
||||
log.error(msg)
|
||||
else:
|
||||
log.warning(msg)
|
||||
|
||||
sio = StringIO()
|
||||
linearize_msgs = ''
|
||||
try:
|
||||
# If linearization is missing entirely, we do not complain. We do
|
||||
# complain if linearization is present but incorrect.
|
||||
pdf.check_linearization(sio)
|
||||
except RuntimeError:
|
||||
pass
|
||||
except (
|
||||
# Workaround for a problematic pikepdf version
|
||||
# pragma: no cover
|
||||
getattr(pikepdf, 'ForeignObjectError')
|
||||
if pikepdf.__version__ == '2.1.0'
|
||||
else NeverRaise
|
||||
):
|
||||
pass
|
||||
else:
|
||||
linearize_msgs = sio.getvalue()
|
||||
if linearize_msgs:
|
||||
log.warning(linearize_msgs)
|
||||
|
||||
if not messages and not linearize_msgs:
|
||||
return True
|
||||
return False
|
||||
else:
|
||||
fp.close()
|
||||
with suppress(OSError):
|
||||
p.unlink()
|
||||
return True
|
||||
|
||||
|
||||
def flatten_groups(groups):
|
||||
for obj in groups:
|
||||
if is_iterable_notstr(obj):
|
||||
yield from obj
|
||||
else:
|
||||
yield obj
|
||||
def clamp(n, smallest, largest):  # mypy doesn't understand types for this
    """Limit ``n`` to the range from ``smallest`` to ``largest``."""
    capped = min(n, largest)
    return max(smallest, capped)
|
||||
|
||||
|
||||
def remove_all_log_handlers(logger):
    """Detach every handler from ``logger`` and close it.

    Usually used in a child process. Closing releases any resources a
    handler holds open.
    """
    # Iterate over a snapshot, since removeHandler mutates logger.handlers
    for attached in tuple(logger.handlers):
        logger.removeHandler(attached)
        attached.close()
|
||||
|
||||
|
||||
def pikepdf_enable_mmap():
    """Do nothing; enabling pikepdf mmap access is intentionally disabled.

    The code that used to enable mmap is kept below, commented out, along
    with the reason it was turned off.
    """
    # try:
    #     if pikepdf._qpdf.set_access_default_mmap(True):
    #         log.debug("pikepdf mmap enabled")
    # except AttributeError:
    #     log.debug("pikepdf mmap not available")
    # We found a race condition probably related to pybind issue #2252 that can
    # cause a crash. For now, disable pikepdf mmap to be on the safe side.
    # Fix is not in pybind11 2.6.0
    # log.debug("pikepdf mmap disabled")
    return
|
||||
|
||||
|
||||
def deprecated(func):
|
||||
"""Warn that function is deprecated"""
|
||||
"""Warn that function is deprecated."""
|
||||
|
||||
@wraps(func)
|
||||
def new_func(*args, **kwargs):
|
||||
|
||||
@@ -29,15 +29,28 @@
|
||||
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
from collections import namedtuple
|
||||
from itertools import chain
|
||||
from math import atan, cos, sin
|
||||
from pathlib import Path
|
||||
from typing import Any, NamedTuple, Optional, Tuple, Union
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from reportlab.lib.colors import black, cyan, magenta, red
|
||||
from reportlab.lib.units import inch
|
||||
from reportlab.pdfgen.canvas import Canvas
|
||||
|
||||
Rect = namedtuple('Rect', ['x1', 'y1', 'x2', 'y2'])
|
||||
Element = ElementTree.Element
|
||||
|
||||
|
||||
class Rect(NamedTuple):  # pylint: disable=inherit-non-class
    """A rectangle for managing PDF coordinates."""

    # Two corner points of the rectangle. Presumably (x1, y1) and (x2, y2)
    # are opposite corners, so x2 - x1 is the width and y2 - y1 the height
    # - TODO confirm against callers.
    x1: Any
    y1: Any
    x2: Any
    y2: Any
|
||||
|
||||
|
||||
class HocrTransformError(Exception):
|
||||
@@ -64,9 +77,9 @@ class HocrTransform:
|
||||
{'ff': 'ff', 'ffi': 'ffi', 'ffl': 'ffl', 'fi': 'fi', 'fl': 'fl'}
|
||||
)
|
||||
|
||||
def __init__(self, hocrFileName, dpi):
|
||||
def __init__(self, *, hocr_filename: Union[str, Path], dpi: float):
|
||||
self.dpi = dpi
|
||||
self.hocr = ElementTree.parse(hocrFileName)
|
||||
self.hocr = ElementTree.parse(os.fspath(hocr_filename))
|
||||
|
||||
# if the hOCR file has a namespace, ElementTree requires its use to
|
||||
# find elements
|
||||
@@ -77,7 +90,7 @@ class HocrTransform:
|
||||
|
||||
# get dimension in pt (not pixel!!!!) of the OCRed image
|
||||
self.width, self.height = None, None
|
||||
for div in self.hocr.findall(".//%sdiv[@class='ocr_page']" % (self.xmlns)):
|
||||
for div in self.hocr.findall(self._child_xpath('div', 'ocr_page')):
|
||||
coords = self.element_coordinates(div)
|
||||
pt_coords = self.pt_from_pixel(coords)
|
||||
self.width = pt_coords.x2 - pt_coords.x1
|
||||
@@ -88,38 +101,38 @@ class HocrTransform:
|
||||
if self.width is None or self.height is None:
|
||||
raise HocrTransformError("hocr file is missing page dimensions")
|
||||
|
||||
def __str__(self):
|
||||
def __str__(self): # pragma: no cover
|
||||
"""
|
||||
Return the textual content of the HTML body
|
||||
"""
|
||||
if self.hocr is None:
|
||||
return ''
|
||||
body = self.hocr.find(".//%sbody" % (self.xmlns))
|
||||
body = self.hocr.find(self._child_xpath('body'))
|
||||
if body:
|
||||
return self._get_element_text(body)
|
||||
else:
|
||||
return ''
|
||||
|
||||
def _get_element_text(self, element):
|
||||
def _get_element_text(self, element: Element):
|
||||
"""
|
||||
Return the textual content of the element and its children
|
||||
"""
|
||||
text = ''
|
||||
if element.text is not None:
|
||||
text += element.text
|
||||
for child in element.getchildren():
|
||||
for child in element:
|
||||
text += self._get_element_text(child)
|
||||
if element.tail is not None:
|
||||
text += element.tail
|
||||
return text
|
||||
|
||||
@classmethod
|
||||
def element_coordinates(cls, element):
|
||||
def element_coordinates(cls, element: Element) -> Rect:
|
||||
"""
|
||||
Returns a tuple containing the coordinates of the bounding box around
|
||||
an element
|
||||
"""
|
||||
out = (0, 0, 0, 0)
|
||||
out = Rect._make(0 for _ in range(4))
|
||||
if 'title' in element.attrib:
|
||||
matches = cls.box_pattern.search(element.attrib['title'])
|
||||
if matches:
|
||||
@@ -128,7 +141,7 @@ class HocrTransform:
|
||||
return out
|
||||
|
||||
@classmethod
|
||||
def baseline(cls, element):
|
||||
def baseline(cls, element: Element) -> Tuple[float, float]:
|
||||
"""
|
||||
Returns a tuple containing the baseline slope and intercept.
|
||||
"""
|
||||
@@ -136,32 +149,47 @@ class HocrTransform:
|
||||
matches = cls.baseline_pattern.search(element.attrib['title'])
|
||||
if matches:
|
||||
return float(matches.group(1)), int(matches.group(2))
|
||||
return (0, 0)
|
||||
return (0.0, 0.0)
|
||||
|
||||
def pt_from_pixel(self, pxl):
|
||||
def pt_from_pixel(self, pxl) -> Rect:
|
||||
"""
|
||||
Returns the quantity in PDF units (pt) given quantity in pixels
|
||||
"""
|
||||
return Rect._make((c / self.dpi * inch) for c in pxl)
|
||||
|
||||
def _child_xpath(self, html_tag: str, html_class: Optional[str] = None) -> str:
|
||||
xpath = f".//{self.xmlns}{html_tag}"
|
||||
if html_class:
|
||||
xpath += f"[@class='{html_class}']"
|
||||
return xpath
|
||||
|
||||
@classmethod
|
||||
def replace_unsupported_chars(cls, s):
|
||||
def replace_unsupported_chars(cls, s: str) -> str:
|
||||
"""
|
||||
Given an input string, returns the corresponding string that:
|
||||
- is available in the helvetica facetype
|
||||
- does not contain any ligature (to allow easy search in the PDF file)
|
||||
* is available in the Helvetica facetype
|
||||
* does not contain any ligature (to allow easy search in the PDF file)
|
||||
"""
|
||||
return s.translate(cls.ligatures)
|
||||
|
||||
def topdown_position(self, element):
|
||||
pxl_line_coords = self.element_coordinates(element)
|
||||
line_box = self.pt_from_pixel(pxl_line_coords)
|
||||
# Coordinates here are still in the hocr coordinate system, so 0 on the y axis
|
||||
# is the top of the page and increasing values of y will move towards the
|
||||
# bottom of the page.
|
||||
return line_box.y2
|
||||
|
||||
def to_pdf(
|
||||
self,
|
||||
outFileName,
|
||||
imageFileName=None,
|
||||
showBoundingboxes=False,
|
||||
fontname="Helvetica",
|
||||
invisibleText=False,
|
||||
interwordSpaces=False,
|
||||
):
|
||||
*,
|
||||
out_filename: Path,
|
||||
image_filename: Optional[Path] = None,
|
||||
show_bounding_boxes: bool = False,
|
||||
fontname: str = "Helvetica",
|
||||
invisible_text: bool = False,
|
||||
interword_spaces: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
Creates a PDF file with an image superimposed on top of the text.
|
||||
Text is positioned according to the bounding box of the lines in
|
||||
@@ -169,19 +197,36 @@ class HocrTransform:
|
||||
The image need not be identical to the image used to create the hOCR
|
||||
file.
|
||||
It can have a lower resolution, different color mode, etc.
|
||||
|
||||
Arguments:
|
||||
out_filename: Path of PDF to write.
|
||||
image_filename: Image to use for this file. If omitted, the OCR text
|
||||
is shown.
|
||||
show_bounding_boxes: Show bounding boxes around various text regions,
|
||||
for debugging.
|
||||
fontname: Name of font to use.
|
||||
invisible_text: If True, text is rendered invisible so that is
|
||||
selectable but never drawn. If False, text is visible and may
|
||||
be seen if the image is skipped or deleted in Acrobat.
|
||||
interword_spaces: If True, insert spaces between words rather than
|
||||
drawing each word without spaces. Generally this improves text
|
||||
extraction.
|
||||
"""
|
||||
# create the PDF file
|
||||
# page size in points (1/72 in.)
|
||||
pdf = Canvas(outFileName, pagesize=(self.width, self.height), pageCompression=1)
|
||||
pdf = Canvas(
|
||||
os.fspath(out_filename),
|
||||
pagesize=(self.width, self.height),
|
||||
pageCompression=1,
|
||||
)
|
||||
|
||||
# draw bounding box for each paragraph
|
||||
# light blue for bounding box of paragraph
|
||||
pdf.setStrokeColorRGB(0, 1, 1)
|
||||
pdf.setStrokeColor(cyan)
|
||||
# light blue for bounding box of paragraph
|
||||
pdf.setFillColorRGB(0, 1, 1)
|
||||
pdf.setFillColor(cyan)
|
||||
pdf.setLineWidth(0) # no line for bounding box
|
||||
for elem in self.hocr.findall(".//%sp[@class='%s']" % (self.xmlns, "ocr_par")):
|
||||
|
||||
for elem in self.hocr.iterfind(self._child_xpath('p', 'ocr_par')):
|
||||
elemtxt = self._get_element_text(elem).rstrip()
|
||||
if len(elemtxt) == 0:
|
||||
continue
|
||||
@@ -190,14 +235,19 @@ class HocrTransform:
|
||||
pt = self.pt_from_pixel(pxl_coords)
|
||||
|
||||
# draw the bbox border
|
||||
if showBoundingboxes:
|
||||
if show_bounding_boxes: # pragma: no cover
|
||||
pdf.rect(
|
||||
pt.x1, self.height - pt.y2, pt.x2 - pt.x1, pt.y2 - pt.y1, fill=1
|
||||
)
|
||||
|
||||
found_lines = False
|
||||
for line in self.hocr.findall(
|
||||
".//%sspan[@class='%s']" % (self.xmlns, "ocr_line")
|
||||
for line in sorted(
|
||||
chain(
|
||||
self.hocr.iterfind(self._child_xpath('span', 'ocr_header')),
|
||||
self.hocr.iterfind(self._child_xpath('span', 'ocr_line')),
|
||||
self.hocr.iterfind(self._child_xpath('span', 'ocr_textfloat')),
|
||||
),
|
||||
key=self.topdown_position,
|
||||
):
|
||||
found_lines = True
|
||||
self._do_line(
|
||||
@@ -205,45 +255,49 @@ class HocrTransform:
|
||||
line,
|
||||
"ocrx_word",
|
||||
fontname,
|
||||
invisibleText,
|
||||
interwordSpaces,
|
||||
showBoundingboxes,
|
||||
invisible_text,
|
||||
interword_spaces,
|
||||
show_bounding_boxes,
|
||||
)
|
||||
|
||||
if not found_lines:
|
||||
# Tesseract did not report any lines (just words)
|
||||
root = self.hocr.find(".//%sdiv[@class='%s']" % (self.xmlns, "ocr_page"))
|
||||
root = self.hocr.find(self._child_xpath('div', 'ocr_page'))
|
||||
self._do_line(
|
||||
pdf,
|
||||
root,
|
||||
"ocrx_word",
|
||||
fontname,
|
||||
invisibleText,
|
||||
interwordSpaces,
|
||||
showBoundingboxes,
|
||||
invisible_text,
|
||||
interword_spaces,
|
||||
show_bounding_boxes,
|
||||
)
|
||||
# put the image on the page, scaled to fill the page
|
||||
if imageFileName is not None:
|
||||
pdf.drawImage(imageFileName, 0, 0, width=self.width, height=self.height)
|
||||
if image_filename is not None:
|
||||
pdf.drawImage(
|
||||
os.fspath(image_filename), 0, 0, width=self.width, height=self.height
|
||||
)
|
||||
|
||||
# finish up the page and save it
|
||||
pdf.showPage()
|
||||
pdf.save()
|
||||
|
||||
@classmethod
|
||||
def polyval(cls, poly, x):
|
||||
def polyval(cls, poly, x): # pragma: no cover
|
||||
return x * poly[0] + poly[1]
|
||||
|
||||
def _do_line(
|
||||
self,
|
||||
pdf,
|
||||
line,
|
||||
elemclass,
|
||||
fontname,
|
||||
invisibleText,
|
||||
interwordSpaces,
|
||||
showBoundingboxes,
|
||||
pdf: Canvas,
|
||||
line: Optional[Element],
|
||||
elemclass: str,
|
||||
fontname: str,
|
||||
invisible_text: bool,
|
||||
interword_spaces: bool,
|
||||
show_bounding_boxes: bool,
|
||||
):
|
||||
if not line:
|
||||
return
|
||||
pxl_line_coords = self.element_coordinates(line)
|
||||
line_box = self.pt_from_pixel(pxl_line_coords)
|
||||
line_height = line_box.y2 - line_box.y1
|
||||
@@ -262,17 +316,17 @@ class HocrTransform:
|
||||
# on a sloped baseline and the edge of the bounding box.
|
||||
fontsize = (line_height - abs(intercept)) / cos_a
|
||||
text.setFont(fontname, fontsize)
|
||||
if invisibleText:
|
||||
if invisible_text:
|
||||
text.setTextRenderMode(3) # Invisible (indicates OCR text)
|
||||
|
||||
# Intercept is normally negative, so this places it above the bottom
|
||||
# of the line box
|
||||
baseline_y2 = self.height - (line_box.y2 + intercept)
|
||||
|
||||
if showBoundingboxes:
|
||||
if show_bounding_boxes: # pragma: no cover
|
||||
# draw the baseline in magenta, dashed
|
||||
pdf.setDash()
|
||||
pdf.setStrokeColorRGB(0.95, 0.65, 0.95)
|
||||
pdf.setStrokeColor(magenta)
|
||||
pdf.setLineWidth(0.5)
|
||||
# negate slope because it is defined as a rise/run in pixel
|
||||
# coordinates and page coordinates have the y axis flipped
|
||||
@@ -284,12 +338,12 @@ class HocrTransform:
|
||||
)
|
||||
# red, dashed outline for bounding box of word/line
|
||||
pdf.setDash(6, 3)
|
||||
pdf.setStrokeColorRGB(1, 0, 0)
|
||||
pdf.setStrokeColor(red)
|
||||
|
||||
text.setTextTransform(cos_a, -sin_a, sin_a, cos_a, line_box.x1, baseline_y2)
|
||||
pdf.setFillColorRGB(0, 0, 0) # text in black
|
||||
pdf.setFillColor(black) # text in black
|
||||
|
||||
elements = line.findall(".//%sspan[@class='%s']" % (self.xmlns, elemclass))
|
||||
elements = line.findall(self._child_xpath('span', elemclass))
|
||||
for elem in elements:
|
||||
elemtxt = self._get_element_text(elem).strip()
|
||||
elemtxt = self.replace_unsupported_chars(elemtxt)
|
||||
@@ -298,7 +352,7 @@ class HocrTransform:
|
||||
|
||||
pxl_coords = self.element_coordinates(elem)
|
||||
box = self.pt_from_pixel(pxl_coords)
|
||||
if interwordSpaces:
|
||||
if interword_spaces:
|
||||
# if `--interword-spaces` is true, append a space
|
||||
# to the end of each text element to allow simpler PDF viewers
|
||||
# such as PDF.js to better recognize words in search and copy
|
||||
@@ -318,7 +372,7 @@ class HocrTransform:
|
||||
font_width = pdf.stringWidth(elemtxt, fontname, fontsize)
|
||||
|
||||
# draw the bbox border
|
||||
if showBoundingboxes:
|
||||
if show_bounding_boxes: # pragma: no cover
|
||||
pdf.rect(
|
||||
box.x1, self.height - line_box.y2, box_width, line_height, fill=0
|
||||
)
|
||||
@@ -380,10 +434,10 @@ if __name__ == "__main__":
|
||||
parser.add_argument('outputfile', help='Path to the PDF file to be generated')
|
||||
args = parser.parse_args()
|
||||
|
||||
hocr = HocrTransform(args.hocrfile, args.resolution)
|
||||
hocr = HocrTransform(hocr_filename=args.hocrfile, dpi=args.resolution)
|
||||
hocr.to_pdf(
|
||||
args.outputfile,
|
||||
args.image,
|
||||
args.boundingboxes,
|
||||
interwordSpaces=args.interword_spaces,
|
||||
out_filename=args.outputfile,
|
||||
image_filename=args.image,
|
||||
show_bounding_boxes=args.boundingboxes,
|
||||
interword_spaces=args.interword_spaces,
|
||||
)
|
||||
|
||||
@@ -3,20 +3,10 @@
|
||||
#
|
||||
# © 2013-16: jbarlow83 from Github (https://github.com/jbarlow83)
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#
|
||||
# Python FFI wrapper for Leptonica library
|
||||
|
||||
@@ -24,30 +14,83 @@ import argparse
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import warnings
|
||||
import threading
|
||||
from collections import deque
|
||||
from collections.abc import Sequence
|
||||
from contextlib import suppress
|
||||
from ctypes.util import find_library
|
||||
from functools import lru_cache
|
||||
from io import BytesIO
|
||||
from io import BytesIO, UnsupportedOperation
|
||||
from os import fspath
|
||||
from tempfile import TemporaryFile
|
||||
from warnings import warn
|
||||
|
||||
from .lib._leptonica import ffi
|
||||
from ocrmypdf.exceptions import MissingDependencyError
|
||||
from ocrmypdf.lib._leptonica import ffi
|
||||
|
||||
# pylint: disable=protected-access
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
lept = ffi.dlopen(find_library('lept'))
|
||||
lept.setMsgSeverity(lept.L_SEVERITY_WARNING)
|
||||
if os.name == 'nt':
|
||||
from ocrmypdf.subprocess._windows import shim_env_path
|
||||
|
||||
libname = 'liblept-5'
|
||||
os.environ['PATH'] = shim_env_path()
|
||||
else:
|
||||
libname = 'lept'
|
||||
_libpath = find_library(libname)
|
||||
if not _libpath:
|
||||
raise MissingDependencyError(
|
||||
"""
|
||||
---------------------------------------------------------------------
|
||||
This error normally occurs when ocrmypdf can't find the Leptonica
|
||||
library, which is usually installed with Tesseract OCR. It could be that
|
||||
Tesseract is not installed properly, we can't find the installation
|
||||
on your system PATH environment variable.
|
||||
|
||||
The library we are looking for is usually called:
|
||||
liblept-5.dll (Windows)
|
||||
liblept*.dylib (macOS)
|
||||
liblept*.so (Linux/BSD)
|
||||
|
||||
Please review our installation procedures to find a solution:
|
||||
https://ocrmypdf.readthedocs.io/en/latest/installation.html
|
||||
---------------------------------------------------------------------
|
||||
"""
|
||||
)
|
||||
if os.name == 'nt':
|
||||
# On Windows, recent versions of libpng require zlib. We have to make sure
|
||||
# the zlib version being loaded is the same one that libpng was built with.
|
||||
# This tries to import zlib from Tesseract's installation folder, falling back
|
||||
# to find_library() if liblept is being loaded from somewhere else.
|
||||
# Loading zlib from other places could cause a version mismatch
|
||||
_zlib_path = os.path.join(os.path.dirname(_libpath), 'zlib1.dll')
|
||||
if not os.path.exists(_zlib_path):
|
||||
_zlib_path = find_library('zlib')
|
||||
try:
|
||||
zlib = ffi.dlopen(_zlib_path)
|
||||
except ffi.error as e:
|
||||
raise MissingDependencyError(
|
||||
"""
|
||||
Could not load the zlib library. It could be that Tesseract is not installed properly,
|
||||
we can't find the installation on your system PATH environment variable.
|
||||
"""
|
||||
) from e
|
||||
try:
|
||||
lept = ffi.dlopen(_libpath)
|
||||
lept.setMsgSeverity(lept.L_SEVERITY_WARNING)
|
||||
except ffi.error as e:
|
||||
raise MissingDependencyError(
|
||||
f"Leptonica library found at {_libpath}, but we could not access it"
|
||||
) from e
|
||||
|
||||
|
||||
class _LeptonicaErrorTrap:
|
||||
class _LeptonicaErrorTrap_Redirect:
|
||||
"""
|
||||
Context manager to trap errors reported by Leptonica.
|
||||
Context manager to trap errors reported by Leptonica < 1.79 or on Apple Silicon.
|
||||
|
||||
Leptonica's error return codes don't provide much informatino about what
|
||||
Leptonica's error return codes don't provide much information about what
|
||||
went wrong. Leptonica does, however, write more detailed errors to stderr
|
||||
(provided this is not disabled at compile time). The Leptonica source
|
||||
code is very consistent in its use of macros to generate errors.
|
||||
@@ -58,20 +101,23 @@ class _LeptonicaErrorTrap:
|
||||
|
||||
"""
|
||||
|
||||
leptonica_lock = threading.Lock()
|
||||
|
||||
def __init__(self):
|
||||
self.tmpfile = None
|
||||
self.copy_of_stderr = -1
|
||||
self.no_stderr = False
|
||||
|
||||
def __enter__(self):
|
||||
from io import UnsupportedOperation
|
||||
|
||||
self.tmpfile = TemporaryFile()
|
||||
|
||||
# Save the old stderr, and redirect stderr to temporary file
|
||||
with suppress(AttributeError):
|
||||
sys.stderr.flush()
|
||||
self.leptonica_lock.acquire()
|
||||
try:
|
||||
# It would make sense to do sys.stderr.flush() here, but that can deadlock
|
||||
# due to https://bugs.python.org/issue6721. So don't flush. Pretend
|
||||
# there's nothing important in sys.stderr. If the user cared they would
|
||||
# be using Leptonica 1.79 or later anyway to avoid this mess.
|
||||
self.copy_of_stderr = os.dup(sys.stderr.fileno())
|
||||
os.dup2(self.tmpfile.fileno(), sys.stderr.fileno(), inheritable=False)
|
||||
except AttributeError:
|
||||
@@ -83,7 +129,10 @@ class _LeptonicaErrorTrap:
|
||||
os.dup2(self.tmpfile.fileno(), 2, inheritable=False)
|
||||
except UnsupportedOperation:
|
||||
self.copy_of_stderr = None
|
||||
return
|
||||
except Exception:
|
||||
self.leptonica_lock.release()
|
||||
raise
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, traceback):
|
||||
# Restore old stderr
|
||||
@@ -100,6 +149,8 @@ class _LeptonicaErrorTrap:
|
||||
self.tmpfile.seek(0) # Cursor will be at end, so move back to beginning
|
||||
leptonica_output = self.tmpfile.read().decode(errors='replace')
|
||||
self.tmpfile.close()
|
||||
self.leptonica_lock.release()
|
||||
|
||||
# If there are Python errors, record them
|
||||
if exc_type:
|
||||
logger.warning(leptonica_output)
|
||||
@@ -117,6 +168,70 @@ class _LeptonicaErrorTrap:
|
||||
return False
|
||||
|
||||
|
||||
tls = threading.local()
|
||||
tls.trap = None
|
||||
|
||||
|
||||
class _LeptonicaErrorTrap_Queue:
    """Context manager that traps Leptonica messages through a queue.

    While the context is active, this object's queue is published as
    ``tls.trap`` for the current thread. The stderr handler callback
    appends each message it receives to ``tls.trap``, so on exit the queue
    holds everything Leptonica reported inside the ``with`` block on this
    thread. Selected messages are then translated into Python exceptions.
    """

    def __init__(self):
        self.queue = deque()

    def __enter__(self):
        """Start collecting messages for the current thread."""
        self.queue.clear()
        tls.trap = self.queue
        # Return self so ``with ... as trap:`` works, matching the
        # redirect-based trap; callers that ignore the value are unaffected.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Stop collecting and convert reported errors into exceptions.

        Raises:
            FileNotFoundError: Leptonica reported "image file not found".
            LeptonicaIOError: Leptonica reported that pixWrite could not
                open its stream.
            IndexError: Leptonica reported "index not valid".
            LeptonicaError: any other message containing "Error", except
                the low-resolution background-map case, which is only
                logged as a warning.

        Returns:
            False, so an exception raised inside the ``with`` block is
            never suppressed.
        """
        tls.trap = None
        output = ''.join(self.queue)
        self.queue.clear()

        # If there are Python errors, record them
        if exc_type:
            logger.warning(output)

        if 'Error' in output:
            if 'image file not found' in output:
                raise FileNotFoundError()
            elif 'pixWrite: stream not opened' in output:
                raise LeptonicaIOError()
            elif 'index not valid' in output:
                raise IndexError()
            elif 'pixGetInvBackgroundMap: w and h must be >= 5' in output:
                # Not fatal: report it and let the caller carry on.
                logger.warning(
                    "Leptonica attempted to remove background from a low resolution - "
                    "you may want to review in a PDF viewer"
                )
            else:
                raise LeptonicaError(output)
        return False
|
||||
|
||||
|
||||
# Choose the error-trap implementation. If Leptonica lets us install a
# stderr handler callback, messages are routed through Python logging and
# the per-thread queue; otherwise fall back to redirecting stderr.
try:

    @ffi.callback("void(char *)")
    def _stderr_handler(cstr):
        """Receive one Leptonica message; log it and queue it if trapped."""
        msg = ffi.string(cstr).decode(errors='replace')
        if msg.startswith("Error"):
            logger.error(msg)
        elif msg.startswith("Warning"):
            logger.warning(msg)
        else:
            logger.debug(msg)
        # threading.local attributes are per-thread: the module-level
        # ``tls.trap = None`` only exists on the importing thread. Use
        # getattr so a message arriving on another thread that has not yet
        # entered a trap does not raise AttributeError inside the callback.
        trap = getattr(tls, 'trap', None)
        if trap is not None:
            trap.append(msg)
        return

    lept.leptSetStderrHandler(_stderr_handler)
except (ffi.error, MemoryError):
    # Pre-1.79 Leptonica does not have leptSetStderrHandler
    # And some platforms, notably Apple ARM 64, do not allow the write+execute
    # memory needed to set up the callback function.
    _LeptonicaErrorTrap = _LeptonicaErrorTrap_Redirect
else:
    # Leptonica 1.79 and later provide leptSetStderrHandler
    _LeptonicaErrorTrap = _LeptonicaErrorTrap_Queue
|
||||
|
||||
|
||||
class LeptonicaError(Exception):
|
||||
pass
|
||||
|
||||
@@ -282,7 +397,7 @@ class Pix(LeptonicaObject):
|
||||
|
||||
@classmethod
|
||||
def read(cls, path):
|
||||
warnings.warn('Use Pix.open() instead', DeprecationWarning)
|
||||
warn('Use Pix.open() instead', DeprecationWarning)
|
||||
return cls.open(path)
|
||||
|
||||
@classmethod
|
||||
@@ -292,9 +407,11 @@ class Pix(LeptonicaObject):
|
||||
Leptonica can load TIFF, PNM (PBM, PGM, PPM), PNG, and JPEG. If
|
||||
loading fails then the object will wrap a C null pointer.
|
||||
"""
|
||||
filename = fspath(path)
|
||||
with _LeptonicaErrorTrap():
|
||||
return cls(lept.pixRead(os.fsencode(filename)))
|
||||
with open(path, 'rb') as py_file:
|
||||
data = py_file.read()
|
||||
buffer = ffi.from_buffer(data)
|
||||
with _LeptonicaErrorTrap():
|
||||
return cls(lept.pixReadMem(buffer, len(buffer)))
|
||||
|
||||
def write_implied_format(self, path, jpeg_quality=0, jpeg_progressive=0):
|
||||
"""Write pix to the filename, with the extension indicating format.
|
||||
@@ -302,14 +419,22 @@ class Pix(LeptonicaObject):
|
||||
jpeg_quality -- quality (iff JPEG; 1 - 100, 0 for default)
|
||||
jpeg_progressive -- (iff JPEG; 0 for baseline seq., 1 for progressive)
|
||||
"""
|
||||
filename = fspath(path)
|
||||
with _LeptonicaErrorTrap():
|
||||
lept.pixWriteImpliedFormat(
|
||||
os.fsencode(filename), self._cdata, jpeg_quality, jpeg_progressive
|
||||
)
|
||||
lept_format = lept.getImpliedFileFormat(os.fsencode(path))
|
||||
with open(path, 'wb') as py_file:
|
||||
data = ffi.new('l_uint8 **pdata')
|
||||
size = ffi.new('size_t *psize')
|
||||
with _LeptonicaErrorTrap():
|
||||
if lept_format == lept.L_JPEG_ENCODE:
|
||||
lept.pixWriteMemJpeg(
|
||||
data, size, self._cdata, jpeg_quality, jpeg_progressive
|
||||
)
|
||||
else:
|
||||
lept.pixWriteMem(data, size, self._cdata, lept_format)
|
||||
buffer = ffi.buffer(data[0], size[0])
|
||||
py_file.write(buffer)
|
||||
|
||||
@classmethod
|
||||
def frompil(self, pillow_image):
|
||||
def frompil(cls, pillow_image):
|
||||
"""Create a copy of a PIL.Image from this Pix"""
|
||||
bio = BytesIO()
|
||||
pillow_image.save(bio, format='png', compress_level=1)
|
||||
@@ -321,7 +446,7 @@ class Pix(LeptonicaObject):
|
||||
|
||||
def topil(self):
|
||||
"""Returns a PIL.Image version of this Pix"""
|
||||
from PIL import Image
|
||||
from PIL import Image # pylint: disable=import-outside-toplevel
|
||||
|
||||
# Leptonica manages data in words, so it implicitly does an endian
|
||||
# swap. Tell Pillow about this when it reads the data.
|
||||
@@ -492,27 +617,15 @@ class Pix(LeptonicaObject):
|
||||
)
|
||||
return Pix(thresh_pix)
|
||||
|
||||
def crop_to_foreground(
|
||||
self,
|
||||
threshold=128,
|
||||
mindist=70,
|
||||
erasedist=30,
|
||||
pagenum=0,
|
||||
showmorph=0,
|
||||
display=0,
|
||||
pdfdir=ffi.NULL,
|
||||
):
|
||||
def crop_to_foreground(self, threshold=128, mindist=70, erasedist=30, showmorph=0):
|
||||
if get_leptonica_version() < 'leptonica-1.76':
|
||||
# Leptonica 1.76 changed the API for pixFindPageForeground; we don't
|
||||
# support the old version
|
||||
raise LeptonicaError("Not available in this version of Leptonica")
|
||||
with _LeptonicaErrorTrap():
|
||||
cropbox = Box(
|
||||
lept.pixFindPageForeground(
|
||||
self._cdata,
|
||||
threshold,
|
||||
mindist,
|
||||
erasedist,
|
||||
pagenum,
|
||||
showmorph,
|
||||
display,
|
||||
pdfdir,
|
||||
self._cdata, threshold, mindist, erasedist, showmorph, ffi.NULL
|
||||
)
|
||||
)
|
||||
|
||||
@@ -549,6 +662,9 @@ class Pix(LeptonicaObject):
|
||||
bg_val=200,
|
||||
smooth_kernel=(2, 1),
|
||||
):
|
||||
if self.width < tile_size[0] or self.height < tile_size[1]:
|
||||
logger.info("Skipped pixMaskedThreshOnBackgroundNorm on small image")
|
||||
return self
|
||||
# Background norm doesn't work on color mapped Pix, so remove colormap
|
||||
target_pix = self.remove_colormap(lept.REMOVE_CMAP_BASED_ON_SRC)
|
||||
with _LeptonicaErrorTrap():
|
||||
@@ -827,6 +943,8 @@ def get_leptonica_version():
|
||||
Caveat: Leptonica expects the caller to free this memory. We don't,
|
||||
since that would involve binding to libc to access libc.free(),
|
||||
a pointless effort to reclaim 100 bytes of memory.
|
||||
|
||||
Reminder that this returns "leptonica-1.xx" or "leptonica-1.yy.0".
|
||||
"""
|
||||
return ffi.string(lept.getLeptonicaVersion()).decode()
|
||||
|
||||
|
||||
@@ -1,18 +1,8 @@
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
"""Bindings to external libraries"""
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -1,20 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
# © 2017 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from cffi import FFI
|
||||
|
||||
@@ -74,6 +66,17 @@ struct Pixa
|
||||
};
|
||||
typedef struct Pixa PIXA;
|
||||
|
||||
/*! Array of compressed pix */
|
||||
struct PixaComp
|
||||
{
|
||||
l_int32 n; /*!< number of PixComp in ptr array */
|
||||
l_int32 nalloc; /*!< number of PixComp ptrs allocated */
|
||||
l_int32 offset; /*!< indexing offset into ptr array */
|
||||
struct PixComp **pixc; /*!< the array of ptrs to PixComp */
|
||||
struct Boxa *boxa; /*!< array of boxes */
|
||||
};
|
||||
typedef struct PixaComp PIXAC;
|
||||
|
||||
struct Box
|
||||
{
|
||||
l_int32 x;
|
||||
@@ -210,9 +213,15 @@ ffibuilder.cdef(
|
||||
"""
|
||||
PIX * pixRead ( const char *filename );
|
||||
PIX * pixReadMem ( const l_uint8 *data, size_t size );
|
||||
PIX * pixReadStream ( FILE *fp, l_int32 hint );
|
||||
PIX * pixScale ( PIX *pixs, l_float32 scalex, l_float32 scaley );
|
||||
l_int32 pixFindSkew ( PIX *pixs, l_float32 *pangle, l_float32 *pconf );
|
||||
l_int32 pixWriteImpliedFormat ( const char *filename, PIX *pix, l_int32 quality, l_int32 progressive );
|
||||
l_int32 getImpliedFileFormat ( const char *filename );
|
||||
l_ok pixWriteStream ( FILE *fp, PIX *pix, l_int32 format );
|
||||
l_ok pixWriteStreamJpeg ( FILE *fp, PIX *pixs, l_int32 quality, l_int32 progressive );
|
||||
l_ok pixWriteMem ( l_uint8 **pdata, size_t *psize, PIX *pix, l_int32 format );
|
||||
l_ok pixWriteMemJpeg ( l_uint8 **pdata, size_t *psize, PIX *pix, l_int32 quality, l_int32 progressive );
|
||||
l_int32
|
||||
pixWriteMemPng(l_uint8 **pdata,
|
||||
size_t *psize,
|
||||
@@ -294,14 +303,12 @@ pixCleanBackgroundToWhite(PIX *pixs,
|
||||
l_int32 whiteval);
|
||||
|
||||
BOX *
|
||||
pixFindPageForeground(PIX *pixs,
|
||||
l_int32 threshold,
|
||||
l_int32 mindist,
|
||||
l_int32 erasedist,
|
||||
l_int32 pagenum,
|
||||
l_int32 showmorph,
|
||||
l_int32 display,
|
||||
const char *pdfdir);
|
||||
pixFindPageForeground ( PIX *pixs,
|
||||
l_int32 threshold,
|
||||
l_int32 mindist,
|
||||
l_int32 erasedist,
|
||||
l_int32 showmorph,
|
||||
PIXAC *pixac );
|
||||
|
||||
PIX *
|
||||
pixClipRectangle(PIX *pixs,
|
||||
@@ -414,7 +421,10 @@ pixExtractBarcodes(PIX *pixs,
|
||||
l_int32 debugflag);
|
||||
|
||||
BOXA *
|
||||
pixLocateBarcodes ( PIX *pixs, l_int32 thresh, PIX **ppixb, PIX **ppixm );
|
||||
pixLocateBarcodes ( PIX *pixs,
|
||||
l_int32 thresh,
|
||||
PIX **ppixb,
|
||||
PIX **ppixm );
|
||||
|
||||
SARRAY *
|
||||
pixReadBarcodes(PIXA *pixa,
|
||||
@@ -423,6 +433,12 @@ pixReadBarcodes(PIXA *pixa,
|
||||
SARRAY **psaw,
|
||||
l_int32 debugflag);
|
||||
|
||||
PIX *
|
||||
pixGenHalftoneMask(PIX *pixs,
|
||||
PIX **ppixtext,
|
||||
l_int32 *phtfound,
|
||||
PIXA *pixadb);
|
||||
|
||||
l_int32
|
||||
l_generateCIDataForPdf(const char *fname,
|
||||
PIX *pix,
|
||||
@@ -483,6 +499,8 @@ void selDestroy ( SEL **psel );
|
||||
l_int32
|
||||
setMsgSeverity(l_int32 newsev);
|
||||
|
||||
void
|
||||
leptSetStderrHandler(void (*handler)(const char *));
|
||||
"""
|
||||
)
|
||||
|
||||
@@ -491,3 +509,8 @@ ffibuilder.set_source("ocrmypdf.lib._leptonica", None)
|
||||
|
||||
if __name__ == '__main__':
    ffibuilder.compile(verbose=True)
    # NOTE(review): presumably compile() writes the generated module to
    # ocrmypdf/lib/_leptonica.py relative to the current directory, which is
    # wrong for a src/ layout -- confirm against the cffi output path.
    generated = Path('ocrmypdf/lib/_leptonica.py')
    if generated.exists() and Path('src/ocrmypdf').exists():
        # Path.replace() overwrites an existing destination on every
        # platform; Path.rename() raises FileExistsError on Windows when
        # src/ocrmypdf/lib/_leptonica.py is already present.
        generated.replace('src/ocrmypdf/lib/_leptonica.py')
        # Remove the now-empty scaffold directories (rmdir fails if they
        # are not empty, so nothing else is ever deleted).
        Path('ocrmypdf/lib').rmdir()
        Path('ocrmypdf').rmdir()
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user