Files
OCRmyPDF/.docker/Dockerfile
2021-11-13 01:57:17 -08:00

81 lines
1.8 KiB
Docker

# OCRmyPDF
#
# Stage "base": the Ubuntu image plus the shared libraries and Python
# interpreter that both later stages start from.
FROM ubuntu:21.04 AS base

# UTF-8 locale and UTC timezone for every stage derived from this one.
ENV LANG=C.UTF-8 \
    TZ=UTC

# Preseed debconf with the Noninteractive frontend so package configuration
# does not prompt during the build.
RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections

# Install the runtime libraries. The apt package lists are removed in the same
# layer that downloads them: deleting them in a later layer would not shrink
# the image. Any stage that installs more packages must run `apt-get update`
# itself.
RUN apt-get update && apt-get install -y --no-install-recommends \
    liblept5 \
    libqpdf-dev \
    python3 \
    zlib1g \
  && rm -rf /var/lib/apt/lists/*
# Stage "builder": toolchain stage that installs pip, compiles jbig2 and
# installs the project (with its test, webservice and watcher extras) under
# /usr/local.
FROM base AS builder

# Run the RUN steps of this stage under bash with pipefail, so that a failed
# download on the left side of a `curl | ...` pipeline fails the build instead
# of being masked by the exit status of the last command.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Note we need leptonica here to build jbig2
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential autoconf automake libtool \
    libleptonica-dev \
    zlib1g-dev \
    python3-dev \
    python3-distutils \
    libffi-dev \
    ca-certificates \
    curl \
    git

# Get the latest pip (Ubuntu version doesn't support manylinux2010)
# -f makes curl exit non-zero on an HTTP error rather than piping the error
# page into python3.
RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python3

# Compile and install jbig2
# Needs libleptonica-dev, zlib1g-dev
# The build runs in a subshell so the working directory of the outer command
# is unchanged when the source tree is removed afterwards.
RUN mkdir jbig2 \
  && curl -fsSL https://github.com/agl/jbig2enc/archive/ea6a40a.tar.gz \
    | tar xz -C jbig2 --strip-components=1 \
  && ( cd jbig2 && ./autogen.sh && ./configure && make && make install ) \
  && rm -rf jbig2

COPY . /app
WORKDIR /app

# The extras specifier is quoted so the shell cannot treat the square brackets
# as a glob pattern.
RUN pip3 install --no-cache-dir '.[test,webservice,watcher]'
# Final image: starts again from "base", adds the runtime packages and then
# copies in what the "builder" stage produced.
FROM base

# Runtime tools: Ghostscript, img2pdf, pngquant, unpaper, X11 client
# libraries, and Tesseract with its chi-sim, deu, eng, fra, por and spa
# language packages. The apt package lists are removed in the same layer.
RUN apt-get update \
  && apt-get install -y --no-install-recommends \
    ghostscript \
    img2pdf \
    libsm6 \
    libxext6 \
    libxrender-dev \
    pngquant \
    tesseract-ocr \
    tesseract-ocr-chi-sim \
    tesseract-ocr-deu \
    tesseract-ocr-eng \
    tesseract-ocr-fra \
    tesseract-ocr-por \
    tesseract-ocr-spa \
    unpaper \
  && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Everything the builder stage installed under /usr/local.
COPY --from=builder /usr/local/lib/ /usr/local/lib/
COPY --from=builder /usr/local/bin/ /usr/local/bin/

# Helper scripts from the project's misc/ directory.
COPY --from=builder /app/misc/webservice.py /app/
COPY --from=builder /app/misc/watcher.py /app/

# Copy minimal project files to get the test suite.
COPY --from=builder /app/setup.cfg /app/setup.py /app/README.md /app/
COPY --from=builder /app/tests /app/tests

# NOTE(review): no USER instruction is set in this stage, so the entrypoint
# runs as the image's default user.
ENTRYPOINT ["/usr/local/bin/ocrmypdf"]