Dockerfile 4.19 KB

FROM ubuntu:14.04

# Cópia de arquivos do projeto OCR-SERVER
COPY usr/local/bin/ocr /usr/local/bin/ocr
COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr
COPY entrypoint.sh /entrypoint.sh

WORKDIR /tmp

# Instalação dos pacotes pré-requisitos do ocr-server 2
RUN apt-get -y update && \
    apt-get -y install build-essential cmake libtool yasm pkg-config subversion git libgcj14 apt-utils \
    curl libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev \
    zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libnss3-dev \
    wget cabextract xfonts-utils perl automake autoconf-archive libcurl4-gnutls-dev unzip libgcj14 \
    libfile-find-rule-perl libfile-find-rule-perl-perl imagemagick gettext unpaper libtiff5 libpng12-0 \
    libjpeg-turbo8 libpango1.0-0 libcairo2 fontconfig libwebp5 libfontconfig1 libgettextpo0 pkg-config gcc gcj-jdk \
    rsyslog libsys-syslog-perl && \
    apt-get -y clean all

RUN wget -O mscorefonts.deb http://ftp.us.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.4+nmu1_all.deb && \
    dpkg -i mscorefonts.deb && \
    rm mscorefonts.deb

# Instalação do Perl 5.1 e demais módulos
RUN perl -MCPAN -e 'install File::Touch'
RUN perl -MCPAN -e 'install File::Find::Rule;'
RUN perl -MCPAN -e 'install File::Touch;'
RUN perl -MCPAN -e 'install Sys::Syslog;'
RUN perl -MCPAN -e 'install IPC::Open3;'
RUN perl -MCPAN -e 'install IO::Select;'

# Tesseract-ocr 3.05, com dicionários inglês e português
# Bibliotecas para o Tesseract: Leptonica
RUN git clone https://github.com/DanBloomberg/leptonica.git && \
    cd leptonica && ./autobuild && ./configure && make all install && \
    rm -rf ../leptonica

# Bibliotecas para o Tesseract: Libav
RUN git clone https://github.com/libav/libav.git && \
    export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ && \
    cd libav && ./configure --enable-sram && make all install && \
    rm -rf ../libav

# Tesseract 3.05.01
RUN git clone https://github.com/tesseract-ocr/tesseract.git && \
    cd tesseract && ./autogen.sh && ./configure && make all install && \
    rm -rf ../tesseract

RUN wget https://github.com/tesseract-ocr/tessdata/blob/master/eng.traineddata?raw=true -O /usr/local/share/tessdata/eng.traineddata && \
    wget https://github.com/tesseract-ocr/tessdata/blob/master/por.traineddata?raw=true -O /usr/local/share/tessdata/por.traineddata && \
    wget https://github.com/tesseract-ocr/tessdata/blob/master/osd.traineddata?raw=true -O /usr/local/share/tessdata/osd.traineddata

# Poppler 0.56
RUN git clone -b poppler-0.56 https://anongit.freedesktop.org/git/poppler/poppler.git && \
    cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make  all install  && \
    rm -rf ../poppler

# pdftk, versão 2.02 ou superior
RUN wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip && \
    unzip pdftk-2.02-src.zip && rm -f pdftk-2.02-src.zip && \
    cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && \
    rm -rf ../pdftk-2.02-dist

# Ghostscript 9.18 ou superior
RUN wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz && \
    tar xvozf ghostscript-9.18.tar.gz && rm -f ghostscript-9.18.tar.gz && \
    cd ghostscript-9.18 && ls && ./autogen.sh; ./configure && make all install && \
    rm -rf ../ghostscript-9.18

# CPDF Intel OS X v 2.2
RUN git clone https://github.com/coherentgraphics/cpdf-binaries.git && \
    cp cpdf-binaries/Linux-Intel-64bit/cpdf /usr/bin

# Atualização das configurações do ld
RUN ldconfig

RUN useradd -m ocr

RUN chmod +x /usr/local/bin/ocr && \
    chmod +x /etc/init.d/ocr && \
    update-rc.d ocr defaults

RUN mkdir /var/ocr-server/  && \
    mkdir -p /var/ocr-server/Entrada && \
    mkdir -p /var/ocr-server/Saida && \
    mkdir -p /var/ocr-server/Originais_Processados && \
    mkdir -p /var/ocr-server/Erro  && \
    chmod +x /entrypoint.sh

RUN mkdir -p /tmp/ocr_dev/ && \
    mkdir -p /tmp/ocr_dev/Entrada && \
    mkdir -p /tmp/ocr_dev/Saida && \
    mkdir -p /tmp/ocr_dev/Originais_Processados && \
    mkdir -p /tmp/ocr_dev/Erro && \
    chmod -R 777 /tmp/ocr_dev

WORKDIR /

VOLUME /var/ocr-server/

CMD ["bash", "/entrypoint.sh"]