Commit cb6225a47553f56ff1a7086c544b0eccf14f61d9

Authored by Guilherme Andrade Del Cantoni
1 parent bcbd6d9a

Implementação do container Docker para provisionamento do ocr-server

Showing 3 changed files with 89 additions and 28 deletions
Dockerfile
1 -FROM ubuntu  
2 1
  2 +FROM ubuntu:14.04
  3 +
  4 +# Cópia de arquivos do projeto OCR-SERVER
3 COPY usr/local/bin/ocr /usr/local/bin/ocr 5 COPY usr/local/bin/ocr /usr/local/bin/ocr
4 COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr 6 COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr
5 COPY entrypoint.sh /entrypoint.sh 7 COPY entrypoint.sh /entrypoint.sh
6 8
7 RUN useradd -m ocr 9 RUN useradd -m ocr
8 10
9 -RUN apt-get -y update && \  
10 - apt-get -y install libfile-find-rule-perl libfile-find-rule-perl-perl imagemagick tesseract-ocr \  
11 - gettext tesseract-ocr-por tesseract-ocr-eng pdftk poppler-utils unpaper git build-essential 11 +RUN apt-get -y update
  12 +RUN apt-get -y install build-essential cmake libtool yasm pkg-config subversion git libgcj14 apt-utils
  13 +RUN apt-get -y install curl
  14 +RUN apt-get -y install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libnss3-dev
  15 +#RUN apt-get -y install ttf-mscorefonts-installer
  16 +
  17 +RUN apt-get -y install wget cabextract xfonts-utils
  18 +RUN wget -O mscorefonts.deb http://ftp.us.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.4+nmu1_all.deb && \
  19 + dpkg -i mscorefonts.deb
  20 +
  21 +# Instalação do Perl 5.1 e demais módulos
  22 +RUN apt-get -y install perl
  23 +RUN perl -MCPAN -e 'install File::Touch'
  24 +RUN perl -MCPAN -e 'install File::Find::Rule;'
  25 +RUN perl -MCPAN -e 'install File::Touch;'
  26 +RUN perl -MCPAN -e 'install Sys::Syslog;'
  27 +RUN perl -MCPAN -e 'install IPC::Open3;'
  28 +RUN perl -MCPAN -e 'install IO::Select;'
  29 +
  30 +# Tesseract-ocr 3.05, com dicionários inglês e português
  31 +# Bibliotecas para o Tesseract: Leptonica
  32 +RUN git clone https://github.com/DanBloomberg/leptonica.git
  33 +RUN apt-get -y install automake
  34 +RUN cd leptonica && ./autobuild && ./configure && make all install
  35 +
  36 +# Bibliotecas para o Tesseract: Libav
  37 +RUN git clone https://github.com/libav/libav.git
  38 +RUN export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ && \
  39 + cd libav && ./configure --enable-sram && make all install -w
  40 +
  41 +# Tesseract
  42 +RUN git clone https://github.com/tesseract-ocr/tesseract.git
  43 +RUN apt-get -y install autoconf-archive
  44 +RUN cd tesseract && ./autogen.sh && ./configure && make all install
  45 +
  46 +#RUN git clone https://github.com/tesseract-ocr/tessdata.git
  47 +#RUN cp -avR tessdata/* /usr/local/share/tessdata/
  48 +RUN wget https://github.com/tesseract-ocr/tessdata/blob/master/eng.traineddata?raw=true -O /usr/local/share/tessdata/eng.traineddata && \
  49 + wget https://github.com/tesseract-ocr/tessdata/blob/master/por.traineddata?raw=true -O /usr/local/share/tessdata/por.traineddata && \
  50 + wget https://github.com/tesseract-ocr/tessdata/blob/master/osd.traineddata?raw=true -O /usr/local/share/tessdata/osd.traineddata
  51 +
  52 +# export LIBCURL_LIBS=/usr/lib/x86_64-linux-gnu/ && \
  53 +RUN apt-get -y install libcurl4-gnutls-dev
  54 +RUN git clone https://anongit.freedesktop.org/git/poppler/poppler.git && \
  55 + cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && cd ..
  56 +
  57 +# pdftk, versão 2.02 ou superior
  58 +RUN wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
  59 +RUN apt-get -y install unzip libgcj14
  60 +RUN unzip pdftk-2.02-src.zip && rm -f pdftk-2.02-src.zip
  61 +
  62 +RUN apt-get -y update
  63 +RUN apt-get -y install build-essential libfile-find-rule-perl libfile-find-rule-perl-perl imagemagick
  64 +RUN apt-get -y install gettext unpaper git
  65 +RUN apt-get -y install libtiff5 libpng12-0 libjpeg-turbo8 zlib1g libpango1.0-0 libcairo2 fontconfig libwebp5
  66 +RUN apt-get -y install libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0
  67 +RUN apt-get -y install pkg-config libgcj14 libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev
  68 +RUN apt-get -y install zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libnss3-dev
  69 +
  70 +RUN apt-get install -y gcc
  71 +RUN apt-get install -y gcj-jdk
  72 +RUN cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install
  73 +
  74 +# Ghostscript 9.18 ou superior
  75 +RUN wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz
  76 +RUN tar xvozf ghostscript-9.18.tar.gz && rm -f ghostscript-9.18.tar.gz
  77 +RUN cd ghostscript-9.18 && ls && ./autogen.sh; ./configure && make all install
12 78
13 RUN git clone https://github.com/coherentgraphics/cpdf-binaries.git && \ 79 RUN git clone https://github.com/coherentgraphics/cpdf-binaries.git && \
14 cp cpdf-binaries/Linux-Intel-64bit/cpdf /usr/bin 80 cp cpdf-binaries/Linux-Intel-64bit/cpdf /usr/bin
15 81
16 -RUN perl -MCPAN -e 'install File::Touch' && \  
17 - perl -MCPAN -e 'install File::Find::Rule;' && \  
18 - perl -MCPAN -e 'install File::Touch;' && \  
19 - perl -MCPAN -e 'install Sys::Syslog;' && \  
20 - perl -MCPAN -e 'install IPC::Open3;' && \  
21 - perl -MCPAN -e 'install IO::Select;'  
22 -  
23 RUN chmod +x /usr/local/bin/ocr && \ 82 RUN chmod +x /usr/local/bin/ocr && \
24 chmod +x /etc/init.d/ocr && \ 83 chmod +x /etc/init.d/ocr && \
25 update-rc.d ocr defaults 84 update-rc.d ocr defaults
@@ -28,18 +87,16 @@ RUN mkdir /var/ocr-server/ && \ @@ -28,18 +87,16 @@ RUN mkdir /var/ocr-server/ && \
28 mkdir -p /var/ocr-server/Entrada && \ 87 mkdir -p /var/ocr-server/Entrada && \
29 mkdir -p /var/ocr-server/Saida && \ 88 mkdir -p /var/ocr-server/Saida && \
30 mkdir -p /var/ocr-server/Originais_Processados && \ 89 mkdir -p /var/ocr-server/Originais_Processados && \
31 - mkdir -p /var/ocr-server/Erro  
32 -RUN chmod +x entrypoint.sh 90 + mkdir -p /var/ocr-server/Erro && \
  91 + chmod +x entrypoint.sh
33 92
34 -RUN ln -s /usr/bin/pdftk /usr/local/bin/pdftk && \  
35 - ln -s /usr/bin/pdfimages /usr/local/bin/pdfimages && \  
36 - ln -s /usr/bin/tesseract /usr/local/bin/tesseract && \  
37 - ln -s /usr/bin/pdfinfo /usr/local/bin/pdfinfo && \  
38 - ln -s /usr/bin/pdffonts /usr/local/bin/pdffonts && \  
39 - ln -s /usr/bin/pdftoppm /usr/local/bin/pdftoppm && \  
40 - ln -s /usr/bin/cpdf /usr/local/bin/cpdf 93 +RUN mkdir -p /tmp/ocr_dev/ && \
  94 + mkdir -p /tmp/ocr_dev/Entrada && \
  95 + mkdir -p /tmp/ocr_dev/Saida && \
  96 + mkdir -p /tmp/ocr_dev/Originais_Processados && \
  97 + mkdir -p /tmp/ocr_dev/Erro && \
  98 + chmod -R 777 /tmp/ocr_dev
41 99
42 VOLUME /var/ocr-server/ 100 VOLUME /var/ocr-server/
43 101
44 -CMD ["bash", "entrypoint.sh"]  
45 - 102 +CMD ["bash", "entrypoint.sh"]
46 \ No newline at end of file 103 \ No newline at end of file
entrypoint.sh
1 #!/usr/bin/env bash 1 #!/usr/bin/env bash
2 2
3 -mkdir /var/ocr-server/  
4 -mkdir /var/ocr-server/Entrada  
5 -mkdir /var/ocr-server/Saida  
6 -mkdir /var/ocr-server/Originais_Processados  
7 -mkdir /var/ocr-server/Erro 3 +mkdir -p /var/ocr-server/
  4 +mkdir -p /var/ocr-server/Entrada
  5 +mkdir -p /var/ocr-server/Saida
  6 +mkdir -p /var/ocr-server/Originais_Processados
  7 +mkdir -p /var/ocr-server/Erro
8 chmod -R 777 /var/ocr-server 8 chmod -R 777 /var/ocr-server
9 9
10 service ocr start 10 service ocr start
usr/local/bin/ocr
@@ -77,7 +77,11 @@ use Sys::Hostname; @@ -77,7 +77,11 @@ use Sys::Hostname;
77 use IPC::Open3; 77 use IPC::Open3;
78 use IO::Select; 78 use IO::Select;
79 79
80 -my $DEBUG = 2; 80 +use Getopt::Std;
  81 +use vars qw( $opt_d );
  82 +
  83 +#my $DEBUG = 2
  84 +my $DEBUG = ($opt_d ? 2 : 0);
81 my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); 85 my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo | grep -e '^processor' | wc -l`);
82 my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; 86 my $MAX_FILES = ( !$DEBUG ? 2 : 1) ;
83 87