Merge branch 'Pre_versao_2.0' into 'master'

Pre versao 2.0 final para gerar Tag 2.0 Final See merge request !4

Merge branch 'Pre_versao_2.0' into 'master'
Pre versao 2.0 final para gerar Tag 2.0 Final See merge request !4
Nei Jobson da Costa Carneiro
2 parents 78ec197b a35873e6
Showing 7 changed files with 583 additions and 272 deletions Show diff stats
Dockerfile
INSTALL.txt
README.md
entrypoint.sh
usr/local/bin/ocr
workflow.pdf
workflow.vsd
@@ -0,0 +1,103 @@
+
+FROM ubuntu:14.04
+
+# OCR-SERVER project files: the service script, its Ubuntu init script and the
+# container entrypoint.
+COPY usr/local/bin/ocr /usr/local/bin/ocr
+COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr
+COPY entrypoint.sh /entrypoint.sh
+
+# Working directory for the downloads and source builds that follow.
+WORKDIR /tmp
+
+# Build tools, development headers and runtime libraries required by
+# ocr-server 2. The list is de-duplicated (libgcj14 and pkg-config used to be
+# listed twice) and sorted alphabetically so diffs stay readable. The apt
+# package index is removed in the same layer so it never ends up in the image.
+RUN apt-get -y update && \
+    apt-get -y install \
+        apt-utils \
+        autoconf-archive \
+        automake \
+        build-essential \
+        cabextract \
+        cmake \
+        curl \
+        fontconfig \
+        gcc \
+        gcj-jdk \
+        gettext \
+        git \
+        imagemagick \
+        libcairo2 \
+        libcairo2-dev \
+        libcurl4-gnutls-dev \
+        libfile-find-rule-perl \
+        libfile-find-rule-perl-perl \
+        libfontconfig1 \
+        libfontconfig1-dev \
+        libgcj14 \
+        libgettextpo-dev \
+        libgettextpo0 \
+        libgif-dev \
+        libicu-dev \
+        libjpeg-dev \
+        libjpeg-turbo8 \
+        libjpeg-turbo8-dev \
+        libjpeg8-dev \
+        libnss3-dev \
+        libopenjpeg-dev \
+        libpango1.0-0 \
+        libpango1.0-dev \
+        libpng-dev \
+        libpng12-0 \
+        libsys-syslog-perl \
+        libtiff-dev \
+        libtiff5 \
+        libtool \
+        libwebp5 \
+        perl \
+        pkg-config \
+        rsyslog \
+        subversion \
+        unpaper \
+        unzip \
+        wget \
+        xfonts-utils \
+        yasm \
+        zlib1g-dev && \
+    apt-get -y clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Microsoft core fonts installer, fetched from the Debian pool as a .deb and
+# installed with dpkg; the downloaded package is deleted in the same layer.
+RUN wget -O mscorefonts.deb http://ftp.us.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.4+nmu1_all.deb && \
+    dpkg -i mscorefonts.deb && \
+    rm mscorefonts.deb
+
+# Perl modules installed from CPAN (Perl itself comes from the apt step).
+# File::Touch used to be installed twice; the duplicate was dropped and the
+# installs now share a single layer.
+RUN perl -MCPAN -e 'install File::Touch;' && \
+    perl -MCPAN -e 'install File::Find::Rule;' && \
+    perl -MCPAN -e 'install Sys::Syslog;' && \
+    perl -MCPAN -e 'install IPC::Open3;' && \
+    perl -MCPAN -e 'install IO::Select;'
+
+# Tesseract OCR built from source, with English, Portuguese and OSD data files.
+# NOTE(review): the clones below use each repository's default branch, so the
+# versions actually built are not pinned -- confirm which releases are expected.
+
+# Library built before Tesseract: Leptonica
+RUN git clone https://github.com/DanBloomberg/leptonica.git && \
+    cd leptonica && ./autobuild && ./configure && make all install && \
+    rm -rf ../leptonica
+
+# Library built before Tesseract: Libav
+# NOTE(review): the PKG_CONFIG_PATH exported here only exists for this RUN
+# step; it is not visible to the Tesseract build in the next step.
+RUN git clone https://github.com/libav/libav.git && \
+    export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ && \
+    cd libav && ./configure --enable-sram && make all install && \
+    rm -rf ../libav
+
+# Tesseract itself
+RUN git clone https://github.com/tesseract-ocr/tesseract.git && \
+    cd tesseract && ./autogen.sh && ./configure && make all install && \
+    rm -rf ../tesseract
+
+# Trained data files. The URLs are quoted so that the shell can never treat
+# the '?' in the query string as a glob character.
+RUN wget "https://github.com/tesseract-ocr/tessdata/blob/master/eng.traineddata?raw=true" -O /usr/local/share/tessdata/eng.traineddata && \
+    wget "https://github.com/tesseract-ocr/tessdata/blob/master/por.traineddata?raw=true" -O /usr/local/share/tessdata/por.traineddata && \
+    wget "https://github.com/tesseract-ocr/tessdata/blob/master/osd.traineddata?raw=true" -O /usr/local/share/tessdata/osd.traineddata
+
+# Poppler, built from the poppler-0.56 ref with CMYK and libcurl enabled
+RUN git clone -b poppler-0.56 https://anongit.freedesktop.org/git/poppler/poppler.git && \
+    cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && \
+    rm -rf ../poppler
+
+# pdftk 2.02, built from the upstream source archive.
+# The build runs two levels below the working directory
+# (pdftk-2.02-dist/pdftk), so the source tree must be removed through ../../ --
+# the previous ../pdftk-2.02-dist pointed at a path that does not exist and
+# left the whole tree in the image.
+RUN wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip && \
+    unzip pdftk-2.02-src.zip && rm -f pdftk-2.02-src.zip && \
+    cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && \
+    rm -rf ../../pdftk-2.02-dist
+
+# Ghostscript 9.18, built from the release tarball.
+# NOTE(review): autogen.sh is followed by ';' rather than '&&', so the exit
+# status of every step before ./configure is ignored and ./configure alone
+# decides whether the build continues. Kept as in the original -- confirm
+# whether that tolerance is intentional.
+RUN wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz && \
+    tar xvozf ghostscript-9.18.tar.gz && rm -f ghostscript-9.18.tar.gz && \
+    cd ghostscript-9.18 && ./autogen.sh; ./configure && make all install && \
+    rm -rf ../ghostscript-9.18
+
+# cpdf: prebuilt 64-bit Linux binary taken from the cpdf-binaries repository.
+# The clone is deleted in the same layer once the binary has been copied, so
+# the binaries for the other platforms do not stay in the image.
+RUN git clone https://github.com/coherentgraphics/cpdf-binaries.git && \
+    cp cpdf-binaries/Linux-Intel-64bit/cpdf /usr/bin && \
+    rm -rf cpdf-binaries
+
+# Refresh the dynamic linker cache after the source installs above
+RUN ldconfig
+
+# Account named "ocr", created together with its home directory
+RUN useradd -m ocr
+
+# Make the service script, the init script and the entrypoint executable,
+# then register the init script for the default runlevels
+RUN chmod +x /usr/local/bin/ocr /etc/init.d/ocr /entrypoint.sh && \
+    update-rc.d ocr defaults
+
+# Folder layout under /var/ocr-server
+RUN mkdir -p \
+        /var/ocr-server/Entrada \
+        /var/ocr-server/Saida \
+        /var/ocr-server/Originais_Processados \
+        /var/ocr-server/Erro
+
+# Same layout under /tmp/ocr_dev, opened up to every user
+RUN mkdir -p \
+        /tmp/ocr_dev/Entrada \
+        /tmp/ocr_dev/Saida \
+        /tmp/ocr_dev/Originais_Processados \
+        /tmp/ocr_dev/Erro && \
+    chmod -R 777 /tmp/ocr_dev
+
+WORKDIR /
+
+# Declared only after the directory tree has been created
+VOLUME /var/ocr-server/
+
+CMD ["bash", "/entrypoint.sh"]
 \ No newline at end of file
@@ -1,202 +0,0 @@
-#	OCR Server 1.0.4b - (c) Agencia Nacional de Telecomunicacoees
-#
-#	This script monitors a set of input directories for PDF files
-#	once a new file is detected, it is processes through tesseract OCR
-#	in order to generate a new file with a hidden searchable text layer
-#
-#	It may be distributed under the conditions of the LGPL v2.1 license.
-#
-#	Author: Guilherme Chehab 
-#
-#	Version History:
-#	0.1	Initial single server version
-#	0.2	Check if page already has the html hidden layer, if so, ignore it
-#	0.3	Solved issues about various image enconding types
-#	0.4	Added a postnormalization step to ensure all output pdf pages have
-#		the same size and orientations as the original files
-#	0.5	Used input file renaming as a way to sync multiple parallel instances,
-#		that way, it is minimized the risk of same file being OCRed multiple times.
-#	0.6	Added a default handler for unknown image encoding using jpeg encoding	
-#	0.7	Solved an issue with files with more than 1000 pages
-#	1.0	First release version
-#	1.0.1	Solving error when file has no images
-#       1.0.2   Fix bug when counting cores for AMD processors
-#       1.0.3   Added better image type detection
-#       1.0.4   Fix: added ubuntu init script
-#       1.0.4b  Centos 6.9
-#
-#       TODO:   - Changes get_imgs and OCR processing to enable pages with more than one image -- it
-#               would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
-#               diferently but does not treat it adequately
-#               - Review poppler and cpdf install instructions
-#               - Add better handling of vectorized and non scanned pdf files
-#               - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
-#               - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible
-#
-#	Check software requirements on the comments bellow
-#
-#	To configure input dirs change @BASE_DIRS and @SUB_DIRS variables
-#
-#
-# O servidor OCR depende dos seguintes componentes:
-# - Perl 5.10.1, com seguintes módulos:
-#	- File::Find::Rule
-#	- File::Basename
-#	- File::Copy
-#	- File::Path
-#	- File::Touch
-#	- Sys::Syslog
-#	- Sys::Hostname
-#	- IPC::Open3
-#	- IO::Select
-#	- POSIX
-# - Tesseract-ocr 3.05, com dicionários inglês e português
-# - Pdftk 2.02
-# - Poppler-utils 0.42.0
-# - Cpdf 2.1
-# - ImageMagick 6.7.2-7
-#	
-# Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema
-# Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.
-# Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.
-#
-## ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.
-#
-# Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':
-#
-#	    @BASE_DIRS:	Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script 
-#	    @SUB_DIRS:		Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro
-#	    $MAX_FILES:	Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)
-#	    $MAX_PGS:		Número máximo de páginas que podem ser processadas simultanemante por arquivo de entrada (default: no. de CPUs)
-#	 Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.
-#	 Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.
-#	 A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.
-#
-#	 Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.
-#
-#
-# -----------------------  COMPILAÇÃO dos pré requisitos (obs.: os comandos de devem ser executados como root)
-#
-#
-# Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS 
-#
-# RedHat 6.7 e Centos 6.9:
-yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip
-yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel
-cd /tmp
-wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm
-rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm
-rm -f msttcore-fonts-2.0-3.noarch.rpm
-
-# Centos 6.9
-#   \_ autoconf-archive
-wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm
-rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm
-rm autoconf-archive-2012.04.07-7.3.noarch.rpm
-#   \_ GCC 4.8
-wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
-yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj
-
-# Ubuntu 14.04 Server:
-apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 
-apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev
-apt-get install ttf-mscorefonts-installer
-
-# Ambas plataformas:
-cd /usr/local/src
-
-for i in \
-	https://github.com/tesseract-ocr/langdata.git \
-	https://github.com/DanBloomberg/leptonica.git \
-	https://github.com/libav/libav.git  \
-	https://github.com/tesseract-ocr/tessdata.git \
-	https://github.com/tesseract-ocr/tesseract.git \
-	git://git.freedesktop.org/git/poppler/poppler.git \
-	git://git.freedesktop.org/git/poppler/test.git \
-	https://github.com/Flameeyes/unpaper.git \
-	https://github.com/ocaml/ocaml.git \
-	https://gitlab.camlcity.org/gerd/lib-findlib.git \
-	https://github.com/johnwhitington/camlpdf.git \
-	https://github.com/johnwhitington/cpdf-source.git \
-; do git clone $i; done
-
-wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
-unzip pdftk-2.02-src.zip
-rm -f pdftk-2.02-src.zip
-
-# pdftk, versão 2.02 ou superior
-cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..
-
-# Centos 6.9
-#   \_ Cria um novo shell usando o GCC 4.8 por default
-scl enable devtoolset-2 bash  
-
-# Tesseract, versão 3.05-dev ou superior
-# Bibliotecas para o Tesseract: Leptonica e Libav
-cd leptonica && ./autobuild && ./configure && make all install && cd ..
-
-# Para compilação do Tesseract após a compilação do leptonica
-export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/
-
-cd libav && ./configure --enable-sram && make all install && cd ..
-
-# Tesseract
-cd tesseract && ./autogen.sh && ./configure && make all install && cd ..
-cp -avR tessdata/* /usr/local/share/tessdata/
-
-# cpdf, versão 2.1 ou superior
-cd ocaml && ./configure && make world.opt && make install && cd ..
-mkdir -p /usr/local/man/man5
-# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente 
-cd lib-findlib  && ./configure && make all && make install && cd ..
-cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..
-cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..
-
-# poppler-utils, versão 0.42.0 ou superior
-cd poppler && ./autogen.sh && ./configure && make  all install && cd ..
-
-# Centos 6.9
-#   \_ Termina o shell usando o GCC 4.8 por default
-exit
-
-# -----------------------  INSTALAÇÃO (obs.: os comandos de devem ser executados como root)
-
-## Comandos adicionais para configuração do módulo:
-	
-# Criação do usuário
-adduser ocr
-
-# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional
-cp ./usr/local/bin/ocr /usr/local/bin
-
-# Auto start (RedHat 6.7 e CentOs 6.9)
-cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr 
-mv /etc
-chkconfig --add ocr
-chkconfig --level 2345 ocr on
-
-# Auto start (Ubuntu 14.04)
-cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr
-update-rd.d ocr defaults
-
-# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações
-cd /home/ocr
-tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr
-su
-
-# Copie o pacote para os outros servidores e extraia com:
-cd /
-tar xovzf pkg-ocr.tgz
-
-# Instalando pré-requisitos RUNTIME em servidores adicionais
-
-# Redhat 6.7 e CentOS 6.9
-yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp
-yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext 
-
-# Ubuntu 14.04
-apt-get install  libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 
-apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0
-
-# Inicie o serviço com
-service ocr start
@@ -0,0 +1,260 @@
+#	OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes
+
+This script monitors a set of input directories for PDF files. Once a new file is detected, it is processed through Tesseract OCR in order to generate a new file with a hidden searchable text layer.
+
+It may be distributed under the conditions of the LGPL v2.1 license.
+
+Author: Guilherme Chehab 
+
+##	Version History:
+ - 0.1
+ 	- Initial single server version
+ - 0.2
+ 	- Check if page already has the html hidden layer, if so, ignore it
+ - 0.3
+ 	- Solved issues with various image encoding types
+ - 0.4
+ 	- Added a postnormalization step to ensure all output pdf pages have the same size and orientations as the original files
+ - 0.5
+ 	- Used input file renaming as a way to sync multiple parallel instances, that way, it is minimized the risk of same file being OCRed multiple times.
+ - 0.6
+ 	- Added a default handler for unknown image encoding using jpeg encoding
+ - 0.7
+ 	- Solved an issue with files with more than 1000 pages
+ - 1.0
+ 	- First release version
+ 	- 1.0.1   Solving error when file has no images
+ 	- 1.0.2   Fix bug when counting cores for AMD processors
+ 	- 1.0.3   Added better image type detection
+ 	- 1.0.4   Fix: added ubuntu init script
+ 	- 1.0.4b  Add Centos 6.9 install instructions
+ - 2.0
+ 	- PDF/A output, and better compression with ghostscript
+	- Rewritten image extraction, processing and transformation process
+	- Check if input file is signed; in this case, the file contents are not changed
+	- Added '--oem 0' option to tesseract (force legacy mode on tesseract 4)
+	- Use operating system packages by default
+	- Changed paths from external programs, instead of using full paths, uses first match from $PATH
+	- Check existence of external programs on path before running
+	- Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
+	- Fix: create subpaths on error folder
+	- Fix: trying to reduce overhead on temporary folder
+	 
+##	TODO:
+ - Change get_imgs and OCR processing to support pages with more than one image -- previous versions assumed #pages = #imgs and would not work. Version 1.0.1 counts them differently but does not handle them adequately -- this will require better handling of the PDF's internal structure
+ - Review poppler and cpdf install instructions
+ - Add better handling of vectorized and non scanned pdf files
+ - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current scaling, cropping and rotation handlers
+ - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- added function to analyse image color histogram -> just need to add option to convert it to B&W.
+ - Move all parameters to config file
+ - Add some job control web interface
+ - Add end user interface to submit files through web
+ - Add check external programs version requirements before running
+  
+##	BUGS:
+ - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than original, this is due to using pdftoppm instead of pdfimages
+ 
+##	Requirements: 
+ - Perl 5.10.1, com seguintes módulos:
+	- File::Find::Rule
+	- File::Basename
+	- File::Copy
+	- File::Path
+	- File::Touch
+	- Sys::Syslog
+	- Sys::Hostname
+	- IPC::Open3
+	- IO::Select
+	- POSIX
+ - Tesseract-ocr 3.05, com dicionários inglês e português
+ - Pdftk 2.02
+ - Poppler-utils 0.42.0
+ - Cpdf 2.1
+ - ImageMagick 6.7.2-7
+ - Ghostscript 9.18
+
+Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes pode comprometer o correto funcionamento do sistema.
+
+Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.
+
+Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.
+
+ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.
+
+### Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':
+
+- @BASE_DIRS:	Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script 
+- @SUB_DIRS:		Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro
+- $MAX_FILES:	Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)
+- $MAX_PGS:		Número máximo de páginas que podem ser processadas simultaneamente por arquivo de entrada (default: no. de CPUs)
+
+Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.
+
+Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.
+
+A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.
+
+Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.
+
+
+# Container Docker
+
+    O OCR-Server também está disponível como um container Docker, permitindo o rápido provisionamento da solução em ambiente de produção. Todos os procedimentos para construção da imagem do container podem ser encontrados no arquivo Dockerfile.
+	
+	Para execução do serviço, basta que o Docker esteja instalado no servidor e executar o seguinte comando:
+
+        docker run --name <NOME_CONTAINER> -d -v <DIRETORIO_BASE>:/var/ocr-server guilhermeadc/ocr-server
+
+    Onde:
+    --name : Nome atribuído à instância do container. Ex: ocr-server
+    -d : Indicação executar o container em background 
+    -v : Diretório de compartilhamento entre o servidor host e o container.
+         O parâmetro <DIRETORIO_BASE> deve ser substituído pelo diretório base para busca de arquivos.
+
+    Para visualizar os logs de processamento do serviço, basta executar o seguinte comando:
+        docker logs <NOME_CONTAINER>
+
+
+# COMPILAÇÃO dos pré requisitos (obs.: os comandos devem ser executados como root)
+
+Em servidor Ubuntu 16.04, os pacotes padrão (com exceção do CPDF, que não tem no repositório oficial) 
+são suficientes para executar o aplicativo, não havendo necessidade de compilar todos, assim é a arquitetura recomendada
+
+Quanto ao CPDF, é possível baixar a versão binária em: https://github.com/coherentgraphics/cpdf-binaries
+
+## Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS 
+
+    # RedHat 6.7 e Centos 6.9:
+	yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip
+	yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel libcurl-devel nss-devel
+	cd /tmp
+	wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm
+	rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm
+	rm -f msttcore-fonts-2.0-3.noarch.rpm
+
+    # Centos 6.9
+    #   \_ autoconf-archive
+	wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm
+	rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm
+	rm autoconf-archive-2012.04.07-7.3.noarch.rpm
+    #   \_ GCC 4.8
+	wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
+	yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj
+
+    # Ubuntu 14.04 Server:
+	apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 
+	apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libcurl-dev  libnss3-dev
+	apt-get install ttf-mscorefonts-installer
+
+    # Ambas plataformas:
+	cd /usr/local/src
+
+	for i in \
+		https://github.com/tesseract-ocr/langdata.git \
+		https://github.com/DanBloomberg/leptonica.git \
+		https://github.com/libav/libav.git  \
+		https://github.com/tesseract-ocr/tessdata.git \
+		https://github.com/tesseract-ocr/tesseract.git \
+		git://git.freedesktop.org/git/poppler/poppler.git \
+		git://git.freedesktop.org/git/poppler/test.git \
+		https://github.com/Flameeyes/unpaper.git \
+		https://github.com/ocaml/ocaml.git \
+		https://gitlab.camlcity.org/gerd/lib-findlib.git \
+		https://github.com/johnwhitington/camlpdf.git \
+		https://github.com/johnwhitington/cpdf-source.git \
+		http://git.ghostscript.com/ghostpdl.git \
+	; do git clone $i; done
+
+	wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
+	unzip pdftk-2.02-src.zip
+	rm -f pdftk-2.02-src.zip
+
+    # pdftk, versão 2.02 ou superior
+    cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..
+
+    # Ghostscript 9.18 ou superior
+    #wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.21.tar.gz
+    #tar xvozf ghostscript-9.21.tar.gz
+    #rm -f ghostscript-9.21.tar.gz
+    #cd ghostscript-9.21
+    cd ghostpdl
+    ./autogen.sh; ./configure
+    make all install
+    cd ..
+
+    # Centos 6.9
+    #   \_ Cria um novo shell usando o GCC 4.8 por default
+    scl enable devtoolset-2 bash  
+
+    # Tesseract, versão 3.05-dev ou superior
+    # Bibliotecas para o Tesseract: Leptonica e Libav
+    cd leptonica && ./autobuild && ./configure && make all install && cd ..
+
+    # Para compilação do Tesseract após a compilação do leptonica
+    export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/
+
+    cd libav && ./configure --enable-sram && make all install && cd ..
+
+    # Tesseract
+    cd tesseract && ./autogen.sh && ./configure && make all install && cd ..
+    cp -avR tessdata/* /usr/local/share/tessdata/
+
+    # cpdf, versão 2.1 ou superior
+    cd ocaml && ./configure && make world.opt && make install && cd ..
+    mkdir -p /usr/local/man/man5
+    # lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente 
+    cd lib-findlib  && ./configure && make all && make install && cd ..
+    cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..
+    cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..
+
+    # poppler-utils, versão 0.42.0 ou superior
+    cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make  all install && cd ..
+
+    # Centos 6.9
+    #   \_ Termina o shell usando o GCC 4.8 por default
+    exit
+
+
+## Comandos adicionais para configuração do módulo:
+	
+    # Criação do usuário
+    adduser ocr
+
+    # Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional
+    cp ./usr/local/bin/ocr /usr/local/bin
+
+    # Auto start (RedHat 6.7 e CentOs 6.9)
+    cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr 
+    mv /etc
+    chkconfig --add ocr
+    chkconfig --level 2345 ocr on
+    
+    # Auto start (Ubuntu 14.04)
+    cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr
+    update-rc.d ocr defaults
+    
+    # Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações
+    cd /home/ocr
+    tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr
+    su
+
+# INSTALAÇÃO (obs.: os comandos devem ser executados como root)
+    # Criação do usuário
+    adduser ocr
+
+    # Copie o pacote para os outros servidores e extraia com:
+    cd /
+    tar xovzf pkg-ocr.tgz
+
+    # Instalando pré-requisitos RUNTIME em servidores adicionais
+
+    # Redhat 6.7 e CentOS 6.9
+    yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp ghostscript
+    yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext 
+
+    # Ubuntu 14.04
+    apt-get install  libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 
+    apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 ghostscript
+
+# Inicie o serviço com
+    service ocr start
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+#
+# Container entrypoint: starts the syslog daemon, prepares the monitored
+# folders, starts the OCR service and then follows the syslog in the
+# foreground, which keeps the container running and sends the log lines to
+# the container's output.
+
+# Start the logging service before anything else
+/etc/init.d/rsyslog start
+
+# Create the folder structure that is monitored for files. This is repeated
+# at start-up because /var/ocr-server is a volume and may be mounted empty.
+mkdir -p \
+    /var/ocr-server/Entrada \
+    /var/ocr-server/Saida \
+    /var/ocr-server/Originais_Processados \
+    /var/ocr-server/Erro
+chmod -R 777 /var/ocr-server
+
+# Start the OCR-Server service
+service ocr start
+
+# Follow the log by name (-F): tail keeps retrying if /var/log/syslog has not
+# been created yet or is replaced later, instead of exiting and stopping the
+# container. Deliberately not run through `exec`, so this shell stays the
+# container's first process.
+tail -F /var/log/syslog
 \ No newline at end of file
-#! /usr/bin/perl -w
+#!/usr/bin/perl -w
 #
-#	OCR Server 1.0.4 - (c) Agencia Nacional de Telecomunicacoes
+#	OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes
 #
 #	This script monitors a set of input directories for PDF files
 #	once a new file is detected, it is processes through tesseract OCR
@@ -24,15 +24,38 @@
 #	1.0.1	Solving error when file has no images
 #	1.0.2	Fix bug when counting cores for AMD processors
 #	1.0.3	Added better image type detection	
-#   1.0.4   Fix: added ubuntu init script
+#       1.0.4   Fix: added ubuntu init script
+#	1.0.4b	Add Centos 6.9 install instructions
+#	2.0	PDF/A output, and better compression with ghostscript --> for this to work, Tesseract 4.0 is
+#		strongly recommended
+#		Rewritten image extraction, processing and transformation process
+#		Check if input file is signed; in this case, the file contents are not changed
+#		Added '--oem 0' option to tesseract (force legacy mode on tesseract 4)
+#		Use operating system packages by default
+#		Changed paths from external programs, instead of using full paths, uses first match from $PATH
+#		Check existence of external programs on path before running
+#		Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
+#		Fix: create subpaths on error folder
+#		Fix: trying to reduce overhead on temporary folder
 #
 #	TODO: 	- Changes get_imgs and OCR processing to enable pages with more than one image -- it
-#		would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
-#		diferently but does not treat it adequately
+#		would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them
+#		diferently but does not treat it adequately -- shall require better pdf´s internal structure handling
 #		- Review poppler and cpdf install instructions
 #		- Add better handling of vectorized and non scanned pdf files
-#		- Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
-#		- Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible
+#		- Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current 
+#		scalling, cropping and rotation handlers
+#		- Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- 
+#		added function to analyse image color histogram -> just need to add option to convert it to B&W.
+#		- Move all parameters to config file
+#		- Add some job control web interface
+#		- Add end user interface to submit files through web
+#		- Add check external programs version requirements before running
+#
+#	BUGS:	- When image is of type stencil or encoding image, cropping information is lost, and page is shown different than
+#		original, this is due to using pdftoppm instead of pdfimages 
+#		- Although not properly a BUG, in the new version, the addition of a step to convert to PDF/A and other changes
+#		significantly increased the time to OCR a page, from a mean time of 1 sec/page to 3 secs/page on a 16-core server
 #
 #	Check software requirements on the comments bellow
 #
@@ -55,7 +78,7 @@ use IPC::Open3;
 use IO::Select;
  
 my $DEBUG = 0;
-my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo  | grep -e '^processor' | wc -l`);
+my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo  | grep -e '^processor' | wc -l`);
 my $MAX_FILES = ( !$DEBUG ? 2 : 1) ;
  
 my $USER = 'ocr';
@@ -63,39 +86,48 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca
  
 # Command dependencies
  
-# depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher
-my $TESSERACT = '/usr/local/bin/tesseract -l por+eng';
+# depends on tesseract-ocr and tesseract-ocr-por 3.05-dev or higher -- for PDF/A, Tesseract 4.0 is recommended
+my $TESSERACT = 'tesseract --oem 0'; 		# if Tesseract => 4.0
+#my $TESSERACT = 'tesseract';			# if Tesseract < 4.0
  
 # Depends on pdftk 2.02 or higher
-my $PDFTK = '/usr/local/bin/pdftk';
+my $PDFTK = 'pdftk';
  
 # Depends on poppler-utils 0.42.0 or higher
-#my $PDINFO = '/usr/local/bin/pdfinfo';
-my $PDFFONTS = '/usr/local/bin/pdffonts';
-my $PDFIMAGES = '/usr/local/bin/pdfimages';
-my $PDFTOPPM = '/usr/local/bin/pdftoppm';
+my $PDFFONTS = 'pdffonts';
+my $PDFIMAGES = 'pdfimages';
+my $PDFTOPPM = 'pdftoppm';
+my $PDFUNITE = 'pdfunite';
+my $PDFSIG = 'pdfsig';
  
 # Depends on cpdf 2.1 or higher
-my $CPDF = '/usr/local/bin/cpdf';
+my $CPDF = 'cpdf';
+
+# Depends on Ghostscript 9.18
+my $GS = 'gs';
  
 ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner
-my $CONVERT = '/usr/bin/convert';
+my $CONVERT = 'convert';
  
 # If it is needed further filtering
 #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 ';
  
-my @BASE_DIRS = (	'/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/',
-			'/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' );
+#my @BASE_DIRS = (	'/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/',
+#			'/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' );
+
+my @BASE_DIRS = ('/var/ocr-server/');
  
 my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' );
  
 @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2);
-%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG);
+%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG==2);
  
 # Safeguard in case cpuinfo did not correctly identify the number of CPUs
 $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS;
  
-$ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin';
+$ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin';
+$ENV{'IFS'} = '\t\n';
+
 my ($host) = split/\./,hostname;
  
 use vars qw/*name *dir *prune/;
@@ -107,14 +139,15 @@ sub main;
 sub get_pages;
 sub get_rotation;
 sub get_res;
-sub is_ocred;
 sub is_locked_ex;
  
  
 my $expr = 'use POSIX qw(setsid)';
  
 my ($dumb1, $dumb2, $uid) = getpwnam ($USER);
-setuid ($uid) or warn "Cant set uid $uid";
+if (defined $uid) {
+	setuid ($uid) or warn "Cant set uid $uid";
+}
  
 $SIG{__DIE__}  = 'DEFAULT';
 $SIG{__WARN__} = \&die_when_called;
@@ -126,6 +159,11 @@ if ($@) {
 chdir('/') or die "$0: cannot chdir '/': $!\n";
 open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n";
  
+foreach my $exec ( $TESSERACT, $PDFTK, $PDFFONTS, $PDFIMAGES, $PDFSIG, $CPDF, $GS, $CONVERT) {
+	die "Error: $exec not found on path: $ENV{PATH}, check dependencies\n" if ( `which $exec | wc -l ` == 0);
+}
+
+
 foreach my $DIR (@BASE_DIRS) {
  
     defined(my $pid = fork) or die "$0: cannot fork: $!\n";
@@ -135,7 +173,7 @@ foreach my $DIR (@BASE_DIRS) {
 	main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR}); 
 	exit 0;
 	last;
-    } 
+    }
 }
  
 exit 0;
@@ -157,7 +195,7 @@ sub main {
 	#  remove .tmp file
 	unlink ( find ( file => name =>  qr/\.${host}\.tmp$/i , in => ${IN} ) );
  
-	# Rename files that were in 'processig' back
+	# Rename files that were in 'processing' state back
 	foreach my $file ( find ( file => name =>  qr/\.${host}\.processing$/i , in => ${IN} ) ) {
 		my $old_name = $file;
 		$old_name =~ s/\.${host}\.processing$//g;
@@ -177,12 +215,14 @@ sub main {
 	# Main loop
 	while ( 1 ) {	
 		select (undef, undef, undef, rand 3); 	# Random sleep so multiple instances dont get synced
+
 		$files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name =>  qr/\.pdf$/i , in => ${IN} ));
 		print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in;
 		$count = scalar keys %files_in;
-		foreach my $file (keys %files_in) {
  
-			next if ( glob ("$file.*.tmp")); 		
+		foreach my $file (sort { ((-f $a) ? (stat $a)[9] : 0) <=> ((-f $b) ? (stat $b)[9] : 0)} keys %files_in ) {
+
+			next if ( glob ("\"$file.*.tmp\"")); 		
  
 			select (undef, undef, undef, 1 + rand 2);	# sleep between 1 and 3 seconds
 			next if (!defined $files_in{$file});	# continue only if it is still valid
@@ -255,7 +295,7 @@ sub ocr {
 		remove_tree ($tmpdir,{ error=> \my $dumb });
 		unlink ("$in_file.$host.tmp");
 		move ( "$in_file.$host.processing", $in_file);
-		exit 0;
+		exit 1;
 	}; 
  
 	my $out_path = $in_path;
@@ -271,7 +311,7 @@ sub ocr {
 	my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: "");
  
 	print "\twritting to $out_file\n" if $DEBUG;
-				
+	
 	my $stime = time;
 	my %pids;
  
@@ -291,8 +331,26 @@ sub ocr {
 		remove_tree ($tmpdir,{ error=> \my $dumb });
 		unlink ("$in_file.$host.tmp");
                 move ( "$in_file.$host.processing", $in_file);
+		print "Error: cannot copy $in_file to temp dir \n" if $DEBUG;
+		syslog ("error","cannot copy $in_file to temp dir") if !$DEBUG;
+		exit 1;
 	};
  
+	# Check if file was signed
+	if (get_sign($tmp_file)) {
+		if (!copy ("$in_file.$host.processing", $proc_file)) {
+	                remove_tree ($tmpdir,{ error=> \my $dumb });
+        	        unlink ("$in_file.$host.tmp");
+	                move ( "$in_file.$host.processing", $in_file);
+        	};
+		move ("$in_file.$host.processing", $out_file);
+       	        unlink ("$in_file.$host.tmp");
+		print "OCR processed: $in_file not OCRed due to having a signature within" if $DEBUG;
+		syslog ("info","OCR processed: $in_file not OCRed due to having a signature within") if !$DEBUG;
+
+		exit 0;
+	}
+
 	# Extract pages
 	($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf");
         if ($DEBUG) {
@@ -301,12 +359,13 @@ sub ocr {
                 print "\t\t\t$_" for @err ;
         };
  
+	my ($pages, @pg_w, @pg_h, @pg_r,  @pg_crop_x1, @pg_crop_y1, @pg_crop_x2, @pg_crop_y2);
+	$pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r, \@pg_crop_x1, \@pg_crop_y1, \@pg_crop_x2, \@pg_crop_y2);
  
-	my ($pages, @pg_w, @pg_h, @pg_r);
-	$pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r);
+	my ($imgs,@page_img,  @img_w, @img_h, @img_t, @img_xppi, @img_yppi);
+	$imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t, \@img_xppi, \@img_yppi);
  
-	my ($imgs,@page_img,  @img_w, @img_h, @img_t);
-	$imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t);
+	unlink ($tmp_file) if (!$DEBUG);
  
 	for ( my $i=0; $i< $pages; $i++ ) {
 		my $pg = sprintf ("pg_%06d", $i+1);
@@ -333,25 +392,29 @@ sub ocr {
 			if (! defined $img_t[$i] ) {
 				move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf");
 				print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG;
-				exit 0;
+				exit -1;
 			}
  
-			print "\t\t${in_file}: ".(${i}+1)." / $pages:  $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i]\n" if $DEBUG;
+			print "\t\t${in_file}: ".(${i}+1)." / $pages:  $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i] " if $DEBUG;
+			print "(cropbox: $pg_crop_x1[$i] x $pg_crop_y1[$i] - $pg_crop_x2[$i] x $pg_crop_y2[$i])\n" if (defined $pg_crop_x1[$i] && $DEBUG);
+			print "\n" if ($DEBUG);
  
+			# Extract images from page, since 2.0 uses png lossless format regardless of original format or depth
 			undef $cmd;
  
-			if ($img_t[$i] eq "gray") {
-				$cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
+			# Use PDFIMAGES and JPEG by default
+			$cmd = "${PDFIMAGES} -j \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
+
+			if ($img_t[$i] eq "stencil") {
+				$cmd = "${PDFTOPPM} -tiff -tiffcompression deflate -scale-to-x $img_w[$i] -scale-to-y $img_h[$i]  \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
 			}
  
-			if ($img_t[$i] eq "rgb") {
-				$cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i]  \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
-				$pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM
+			if ($img_t[$i] eq "gray") {
+				$cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
 			}
  
-			if (!defined $cmd) {
-				$cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i]  \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
-				$pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM
+			if ($img_t[$i] !~ /gray|rgb|stencil/) {
+				$cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
 			}
  
 			($exit,$cmd,@out,@err) = exec_cmd($cmd);
@@ -362,7 +425,13 @@ sub ocr {
                         };
  
 			# Process each resulting image for page pdf
-			my @images = ( find ( file => name =>  qr/${pg}.*\.(jpg|tif)/i , in => ${tmpdir} )) ;
+			my @images = ( find ( file => name =>  qr/${pg}.*\.(jpg|tif|tiff|jpeg|jp2|jb2|png)/i , in => ${tmpdir} )) ;
+
+			if (scalar @images == 0)  {
+				move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf");
+				print "\t\t${in_file}: ".(${i}+1)." / $pages: Page was not exported as a tesseract supported format -- not OCRing\n" if $DEBUG;
+				exit 0;
+			}
  
 			foreach my $image (@images) { 
 				print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG;
@@ -378,43 +447,65 @@ sub ocr {
 						print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n";
 					}
 				}
-	
-				# Check if page was rotated
-				if ($pg_r[$i]) {
-					print "\t\t\t${image} unrotate: $pg_r[$i] graus ".(${i}+1)." / $pages\n" if $DEBUG;
-					($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate $pg_r[$i] \"$image\"");
+
+				# Check if page was rotated and extracted with pdftoppm
+				if ($cmd =~ /\Q$PDFTOPPM/ && $pg_r[$i]) {
+					print "\t\t\t${image} unrotate: -$pg_r[$i] degs ".(${i}+1)." / $pages\n" if $DEBUG;
+					($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate ". (360 - $pg_r[$i])." \"$image\"");
 					if ($DEBUG) { 
 						print "\t\t\t${image} -> $cmd: $exit\n";
 						print "\t\t\t\t$_" for @out ;
 						print "\t\t\t\t$_" for @err ;
 					};
 				}
-
+	
 				# Filter ppm images, if needed
  
 				# OCR ppm images to pdf pages
-				($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} \"${image}\" \"${image}\" pdf");
+				($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} -l por+eng \"${image}\" \"${image}\" pdf");
 				if ($DEBUG) { 
 					print "\t\t\t${image} -> $cmd: $exit\n";
 					print "\t\t\t\t$_" for @out ;
 					print "\t\t\t\t$_" for @err ;
 				};
+				unlink ("$image") if (!$DEBUG);
  
-				# Scale to fit pdf
-				($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf");
+				# Scale, crop and rotate to fit pdf
+				($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\"  \"${image}\".pdf -o \"${image}\"-cpdf.pdf");
 				if ($DEBUG) { 
 					print "\t\t\t${image} -> $cmd: $exit\n";
 					print "\t\t\t\t$_" for @out ;
 					print "\t\t\t\t$_" for @err ;
 				};
+				unlink ("$image.pdf") if (!$DEBUG);
  
+				if (defined $pg_crop_x1[$i]) {
+					# adjust cropbox
+					($pg_crop_x1[$i], $pg_crop_y1[$i],$pg_crop_x2[$i],$pg_crop_y2[$i]) = (
+						($pg_crop_x1[$i]<$pg_crop_x2[$i]?$pg_crop_x1[$i]:$pg_crop_x2[$i]),
+						($pg_crop_y1[$i]<$pg_crop_y2[$i]?$pg_crop_y1[$i]:$pg_crop_y2[$i]),
+						abs($pg_crop_x2[$i]-$pg_crop_x1[$i]),abs($pg_crop_y2[$i]- $pg_crop_y1[$i])
+					);
+
+					($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -crop \"$pg_crop_x1[$i] $pg_crop_y1[$i] $pg_crop_x2[$i] $pg_crop_y2[$i]\" \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf");
+					if ($DEBUG) {
+                                        	print "\t\t\t${image} -> $cmd: $exit\n";
+	                                        print "\t\t\t\t$_" for @out ;
+	                                        print "\t\t\t\t$_" for @err ;
+        	                        };
+				}
+
+				if ($pg_r[$i]) {
+					($exit,$cmd, @out,@err) = exec_cmd( "${CPDF} -rotate $pg_r[$i] \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf");
+					if ($DEBUG) { 
+						print "\t\t\t${image} -> $cmd: $exit\n";
+						print "\t\t\t\t$_" for @out ;
+						print "\t\t\t\t$_" for @err ;
+					};
+				}
  
-				unlink ("${tmpdir}/${pg}.pdf") if (!$DEBUG);
-				unlink ("$image.pdf") if (!$DEBUG);
-				move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}.pdf.old") if ($DEBUG);
-				unlink ("$image") if (!$DEBUG);
 			}
-			exit 0;
+			exit 1;
 		}
 	}
  
@@ -427,28 +518,51 @@ sub ocr {
  
 	if (scalar @new_pages != $pages) {
 		print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG);
-		syslog ("info","OCR: $in_file, number of output pages differ") if (!$DEBUG);
+		syslog ("err","OCR: $in_file, number of output pages differ") if (!$DEBUG);
 		unlink "$in_file.$host.tmp";
+		make_path ($error_path) if ( ! -d $error_path);
 		move ("$in_file.$host.processing", $error_file);
-		exit (0);
+		exit (1);
 	}
  
-	# Merge resulting pdf pages to a single pdf
+	# Merge resulting pdf pages to a single pdf, convert to PDF/A and copy to output
 	make_path ($out_path) if ( ! -d $out_path);
 	unlink $out_file if ( -f $out_file );
-	($exit, $cmd, @out,@err) = exec_cmd("${PDFTK} \"${tmpdir}\"/pg_*-cpdf.pdf cat output \"${out_file}.tmp\" compress");
+
+	chdir (${tmpdir});
+	($exit, $cmd, @out,@err) = exec_cmd("${GS} -dQUIET -dBATCH -dNOPAUSE -dNOINTERPOLATE -dCompatibilityLevel=1.7 -dNumRenderingThreads=${MAX_PGS} -sDEVICE=pdfwrite -dAutoRotatePages=/None -sColorConversionStrategy=/RGB -sProcessColorModel=DeviceRGB -dAutoFilterColorImages=true -dAutoFilterGrayImages=true -dJPEGQ=95 -dPDFA=2 -dPDFACompatibilityPolicy=1 -sOutputFile=\"${tmp_file}\"  pg_*-cpdf.pdf ");
 	if ($DEBUG) {
 		print "\t\t${out_file} -> $cmd: $exit\n";
 	        print "\t\t\t$_" for @out ;
         	print "\t\t\t$_" for @err ;
 	};
+	if ($exit) {
+		unlink "$in_file.$host.tmp";
+		unlink $out_file;
+		make_path ($error_path) if ( ! -d $error_path);
+                move ("$in_file.$host.processing", $error_file);
+		print "\t\t${out_file} -> Error concatenating pages and converting to PDF/A (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG);
+                syslog ("err","OCR: $in_file, error concatenating pages and converting to PDF/A") if (!$DEBUG);
+                exit (1);
+        }
+	chdir ("/"); 
+
+	if (!copy (${tmp_file}, $out_file)) {
+                remove_tree ($tmpdir,{ error=> \my $dumb });
+                unlink ("$in_file.$host.tmp");
+		unlink $out_file;
+		make_path ($error_path) if ( ! -d $error_path);
+                move ("$in_file.$host.processing", $error_file);
+		print "Error: cannot copy temp file to $out_file \n" if $DEBUG;
+		syslog ("error","cannot copy temp file to $out_file") if !$DEBUG;
+		exit 1;
+        };
  
 	make_path ($proc_path) if ( ! -d $proc_path);
 	unlink $proc_file if ( -f $proc_file );
 	move ("$in_file.$host.processing", $proc_file);
 	move ("${out_file}.tmp", ${out_file});
  
-
 	# Remove temp dir
 	remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG);
 	unlink $tmp_file if (!$DEBUG);
@@ -471,7 +585,7 @@ sub is_ocred {
 }	
  
 sub get_pages {
-	my ($in_file, $w, $h, $r) = @_;
+	my ($in_file, $w, $h, $r, $x1, $y1, $x2, $y2) = @_;
  
 	my $pages=0;
 	my $i=0;
@@ -485,29 +599,35 @@ sub get_pages {
 		($dumb, $pages) = split / {1,}/  if ( $_ =~ /NumberOfPages:/ );
 		($dumb, $i )    = split / {1,}/  if ( $_ =~ /PageMediaNumber:/ );
 		($dumb, @$r[$i-1]) = split / {1,}/  if ( $_ =~ /PageMediaRotation:/ );
-		($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/  if ( $_ =~ /PageMediaDimensions:/ );
+		($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ );
+		($dumb, @$x1[$i-1], @$y1[$i-1], @$x2[$i-1], @$y2[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaCropRect:/ );
 	}
  
 	return $pages;
 }
  
 sub get_imgs {
-	my ($in_file, $page_img, $w, $h, $t) = @_;
-        my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc);
+	my ($in_file, $page_img, $w, $h, $t, $x_ppi, $y_ppi) = @_;
+        my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi );
  
 	my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\"");
+	$i = 0;
  
 	foreach my $line (@lines)  {
                 chomp $line;
 		$line =~ s/^ {1,}//;
-		if ( $line =~  /image|mask/ ) {
-			($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line;
+		if ( $line !~  /^page|^----/ ) {
+			($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi) = split / {1,}/,$line;
 			@$page_img[$page-1]=$i;
 			@$w[$page-1] = $width;
 			@$h[$page-1] = $height;
 			@$t[$page-1] = "rgb"; 	# Default is color
-			@$t[$page-1] = ( $comp == 3 || $bpc >  1 || $enc   eq "jpeg" || $color eq "-"    || $color eq "icc"  ? "rgb"  : @$t[$page-1]); 
 			@$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc   eq "ccitt"|| $color eq "gray" ||  $type eq "mask" ? "gray" : @$t[$page-1]); 
+			@$t[$page-1] = ( $comp == 3 || $bpc >  1 || $enc   eq "jpeg" || $color eq "-"    || $color eq "icc"  ? "rgb"  : @$t[$page-1]); 
+			@$t[$page-1] = ( $type eq "stencil" ? $type :  @$t[$page-1]);
+			@$t[$page-1] = ( $enc  eq "image"   ? $enc  :  @$t[$page-1]);
+			@$x_ppi[$page-1] = $xppi;
+			@$y_ppi[$page-1] = $yppi;
 		}
         }
 	return $i+1;
@@ -542,6 +662,19 @@ sub get_res {
 	return ($res_x,$res_y);
 }
  
+# Return 1 when pdfsig reports at least one signature in $in_file, 0 otherwise
+# (also 0 when pdfsig cannot be started or prints nothing).
+# pdfsig is started without a shell (forking open + list-form exec), so
+# characters such as ", `, $ or \ in the file name cannot break or inject
+# into the command line.
+sub get_sign {
+        my ($in_file) = @_;
+        my $signed = 0;
+
+        # Forking open: the child's STDOUT becomes the handle read by the parent
+        my $pid = open (my $pdfsig_out, '-|');
+        return 0 if (!defined $pid);
+
+        if ($pid == 0) {
+                # Child: discard diagnostics, as the former "2>/dev/null" did
+                open (STDERR, '>', '/dev/null');
+                # split keeps this working if $PDFSIG ever carries extra options;
+                # a list of two or more elements is never passed through a shell
+                exec (split (' ', $PDFSIG), $in_file) or exit 127;
+        }
+
+        # Read all output (no early return) so the child never gets SIGPIPE
+        while (my $line = <$pdfsig_out>) {
+                $signed = 1 if ( $line =~ /^Signature/ );
+        }
+        close ($pdfsig_out);
+
+        return $signed;
+}
+
 sub is_locked_ex {
     my ($path) = @_;
...	...	@@ -0,0 +1,103 @@
	1	+
	2	+FROM ubuntu:14.04
	3	+
	4	+# Copy the OCR-SERVER project files (ocr script, Ubuntu init script, container entrypoint) into the image
	5	+COPY usr/local/bin/ocr /usr/local/bin/ocr
	6	+COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr
	7	+COPY entrypoint.sh /entrypoint.sh
	8	+
	9	+WORKDIR /tmp
	10	+
	11	+# Install the prerequisite packages for ocr-server 2; the apt package lists are removed in this same layer so they do not stay in the image
	12	+RUN apt-get -y update && \
	13	+ apt-get -y install build-essential cmake libtool yasm pkg-config subversion git libgcj14 apt-utils \
	14	+ curl libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev \
	15	+ zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libnss3-dev \
	16	+ wget cabextract xfonts-utils perl automake autoconf-archive libcurl4-gnutls-dev unzip \
	17	+ libfile-find-rule-perl libfile-find-rule-perl-perl imagemagick gettext unpaper libtiff5 libpng12-0 \
	18	+ libjpeg-turbo8 libpango1.0-0 libcairo2 fontconfig libwebp5 libfontconfig1 libgettextpo0 gcc gcj-jdk \
	19	+ rsyslog libsys-syslog-perl && \
	20	+ apt-get -y clean && rm -rf /var/lib/apt/lists/*
	21	+
	22	+RUN wget -O mscorefonts.deb http://ftp.us.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.4+nmu1_all.deb && \
	23	+ dpkg -i mscorefonts.deb && \
	24	+ rm mscorefonts.deb
	25	+
	26	+# Install Perl modules from CPAN. NOTE(review): File::Touch is requested twice (first and third RUN below)
	27	+RUN perl -MCPAN -e 'install File::Touch'
	28	+RUN perl -MCPAN -e 'install File::Find::Rule;'
	29	+RUN perl -MCPAN -e 'install File::Touch;'
	30	+RUN perl -MCPAN -e 'install Sys::Syslog;'
	31	+RUN perl -MCPAN -e 'install IPC::Open3;'
	32	+RUN perl -MCPAN -e 'install IO::Select;'
	33	+
	34	+# Tesseract-ocr 3.05, with English and Portuguese dictionaries
	35	+# Libraries for Tesseract: Leptonica
	36	+RUN git clone https://github.com/DanBloomberg/leptonica.git && \
	37	+ cd leptonica && ./autobuild && ./configure && make all install && \
	38	+ rm -rf ../leptonica
	39	+
	40	+# Libraries for Tesseract: Libav
	41	+RUN git clone https://github.com/libav/libav.git && \
	42	+ export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ && \
	43	+ cd libav && ./configure --enable-sram && make all install && \
	44	+ rm -rf ../libav
	45	+
	46	+# Tesseract 3.05.01 -- NOTE(review): the clone below names no branch or tag, so the version actually built is whatever the default branch holds
	47	+RUN git clone https://github.com/tesseract-ocr/tesseract.git && \
	48	+ cd tesseract && ./autogen.sh && ./configure && make all install && \
	49	+ rm -rf ../tesseract
	50	+
	51	+RUN wget https://github.com/tesseract-ocr/tessdata/blob/master/eng.traineddata?raw=true -O /usr/local/share/tessdata/eng.traineddata && \
	52	+ wget https://github.com/tesseract-ocr/tessdata/blob/master/por.traineddata?raw=true -O /usr/local/share/tessdata/por.traineddata && \
	53	+ wget https://github.com/tesseract-ocr/tessdata/blob/master/osd.traineddata?raw=true -O /usr/local/share/tessdata/osd.traineddata
	54	+
	55	+# Poppler 0.56
	56	+RUN git clone -b poppler-0.56 https://anongit.freedesktop.org/git/poppler/poppler.git && \
	57	+ cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && \
	58	+ rm -rf ../poppler
	59	+
	60	+# pdftk, version 2.02 or later; the build runs inside pdftk-2.02-dist/pdftk, so the source tree to remove is two levels up
	61	+RUN wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip && \
	62	+ unzip pdftk-2.02-src.zip && rm -f pdftk-2.02-src.zip && \
	63	+ cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && \
	64	+ rm -rf ../../pdftk-2.02-dist
	65	+
	66	+# Ghostscript 9.18 or later. NOTE(review): the ';' after ./autogen.sh means a failure in the steps before it does not stop ./configure from running
	67	+RUN wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz && \
	68	+ tar xvozf ghostscript-9.18.tar.gz && rm -f ghostscript-9.18.tar.gz && \
	69	+ cd ghostscript-9.18 && ls && ./autogen.sh; ./configure && make all install && \
	70	+ rm -rf ../ghostscript-9.18
	71	+
	72	+# CPDF: prebuilt Linux Intel 64-bit binary taken from the cpdf-binaries repository (original note: v 2.2); the clone is removed in the same layer
	73	+RUN git clone https://github.com/coherentgraphics/cpdf-binaries.git && \
	74	+ cp cpdf-binaries/Linux-Intel-64bit/cpdf /usr/bin && rm -rf cpdf-binaries
	75	+
	76	+# Update the ld (dynamic linker) configuration
	77	+RUN ldconfig
	78	+
	79	+RUN useradd -m ocr
	80	+
	81	+RUN chmod +x /usr/local/bin/ocr && \
	82	+ chmod +x /etc/init.d/ocr && \
	83	+ update-rc.d ocr defaults
	84	+
	85	+RUN mkdir /var/ocr-server/ && \
	86	+ mkdir -p /var/ocr-server/Entrada && \
	87	+ mkdir -p /var/ocr-server/Saida && \
	88	+ mkdir -p /var/ocr-server/Originais_Processados && \
	89	+ mkdir -p /var/ocr-server/Erro && \
	90	+ chmod +x /entrypoint.sh
	91	+
	92	+RUN mkdir -p /tmp/ocr_dev/ && \
	93	+ mkdir -p /tmp/ocr_dev/Entrada && \
	94	+ mkdir -p /tmp/ocr_dev/Saida && \
	95	+ mkdir -p /tmp/ocr_dev/Originais_Processados && \
	96	+ mkdir -p /tmp/ocr_dev/Erro && \
	97	+ chmod -R 777 /tmp/ocr_dev
	98	+
	99	+WORKDIR /
	100	+
	101	+VOLUME /var/ocr-server/
	102	+
	103	+CMD ["bash", "/entrypoint.sh"]
0	104	\ No newline at end of file
...	...
...	...	@@ -1,202 +0,0 @@
1		-# OCR Server 1.0.4b - (c) Agencia Nacional de Telecomunicacoees
2		-#
3		-# This script monitors a set of input directories for PDF files
4		-# once a new file is detected, it is processes through tesseract OCR
5		-# in order to generate a new file with a hidden searchable text layer
6		-#
7		-# It may be distributed under the conditions of the LGPL v2.1 license.
8		-#
9		-# Author: Guilherme Chehab
10		-#
11		-# Version History:
12		-# 0.1 Initial single server version
13		-# 0.2 Check if page already has the html hidden layer, if so, ignore it
14		-# 0.3 Solved issues about various image enconding types
15		-# 0.4 Added a postnormalization step to ensure all output pdf pages have
16		-# the same size and orientations as the original files
17		-# 0.5 Used input file renaming as a way to sync multiple parallel instances,
18		-# that way, it is minimized the risk of same file being OCRed multiple times.
19		-# 0.6 Added a default handler for unknown image encoding using jpeg encoding
20		-# 0.7 Solved an issue with files with more than 1000 pages
21		-# 1.0 First release version
22		-# 1.0.1 Solving error when file has no images
23		-# 1.0.2 Fix bug when counting cores for AMD processors
24		-# 1.0.3 Added better image type detection
25		-# 1.0.4 Fix: added ubuntu init script
26		-# 1.0.4b Centos 6.9
27		-#
28		-# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it
29		-# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
30		-# diferently but does not treat it adequately
31		-# - Review poppler and cpdf install instructions
32		-# - Add better handling of vectorized and non scanned pdf files
33		-# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
34		-# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible
35		-#
36		-# Check software requirements on the comments bellow
37		-#
38		-# To configure input dirs change @BASE_DIRS and @SUB_DIRS variables
39		-#
40		-#
41		-# O servidor OCR depende dos seguintes componentes:
42		-# - Perl 5.10.1, com seguintes módulos:
43		-# - File::Find::Rule
44		-# - File::Basename
45		-# - File::Copy
46		-# - File::Path
47		-# - File::Touch
48		-# - Sys::Syslog
49		-# - Sys::Hostname
50		-# - IPC::Open3
51		-# - IO::Select
52		-# - POSIX
53		-# - Tesseract-ocr 3.05, com dicionários inglês e português
54		-# - Pdftk 2.02
55		-# - Poppler-utils 0.42.0
56		-# - Cpdf 2.1
57		-# - ImageMagick 6.7.2-7
58		-#
59		-# Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema
60		-# Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.
61		-# Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.
62		-#
63		-## ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.
64		-#
65		-# Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':
66		-#
67		-# @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script
68		-# @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro
69		-# $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)
70		-# $MAX_PGS: Número máximo de páginas que podem ser processadas simultanemante por arquivo de entrada (default: no. de CPUs)
71		-# Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.
72		-# Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.
73		-# A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.
74		-#
75		-# Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.
76		-#
77		-#
78		-# ----------------------- COMPILAÇÃO dos pré requisitos (obs.: os comandos de devem ser executados como root)
79		-#
80		-#
81		-# Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS
82		-#
83		-# RedHat 6.7 e Centos 6.9:
84		-yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip
85		-yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel
86		-cd /tmp
87		-wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm
88		-rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm
89		-rm -f msttcore-fonts-2.0-3.noarch.rpm
90		-
91		-# Centos 6.9
92		-# \_ autoconf-archive
93		-wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm
94		-rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm
95		-rm autoconf-archive-2012.04.07-7.3.noarch.rpm
96		-# \_ GCC 4.8
97		-wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
98		-yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj
99		-
100		-# Ubuntu 14.04 Server:
101		-apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14
102		-apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev
103		-apt-get install ttf-mscorefonts-installer
104		-
105		-# Ambas plataformas:
106		-cd /usr/local/src
107		-
108		-for i in \
109		- https://github.com/tesseract-ocr/langdata.git \
110		- https://github.com/DanBloomberg/leptonica.git \
111		- https://github.com/libav/libav.git \
112		- https://github.com/tesseract-ocr/tessdata.git \
113		- https://github.com/tesseract-ocr/tesseract.git \
114		- git://git.freedesktop.org/git/poppler/poppler.git \
115		- git://git.freedesktop.org/git/poppler/test.git \
116		- https://github.com/Flameeyes/unpaper.git \
117		- https://github.com/ocaml/ocaml.git \
118		- https://gitlab.camlcity.org/gerd/lib-findlib.git \
119		- https://github.com/johnwhitington/camlpdf.git \
120		- https://github.com/johnwhitington/cpdf-source.git \
121		-; do git clone $i; done
122		-
123		-wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
124		-unzip pdftk-2.02-src.zip
125		-rm -f pdftk-2.02-src.zip
126		-
127		-# pdftk, versão 2.02 ou superior
128		-cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..
129		-
130		-# Centos 6.9
131		-# \_ Cria um novo shell usando o GCC 4.8 por default
132		-scl enable devtoolset-2 bash
133		-
134		-# Tesseract, versão 3.05-dev ou superior
135		-# Bibliotecas para o Tesseract: Leptonica e Libav
136		-cd leptonica && ./autobuild && ./configure && make all install && cd ..
137		-
138		-# Para compilação do Tesseract após a compilação do leptonica
139		-export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/
140		-
141		-cd libav && ./configure --enable-sram && make all install && cd ..
142		-
143		-# Tesseract
144		-cd tesseract && ./autogen.sh && ./configure && make all install && cd ..
145		-cp -avR tessdata/* /usr/local/share/tessdata/
146		-
147		-# cpdf, versão 2.1 ou superior
148		-cd ocaml && ./configure && make world.opt && make install && cd ..
149		-mkdir -p /usr/local/man/man5
150		-# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente
151		-cd lib-findlib && ./configure && make all && make install && cd ..
152		-cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..
153		-cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..
154		-
155		-# poppler-utils, versão 0.42.0 ou superior
156		-cd poppler && ./autogen.sh && ./configure && make all install && cd ..
157		-
158		-# Centos 6.9
159		-# \_ Termina o shell usando o GCC 4.8 por default
160		-exit
161		-
162		-# ----------------------- INSTALAÇÃO (obs.: os comandos de devem ser executados como root)
163		-
164		-## Comandos adicionais para configuração do módulo:
165		-
166		-# Criação do usuário
167		-adduser ocr
168		-
169		-# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional
170		-cp ./usr/local/bin/ocr /usr/local/bin
171		-
172		-# Auto start (RedHat 6.7 e CentOs 6.9)
173		-cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr
174		-mv /etc
175		-chkconfig --add ocr
176		-chkconfig --level 2345 ocr on
177		-
178		-# Auto start (Ubuntu 14.04)
179		-cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr
180		-update-rd.d ocr defaults
181		-
182		-# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações
183		-cd /home/ocr
184		-tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc.d/ocr
185		-su
186		-
187		-# Copie o pacote para os outros servidores e extraia com:
188		-cd /
189		-tar xovzf pkg-ocr.tgz
190		-
191		-# Instalando pré-requisitos RUNTIME em servidores adicionais
192		-
193		-# Redhat 6.7 e CentOS 6.9
194		-yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp
195		-yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext
196		-
197		-# Ubuntu 14.04
198		-apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14
199		-apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0
200		-
201		-# Inicie o serviço com
202		-service ocr start
...	...	@@ -0,0 +1,260 @@
	1	+# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes
	2	+
	3	+This script monitors a set of input directories for PDF files. Once a new file is detected, it is processed through Tesseract OCR in order to generate a new file with a hidden searchable text layer.
	4	+
	5	+It may be distributed under the conditions of the LGPL v2.1 license.
	6	+
	7	+Author: Guilherme Chehab
	8	+
	9	+## Version History:
	10	+ - 0.1
	11	+ - Initial single server version
	12	+ - 0.2
	13	+ - Check if page already has the html hidden layer, if so, ignore it
	14	+ - 0.3
	15	+ - Solved issues about various image encoding types
	16	+ - 0.4
	17	+ - Added a postnormalization step to ensure all output pdf pages have the same size and orientations as the original files
	18	+ - 0.5
	19	+ - Used input file renaming as a way to sync multiple parallel instances, which minimizes the risk of the same file being OCRed multiple times.
	20	+ - 0.6
	21	+ - Added a default handler for unknown image encoding using jpeg encoding
	22	+ - 0.7
	23	+ - Solved an issue with files with more than 1000 pages
	24	+ - 1.0
	25	+ - First release version
	26	+ - 1.0.1 Solving error when file has no images
	27	+ - 1.0.2 Fix bug when counting cores for AMD processors
	28	+ - 1.0.3 Added better image type detection
	29	+ - 1.0.4 Fix: added ubuntu init script
	30	+ - 1.0.4b Add Centos 6.9 install instructions
	31	+ - 2.0
	32	+ - PDF/A output, and better compression with ghostscript
	33	+ - Rewritten image extraction, processing and transformations process
	34	+ - Check if input file is signed, in this case, does not change the file contents
	35	+ - Added '-oem 0' option to tesseract (force legacy mode on tesseract 4)
	36	+ - Use operating system packages by default
	37	+ - Changed paths from external programs, instead of using full paths, uses first match from $PATH
	38	+ - Check existence of external programs on path before running
	39	+ - Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
	40	+ - Fix: create subpaths on error folder
	41	+ - Fix: trying to reduce overhead on temporary folder
	42	+
	43	+## TODO:
	44	+ - Change get_imgs and OCR processing to support pages with more than one image -- this does not work on previous versions, which assumed #pages = #imgs. Version 1.0.1 counts them differently but does not handle them adequately -- this will require better handling of the PDF's internal structure
	45	+ - Review poppler and cpdf install instructions
	46	+ - Add better handling of vectorized and non scanned pdf files
	47	+ - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current scaling, cropping and rotation handlers
	48	+ - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- added function to analyse image color histogram -> just need to add option to convert it to B&W.
	49	+ - Move all parameters to config file
	50	+ - Add some job control web interface
	51	+ - Add end user interface to submit files through web
	52	+ - Add check external programs version requirements before running
	53	+
	54	+## BUGS:
	55	+ - When an image is of type stencil or encoding image, cropping information is lost and the page is displayed differently from the original; this is due to using pdftoppm instead of pdfimages
	56	+
	57	+## Requirements:
	58	+ - Perl 5.10.1, com seguintes módulos:
	59	+ - File::Find::Rule
	60	+ - File::Basename
	61	+ - File::Copy
	62	+ - File::Path
	63	+ - File::Touch
	64	+ - Sys::Syslog
	65	+ - Sys::Hostname
	66	+ - IPC::Open3
	67	+ - IO::Select
	68	+ - POSIX
	69	+ - Tesseract-ocr 3.05, com dicionários inglês e português
	70	+ - Pdftk 2.02
	71	+ - Poppler-utils 0.42.0
	72	+ - Cpdf 2.1
	73	+ - ImageMagick 6.7.2-7
	74	+ - Ghostscript 9.18
	75	+
	76	+Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes pode comprometer o correto funcionamento do sistema.
	77	+
	78	+Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.
	79	+
	80	+Esse arquivo contém informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.
	81	+
	82	+ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.
	83	+
	84	+### Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':
	85	+
	86	+- @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script
	87	+- @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro
	88	+- $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)
	89	+- $MAX_PGS: Número máximo de páginas que podem ser processadas simultaneamente por arquivo de entrada (default: no. de CPUs)
	90	+
	91	+Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.
	92	+
	93	+Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.
	94	+
	95	+A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.
	96	+
	97	+Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.
	98	+
	99	+
	100	+# Container Docker
	101	+
	102	+ O OCR-Server também está disponível como um container Docker, permitindo o rápido provisionamento da solução em ambiente de produção. Todos os procedimentos para construção da imagem do container podem ser encontrados no arquivo Dockerfile.
	103	+
	104	+ Para execução do serviço, basta ter o Docker instalado no servidor e executar o seguinte comando:
	105	+
	106	+ docker run --name <NOME_CONTAINER> -d -v <DIRETORIO_BASE>:/var/ocr-server guilhermeadc/ocr-server
	107	+
	108	+ Onde:
	109	+ --name : Nome atribuído à instância do container. Ex: ocr-server
	110	+ -d : Indica que o container deve ser executado em background
	111	+ -v : Diretório de compartilhamento entre o servidor host e o container.
	112	+ O parâmetro <DIRETORIO_BASE> deve ser substituído pelo diretório base para busca de arquivos.
	113	+
	114	+ Para visualizar os logs de processamento do serviço, basta executar o seguinte comando:
	115	+ docker logs <NOME_CONTAINER>
	116	+
	117	+
	118	+# COMPILAÇÃO dos pré requisitos (obs.: os comandos devem ser executados como root)
	119	+
	120	+Em servidor Ubuntu 16.04, os pacotes padrão (com exceção do CPDF, que não está disponível no repositório oficial)
	121	+são suficientes para executar o aplicativo, não havendo necessidade de compilar todos; essa é, portanto, a arquitetura recomendada.
	122	+
	123	+Quanto ao CPDF, é possível baixar a versão binária em: https://github.com/coherentgraphics/cpdf-binaries
	124	+
	125	+## Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS
	126	+
	127	+ # RedHat 6.7 e Centos 6.9:
	128	+ yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip
	129	+ yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel libcurl-devel nss-devel
	130	+ cd /tmp
	131	+ wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm
	132	+ rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm
	133	+ rm -f msttcore-fonts-2.0-3.noarch.rpm
	134	+
	135	+ # Centos 6.9
	136	+ # \_ autoconf-archive
	137	+ wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm
	138	+ rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm
	139	+ rm autoconf-archive-2012.04.07-7.3.noarch.rpm
	140	+ # \_ GCC 4.8
	141	+ wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
	142	+ yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj
	143	+
	144	+ # Ubuntu 14.04 Server:
	145	+ apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14
	146	+ apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libcurl-dev libnss3-dev
	147	+ apt-get install ttf-mscorefonts-installer
	148	+
	149	+ # Ambas plataformas:
	150	+ cd /usr/local/src
	151	+
	152	+ for i in \
	153	+ https://github.com/tesseract-ocr/langdata.git \
	154	+ https://github.com/DanBloomberg/leptonica.git \
	155	+ https://github.com/libav/libav.git \
	156	+ https://github.com/tesseract-ocr/tessdata.git \
	157	+ https://github.com/tesseract-ocr/tesseract.git \
	158	+ git://git.freedesktop.org/git/poppler/poppler.git \
	159	+ git://git.freedesktop.org/git/poppler/test.git \
	160	+ https://github.com/Flameeyes/unpaper.git \
	161	+ https://github.com/ocaml/ocaml.git \
	162	+ https://gitlab.camlcity.org/gerd/lib-findlib.git \
	163	+ https://github.com/johnwhitington/camlpdf.git \
	164	+ https://github.com/johnwhitington/cpdf-source.git \
	165	+ http://git.ghostscript.com/ghostpdl.git \
	166	+ ; do git clone $i; done
	167	+
	168	+ wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
	169	+ unzip pdftk-2.02-src.zip
	170	+ rm -f pdftk-2.02-src.zip
	171	+
	172	+ # pdftk, versão 2.02 ou superior
	173	+ cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..
	174	+
	175	+ # Ghostscript 9.18 ou superior
	176	+ #wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.21.tar.gz
	177	+ #tar xvozf ghostscript-9.21.tar.gz
	178	+ #rm -f ghostscript-9.21.tar.gz
	179	+ #cd ghostscript-9.21
	180	+ cd ghostpdl
	181	+ ./autogen.sh; ./configure
	182	+ make all install
	183	+ cd ..
	184	+
	185	+ # Centos 6.9
	186	+ # \_ Cria um novo shell usando o GCC 4.8 por default
	187	+ scl enable devtoolset-2 bash
	188	+
	189	+ # Tesseract, versão 3.05-dev ou superior
	190	+ # Bibliotecas para o Tesseract: Leptonica e Libav
	191	+ cd leptonica && ./autobuild && ./configure && make all install && cd ..
	192	+
	193	+ # Para compilação do Tesseract após a compilação do leptonica
	194	+ export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/
	195	+
	196	+ cd libav && ./configure --enable-sram && make all install && cd ..
	197	+
	198	+ # Tesseract
	199	+ cd tesseract && ./autogen.sh && ./configure && make all install && cd ..
	200	+ cp -avR tessdata/* /usr/local/share/tessdata/
	201	+
	202	+ # cpdf, versão 2.1 ou superior
	203	+ cd ocaml && ./configure && make world.opt && make install && cd ..
	204	+ mkdir -p /usr/local/man/man5
	205	+ # lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente
	206	+ cd lib-findlib && ./configure && make all && make install && cd ..
	207	+ cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..
	208	+ cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..
	209	+
	210	+ # poppler-utils, versão 0.42.0 ou superior
	211	+ cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && cd ..
	212	+
	213	+ # Centos 6.9
	214	+ # \_ Termina o shell usando o GCC 4.8 por default
	215	+ exit
	216	+
	217	+
	218	+## Comandos adicionais para configuração do módulo:
	219	+
	220	+ # Criação do usuário
	221	+ adduser ocr
	222	+
	223	+ # Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional
	224	+ cp ./usr/local/bin/ocr /usr/local/bin
	225	+
	226	+ # Auto start (RedHat 6.7 e CentOs 6.9)
	227	+ cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr
	228	+ mv /etc
	229	+ chkconfig --add ocr
	230	+ chkconfig --level 2345 ocr on
	231	+
	232	+ # Auto start (Ubuntu 14.04)
	233	+ cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr
	234	+ update-rc.d ocr defaults
	235	+
	236	+ # Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações
	237	+ cd /home/ocr
	238	+ tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc.d/ocr
	239	+ su
	240	+
	241	+# INSTALAÇÃO (obs.: os comandos devem ser executados como root)
	242	+ # Criação do usuário
	243	+ adduser ocr
	244	+
	245	+ # Copie o pacote para os outros servidores e extraia com:
	246	+ cd /
	247	+ tar xovzf pkg-ocr.tgz
	248	+
	249	+ # Instalando pré-requisitos RUNTIME em servidores adicionais
	250	+
	251	+ # Redhat 6.7 e CentOS 6.9
	252	+ yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp ghostscript
	253	+ yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext
	254	+
	255	+ # Ubuntu 14.04
	256	+ apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14
	257	+ apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 ghostscript
	258	+
	259	+# Inicie o serviço com
	260	+ service ocr start
...	...
...	...	@@ -0,0 +1,17 @@
	1	+#!/usr/bin/env bash
	2	+
	3	+# Inicializa serviço de log
	4	+/etc/init.d/rsyslog start
	5	+
	6	+# Cria estrutura de pastas para monitoramento de arquivos
	7	+mkdir -p /var/ocr-server/
	8	+mkdir -p /var/ocr-server/Entrada
	9	+mkdir -p /var/ocr-server/Saida
	10	+mkdir -p /var/ocr-server/Originais_Processados
	11	+mkdir -p /var/ocr-server/Erro
	12	+chmod -R 777 /var/ocr-server
	13	+
	14	+# Iniciar serviço do OCR-Server
	15	+service ocr start
	16	+
	17	+tail -f /var/log/syslog
0	18	\ No newline at end of file
...	...