From d2b74559c9a3595de14ad8ce6ba843d92f505213 Mon Sep 17 00:00:00 2001
From: Nei Jobson <neijobson@anatel.gov.br>
Date: Fri, 30 Jun 2017 19:45:35 -0300
Subject: [PATCH] Pré versão 2.0 a ser liberada - PARA O CANTONI INCLUIR O Container Docker

---
 INSTALL.txt       | 202 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 README.md         | 240 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 usr/local/bin/ocr | 269 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------------------------------
 workflow.pdf      | Bin 309369 -> 0 bytes
 workflow.vsd      | Bin 169472 -> 0 bytes
 5 files changed, 440 insertions(+), 271 deletions(-)
 delete mode 100644 INSTALL.txt
 create mode 100644 README.md

diff --git a/INSTALL.txt b/INSTALL.txt
deleted file mode 100644
index 2177136..0000000
--- a/INSTALL.txt
+++ /dev/null
@@ -1,202 +0,0 @@
-#	OCR Server 1.0.4b - (c) Agencia Nacional de Telecomunicacoees
-#
-#	This script monitors a set of input directories for PDF files
-#	once a new file is detected, it is processes through tesseract OCR
-#	in order to generate a new file with a hidden searchable text layer
-#
-#	It may be distributed under the conditions of the LGPL v2.1 license.
-#
-#	Author: Guilherme Chehab 
-#
-#	Version History:
-#	0.1	Initial single server version
-#	0.2	Check if page already has the html hidden layer, if so, ignore it
-#	0.3	Solved issues about various image enconding types
-#	0.4	Added a postnormalization step to ensure all output pdf pages have
-#		the same size and orientations as the original files
-#	0.5	Used input file renaming as a way to sync multiple parallel instances,
-#		that way, it is minimized the risk of same file being OCRed multiple times.
-#	0.6	Added a default handler for unknown image encoding using jpeg encoding	
-#	0.7	Solved an issue with files with more than 1000 pages
-#	1.0	First release version
-#	1.0.1	Solving error when file has no images
-#       1.0.2   Fix bug when counting cores for AMD processors
-#       1.0.3   Added better image type detection
-#       1.0.4   Fix: added ubuntu init script
-#       1.0.4b  Centos 6.9
-#
-#       TODO:   - Changes get_imgs and OCR processing to enable pages with more than one image -- it
-#               would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
-#               diferently but does not treat it adequately
-#               - Review poppler and cpdf install instructions
-#               - Add better handling of vectorized and non scanned pdf files
-#               - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
-#               - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible
-#
-#	Check software requirements on the comments bellow
-#
-#	To configure input dirs change @BASE_DIRS and @SUB_DIRS variables
-#
-#
-# O servidor OCR depende dos seguintes componentes:
-# - Perl 5.10.1, com seguintes módulos:
-#	- File::Find::Rule
-#	- File::Basename
-#	- File::Copy
-#	- File::Path
-#	- File::Touch
-#	- Sys::Syslog
-#	- Sys::Hostname
-#	- IPC::Open3
-#	- IO::Select
-#	- POSIX
-# - Tesseract-ocr 3.05, com dicionários inglês e português
-# - Pdftk 2.02
-# - Poppler-utils 0.42.0
-# - Cpdf 2.1
-# - ImageMagick 6.7.2-7
-#	
-# Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema
-# Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.
-# Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.
-#
-## ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.
-#
-# Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':
-#
-#	    @BASE_DIRS:	Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script 
-#	    @SUB_DIRS:		Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro
-#	    $MAX_FILES:	Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)
-#	    $MAX_PGS:		Número máximo de páginas que podem ser processadas simultanemante por arquivo de entrada (default: no. de CPUs)
-#	 Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.
-#	 Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.
-#	 A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.
-#
-#	 Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.
-#
-#
-# -----------------------  COMPILAÇÃO dos pré requisitos (obs.: os comandos de devem ser executados como root)
-#
-#
-# Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS 
-#
-# RedHat 6.7 e Centos 6.9:
-yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip
-yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel
-cd /tmp
-wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm
-rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm
-rm -f msttcore-fonts-2.0-3.noarch.rpm
-
-# Centos 6.9
-#   \_ autoconf-archive
-wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm
-rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm
-rm autoconf-archive-2012.04.07-7.3.noarch.rpm
-#   \_ GCC 4.8
-wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
-yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj
-
-# Ubuntu 14.04 Server:
-apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 
-apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev
-apt-get install ttf-mscorefonts-installer
-
-# Ambas plataformas:
-cd /usr/local/src
-
-for i in \
-	https://github.com/tesseract-ocr/langdata.git \
-	https://github.com/DanBloomberg/leptonica.git \
-	https://github.com/libav/libav.git  \
-	https://github.com/tesseract-ocr/tessdata.git \
-	https://github.com/tesseract-ocr/tesseract.git \
-	git://git.freedesktop.org/git/poppler/poppler.git \
-	git://git.freedesktop.org/git/poppler/test.git \
-	https://github.com/Flameeyes/unpaper.git \
-	https://github.com/ocaml/ocaml.git \
-	https://gitlab.camlcity.org/gerd/lib-findlib.git \
-	https://github.com/johnwhitington/camlpdf.git \
-	https://github.com/johnwhitington/cpdf-source.git \
-; do git clone $i; done
-
-wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
-unzip pdftk-2.02-src.zip
-rm -f pdftk-2.02-src.zip
-
-# pdftk, versão 2.02 ou superior
-cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..
-
-# Centos 6.9
-#   \_ Cria um novo shell usando o GCC 4.8 por default
-scl enable devtoolset-2 bash  
-
-# Tesseract, versão 3.05-dev ou superior
-# Bibliotecas para o Tesseract: Leptonica e Libav
-cd leptonica && ./autobuild && ./configure && make all install && cd ..
-
-# Para compilação do Tesseract após a compilação do leptonica
-export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/
-
-cd libav && ./configure --enable-sram && make all install && cd ..
-
-# Tesseract
-cd tesseract && ./autogen.sh && ./configure && make all install && cd ..
-cp -avR tessdata/* /usr/local/share/tessdata/
-
-# cpdf, versão 2.1 ou superior
-cd ocaml && ./configure && make world.opt && make install && cd ..
-mkdir -p /usr/local/man/man5
-# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente 
-cd lib-findlib  && ./configure && make all && make install && cd ..
-cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..
-cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..
-
-# poppler-utils, versão 0.42.0 ou superior
-cd poppler && ./autogen.sh && ./configure && make  all install && cd ..
-
-# Centos 6.9
-#   \_ Termina o shell usando o GCC 4.8 por default
-exit
-
-# -----------------------  INSTALAÇÃO (obs.: os comandos de devem ser executados como root)
-
-## Comandos adicionais para configuração do módulo:
-	
-# Criação do usuário
-adduser ocr
-
-# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional
-cp ./usr/local/bin/ocr /usr/local/bin
-
-# Auto start (RedHat 6.7 e CentOs 6.9)
-cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr 
-mv /etc
-chkconfig --add ocr
-chkconfig --level 2345 ocr on
-
-# Auto start (Ubuntu 14.04)
-cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr
-update-rd.d ocr defaults
-
-# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações
-cd /home/ocr
-tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr
-su
-
-# Copie o pacote para os outros servidores e extraia com:
-cd /
-tar xovzf pkg-ocr.tgz
-
-# Instalando pré-requisitos RUNTIME em servidores adicionais
-
-# Redhat 6.7 e CentOS 6.9
-yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp
-yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext 
-
-# Ubuntu 14.04
-apt-get install  libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 
-apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0
-
-# Inicie o serviço com
-service ocr start
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ceb52ee
--- /dev/null
+++ b/README.md
@@ -0,0 +1,240 @@
+#	OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes
+
+This script monitors a set of input directories for PDF files. Once a new file is detected, it is processed through Tesseract OCR in order to generate a new file with a hidden searchable text layer.
+
+It may be distributed under the conditions of the LGPL v2.1 license.
+
+Author: Guilherme Chehab 
+
+##	Version History:
+ - 0.1
+ 	- Initial single server version
+ - 0.2
+ 	- Check if page already has the html hidden layer, if so, ignore it
+ - 0.3
+ 	- Solved issues with various image encoding types
+ - 0.4
+ 	- Added a postnormalization step to ensure all output pdf pages have the same size and orientations as the original files
+ - 0.5
+ 	- Used input file renaming as a way to sync multiple parallel instances, minimizing the risk of the same file being OCRed multiple times.
+ - 0.6
+ 	- Added a default handler for unknown image encoding using jpeg encoding
+ - 0.7
+ 	- Solved an issue with files with more than 1000 pages
+ - 1.0
+ 	- First release version
+ 	- 1.0.1   Solving error when file has no images
+ 	- 1.0.2   Fix bug when counting cores for AMD processors
+ 	- 1.0.3   Added better image type detection
+ 	- 1.0.4   Fix: added ubuntu init script
+ 	- 1.0.4b  Add Centos 6.9 install instructions
+ - 2.0
+ 	- PDF/A output, and better compression with ghostscript
+	- Rewritten image extraction, processing and transformation process
+	- Check if input file is signed; in that case, the file contents are left unchanged
+	- Added '--oem 0' option to tesseract (force legacy mode on tesseract 4)
+	- Use operating system packages by default
+	- Changed paths from external programs, instead of using full paths, uses first match from $PATH
+	- Check existence of external programs on path before running
+	- Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
+	- Fix: create subpaths on error folder
+	- Fix: trying to reduce overhead on temporary folder
+	- TODO:
+		- Change get_imgs and OCR processing to support pages with more than one image -- this did not work on previous versions, which assumed #pages = #imgs. Version 1.0.1 counts them differently but does not handle them adequately -- this will require better handling of the PDF's internal structure
+		- Review poppler and cpdf install instructions
+		- Add better handling of vectorized and non scanned pdf files
+		- Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current scaling, cropping and rotation handlers
+		- Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- added function to analyse image color histogram -> just need to add option to convert it to B&W.
+		- Move all parameters to config file
+		- Add some job control web interface
+		- Add end user interface to submit files through web
+		- Add check external programs version requirements before running
+	- BUGS:
+		- When the image is of type stencil or encoding image, cropping information is lost and the page is shown differently from the original; this is due to using pdftoppm instead of pdfimages
+
+##	Check software requirements in the comments below
+
+To configure input dirs, change the @BASE_DIRS and %SUB_DIRS variables
+
+### O servidor OCR depende dos seguintes componentes:
+ - Perl 5.10.1, com seguintes módulos:
+	- File::Find::Rule
+	- File::Basename
+	- File::Copy
+	- File::Path
+	- File::Touch
+	- Sys::Syslog
+	- Sys::Hostname
+	- IPC::Open3
+	- IO::Select
+	- POSIX
+ - Tesseract-ocr 3.05, com dicionários inglês e português
+ - Pdftk 2.02
+ - Poppler-utils 0.42.0
+ - Cpdf 2.1
+ - ImageMagick 6.7.2-7
+ - Ghostscript 9.18
+	
+Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes pode comprometer o correto funcionamento do sistema.
+
+Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.
+
+Esse arquivo contém informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.
+
+ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.
+
+### Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':
+
+- @BASE_DIRS:	Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script 
+- %SUB_DIRS:		Subdiretórios de entrada, saída, backup dos arquivos originais, temporário e de arquivos com erro
+- $MAX_FILES:	Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)
+- $MAX_PGS:		Número máximo de páginas que podem ser processadas simultaneamente por arquivo de entrada (default: no. de CPUs)
+
+Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.
+
+Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.
+
+A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.
+
+Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.
+
+# COMPILAÇÃO dos pré requisitos (obs.: os comandos devem ser executados como root)
+
+Em servidor Ubuntu 16.04, os pacotes padrão (com exceção do CPDF, que não tem no repositório oficial) 
+são suficientes para executar o aplicativo, não havendo necessidade de compilar todos, assim é a arquitetura recomendada
+
+Quanto ao CPDF, é possível baixar a versão binária em: https://github.com/coherentgraphics/cpdf-binaries
+
+## Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS 
+
+### RedHat 6.7 e Centos 6.9:
+	yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip
+	yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel libcurl-devel nss-devel
+	cd /tmp
+	wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm
+	rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm
+	rm -f msttcore-fonts-2.0-3.noarch.rpm
+
+### Centos 6.9
+#### autoconf-archive
+	wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm
+	rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm
+	rm autoconf-archive-2012.04.07-7.3.noarch.rpm
+#### GCC 4.8
+	wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
+	yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj
+
+### Ubuntu 14.04 Server:
+	apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 
+	apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libcurl-dev  libnss3-dev
+	apt-get install ttf-mscorefonts-installer
+
+### Ambas as plataformas:
+	cd /usr/local/src
+
+	for i in \
+		https://github.com/tesseract-ocr/langdata.git \
+		https://github.com/DanBloomberg/leptonica.git \
+		https://github.com/libav/libav.git  \
+		https://github.com/tesseract-ocr/tessdata.git \
+		https://github.com/tesseract-ocr/tesseract.git \
+		git://git.freedesktop.org/git/poppler/poppler.git \
+		git://git.freedesktop.org/git/poppler/test.git \
+		https://github.com/Flameeyes/unpaper.git \
+		https://github.com/ocaml/ocaml.git \
+		https://gitlab.camlcity.org/gerd/lib-findlib.git \
+		https://github.com/johnwhitington/camlpdf.git \
+		https://github.com/johnwhitington/cpdf-source.git \
+		http://git.ghostscript.com/ghostpdl.git \
+	; do git clone $i; done
+
+	wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
+	unzip pdftk-2.02-src.zip
+	rm -f pdftk-2.02-src.zip
+
+# pdftk, versão 2.02 ou superior
+cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..
+
+# Ghostscript 9.18 ou superior
+#wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz
+#tar xvozf ghostscript-9.21.tar.gz
+#rm -f ghostscript-9.21.tar.gz
+#cd ghostscript-9.21
+cd ghostpdl
+./autogen.sh; ./configure
+make all install
+cd ..
+
+# Centos 6.9
+#   \_ Cria um novo shell usando o GCC 4.8 por default
+scl enable devtoolset-2 bash  
+
+# Tesseract, versão 3.05-dev ou superior
+# Bibliotecas para o Tesseract: Leptonica e Libav
+cd leptonica && ./autobuild && ./configure && make all install && cd ..
+
+# Para compilação do Tesseract após a compilação do leptonica
+export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/
+
+cd libav && ./configure --enable-sram && make all install && cd ..
+
+# Tesseract
+cd tesseract && ./autogen.sh && ./configure && make all install && cd ..
+cp -avR tessdata/* /usr/local/share/tessdata/
+
+# cpdf, versão 2.1 ou superior
+cd ocaml && ./configure && make world.opt && make install && cd ..
+mkdir -p /usr/local/man/man5
+# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente 
+	cd lib-findlib  && ./configure && make all && make install && cd ..
+	cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..
+	cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..
+
+# poppler-utils, versão 0.42.0 ou superior
+cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make  all install && cd ..
+
+# Centos 6.9
+#   \_ Termina o shell usando o GCC 4.8 por default
+exit
+
+# -----------------------  INSTALAÇÃO (obs.: os comandos devem ser executados como root)
+
+## Comandos adicionais para configuração do módulo:
+	
+# Criação do usuário
+adduser ocr
+
+# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional
+cp ./usr/local/bin/ocr /usr/local/bin
+
+# Auto start (RedHat 6.7 e CentOs 6.9)
+cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr 
+mv /etc
+chkconfig --add ocr
+chkconfig --level 2345 ocr on
+
+# Auto start (Ubuntu 14.04)
+cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr
+update-rc.d ocr defaults
+
+# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações
+	cd /home/ocr
+	tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr
+	su
+
+# Copie o pacote para os outros servidores e extraia com:
+cd /
+tar xovzf pkg-ocr.tgz
+
+# Instalando pré-requisitos RUNTIME em servidores adicionais
+
+# Redhat 6.7 e CentOS 6.9
+yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp ghostscript
+yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext 
+
+# Ubuntu 14.04
+apt-get install  libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 
+apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 ghostscript
+
+# Inicie o serviço com
+service ocr start
diff --git a/usr/local/bin/ocr b/usr/local/bin/ocr
index 2af3d31..edff34c 100644
--- a/usr/local/bin/ocr
+++ b/usr/local/bin/ocr
@@ -1,6 +1,6 @@
-#! /usr/bin/perl -w
+#!/usr/bin/perl -w
 #
-#	OCR Server 1.0.4 - (c) Agencia Nacional de Telecomunicacoes
+#	OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes
 #
 #	This script monitors a set of input directories for PDF files
 #	once a new file is detected, it is processes through tesseract OCR
@@ -24,15 +24,38 @@
 #	1.0.1	Solving error when file has no images
 #	1.0.2	Fix bug when counting cores for AMD processors
 #	1.0.3	Added better image type detection	
-#   1.0.4   Fix: added ubuntu init script
+#       1.0.4   Fix: added ubuntu init script
+#	1.0.4b	Add Centos 6.9 install instructions
+#	2.0	PDF/A output, and better compression with ghostscript --> for this to work, Tesseract 4.0 is 
+#		strongly recommended
+#		Rewritten image extraction, processing and transformation process
+#		Check if input file is signed, in this case, does not change the file contents
+#		Added '--oem 0' option to tesseract (force legacy mode on tesseract 4)
+#		Use operating system packages by default
+#		Changed paths from external programs, instead of using full paths, uses first match from $PATH
+#		Check existence of external programs on path before running
+#		Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
+#		Fix: create subpaths on error folder
+#		Fix: trying to reduce overhead on temporary folder
 #
 #	TODO: 	- Changes get_imgs and OCR processing to enable pages with more than one image -- it
-#		would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
-#		diferently but does not treat it adequately
+#		would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them
+#		differently but does not handle it adequately -- will require better handling of the PDF's internal structure
 #		- Review poppler and cpdf install instructions
 #		- Add better handling of vectorized and non scanned pdf files
-#		- Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
-#		- Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible
+#		- Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current 
+#		scaling, cropping and rotation handlers
+#		- Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- 
+#		added function to analyse image color histogram -> just need to add option to convert it to B&W.
+#		- Move all parameters to config file
+#		- Add some job control web interface
+#		- Add end user interface to submit files through web
+#		- Add check external programs version requirements before running
+#
+#	BUGS:	- When image is of type stencil or encoding image, cropping information is lost, and page is shown different than
+#		original, this is due to using pdftoppm instead of pdfimages 
+#		- Although not strictly a BUG, in the new version the addition of a step to convert to PDF/A and other changes
+#		significantly increased the time to OCR a page, from a mean of 1 sec/page to 3 secs/page on a 16-core server
 #
 #	Check software requirements on the comments bellow
 #
@@ -54,8 +77,8 @@ use Sys::Hostname;
 use IPC::Open3;
 use IO::Select;
 
-my $DEBUG = 0;
-my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo  | grep -e '^processor' | wc -l`);
+my $DEBUG = 2;
+my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo  | grep -e '^processor' | wc -l`);
 my $MAX_FILES = ( !$DEBUG ? 2 : 1) ;
 
 my $USER = 'ocr';
@@ -63,23 +86,28 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca
 
 # Command dependencies
 
-# depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher
-my $TESSERACT = '/usr/local/bin/tesseract -l por+eng';
+# Depends on tesseract-ocr and tesseract-ocr-por 3.05-dev or higher -- for PDF/A output, Tesseract 4.0 is recommended
+my $TESSERACT = 'tesseract --oem 0'; 		# if Tesseract => 4.0
+#my $TESSERACT = 'tesseract';			# if Tesseract < 4.0
 
 # Depends on pdftk 2.02 or higher
-my $PDFTK = '/usr/local/bin/pdftk';
+my $PDFTK = 'pdftk';
 
 # Depends on poppler-utils 0.42.0 or higher
-#my $PDINFO = '/usr/local/bin/pdfinfo';
-my $PDFFONTS = '/usr/local/bin/pdffonts';
-my $PDFIMAGES = '/usr/local/bin/pdfimages';
-my $PDFTOPPM = '/usr/local/bin/pdftoppm';
+my $PDFFONTS = 'pdffonts';
+my $PDFIMAGES = 'pdfimages';
+my $PDFTOPPM = 'pdftoppm';
+my $PDFUNITE = 'pdfunite';
+my $PDFSIG = 'pdfsig';
 
 # Depends on cpdf 2.1 or higher
-my $CPDF = '/usr/local/bin/cpdf';
+my $CPDF = 'cpdf';
+
+# Depends on Ghostscript 9.18
+my $GS = 'gs';
 
 ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner
-my $CONVERT = '/usr/bin/convert';
+my $CONVERT = 'convert';
 
 # If it is needed further filtering
 #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 ';
@@ -90,12 +118,14 @@ my @BASE_DIRS = (	'/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/',
 my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' );
 
 @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2);
-%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG);
+%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG==2);
 
 # Safeguard im case of cpuinfo has not identified correctly the number of CPUs 
 $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS;
 
-$ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin';
+$ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin';
+$ENV{'IFS'} = '\t\n';
+
 my ($host) = split/\./,hostname;
 
 use vars qw/*name *dir *prune/;
@@ -107,14 +137,15 @@ sub main;
 sub get_pages;
 sub get_rotation;
 sub get_res;
-sub is_ocred;
 sub is_locked_ex;
 
 
 my $expr = 'use POSIX qw(setsid)';
 
 my ($dumb1, $dumb2, $uid) = getpwnam ($USER);
-setuid ($uid) or warn "Cant set uid $uid";
+if (defined $uid) {
+	setuid ($uid) or warn "Cant set uid $uid";
+}
 
 $SIG{__DIE__}  = 'DEFAULT';
 $SIG{__WARN__} = \&die_when_called;
@@ -126,6 +157,11 @@ if ($@) {
 chdir('/') or die "$0: cannot chdir '/': $!\n";
 open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n";
 
+foreach my $exec ( $TESSERACT, $PDFTK, $PDFFONTS, $PDFIMAGES, $PDFSIG, $CPDF, $GS, $CONVERT) {
+	die "Error: $exec not found on path: $ENV{PATH}, check dependencies\n" if ( `which $exec | wc -l ` == 0);
+}
+
+
 foreach my $DIR (@BASE_DIRS) {
 
     defined(my $pid = fork) or die "$0: cannot fork: $!\n";
@@ -135,7 +171,7 @@ foreach my $DIR (@BASE_DIRS) {
 	main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR}); 
 	exit 0;
 	last;
-    } 
+    }
 }
 
 exit 0;
@@ -157,7 +193,7 @@ sub main {
 	#  remove .tmp file
 	unlink ( find ( file => name =>  qr/\.${host}\.tmp$/i , in => ${IN} ) );
 
-	# Rename files that were in 'processig' back
+	# Rename files that were in 'processing' state back
 	foreach my $file ( find ( file => name =>  qr/\.${host}\.processing$/i , in => ${IN} ) ) {
 		my $old_name = $file;
 		$old_name =~ s/\.${host}\.processing$//g;
@@ -177,12 +213,14 @@ sub main {
 	# Main loop
 	while ( 1 ) {	
 		select (undef, undef, undef, rand 3); 	# Random sleep so multiple instances dont get synced
+
 		$files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name =>  qr/\.pdf$/i , in => ${IN} ));
 		print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in;
 		$count = scalar keys %files_in;
-		foreach my $file (keys %files_in) {
 
-			next if ( glob ("$file.*.tmp")); 		
+		foreach my $file (sort { ((-f $a) ? (stat $a)[9] : 0) <=> ((-f $b) ? (stat $b)[9] : 0)} keys %files_in ) {
+
+			next if ( () = glob ("\"$file.*.tmp\"") );	# list context on purpose: a scalar-context glob keeps its iterator between calls and would mis-test the next file
 
 			select (undef, undef, undef, 1 + rand 2);	# sleep between 1 and 3 seconds
 			next if (!defined $files_in{$file});	# continue only if it is still valid
@@ -255,7 +293,7 @@ sub ocr {
 		remove_tree ($tmpdir,{ error=> \my $dumb });
 		unlink ("$in_file.$host.tmp");
 		move ( "$in_file.$host.processing", $in_file);
-		exit 0;
+		exit 1;
 	}; 
 
 	my $out_path = $in_path;
@@ -271,7 +309,7 @@ sub ocr {
 	my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: "");
 
 	print "\twritting to $out_file\n" if $DEBUG;
-				
+	
 	my $stime = time;
 	my %pids;
 
@@ -291,8 +329,26 @@ sub ocr {
 		remove_tree ($tmpdir,{ error=> \my $dumb });
 		unlink ("$in_file.$host.tmp");
                 move ( "$in_file.$host.processing", $in_file);
+		print "Error: cannot copy $in_file to temp dir \n" if $DEBUG;
+		syslog ("err","cannot copy $in_file to temp dir") if !$DEBUG;	# "err" is the level name the other failure paths in this sub use
+		exit 1;
 	};
 
+	# Signed PDFs are not OCRed: the original is archived and delivered unchanged
+	if (get_sign($tmp_file)) {
+		if (!copy ("$in_file.$host.processing", $proc_file)) {	# archive copy failed: requeue the input and report failure instead of falling through
+			remove_tree ($tmpdir,{ error=> \my $dumb });
+			unlink ("$in_file.$host.tmp");
+			move ( "$in_file.$host.processing", $in_file);
+			exit 1;
+		};
+		move ("$in_file.$host.processing", $out_file);	# NOTE(review): $out_path is only created further down via make_path — confirm it exists here
+		unlink ("$in_file.$host.tmp");
+		if (!$DEBUG) { unlink ($tmp_file); remove_tree ($tmpdir,{ error=> \my $dumb }); }	# same temp cleanup the normal path performs
+		print "OCR processed: $in_file not OCRed due to having a signature within\n" if $DEBUG;
+		syslog ("info","OCR processed: $in_file not OCRed due to having a signature within") if !$DEBUG;
+		exit 0;
+	}
 	# Extract pages
 	($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf");
         if ($DEBUG) {
@@ -301,12 +357,13 @@ sub ocr {
                 print "\t\t\t$_" for @err ;
         };
 
+	my ($pages, @pg_w, @pg_h, @pg_r,  @pg_crop_x1, @pg_crop_y1, @pg_crop_x2, @pg_crop_y2);
+	$pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r, \@pg_crop_x1, \@pg_crop_y1, \@pg_crop_x2, \@pg_crop_y2);
 
-	my ($pages, @pg_w, @pg_h, @pg_r);
-	$pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r);
+	my ($imgs,@page_img,  @img_w, @img_h, @img_t, @img_xppi, @img_yppi);
+	$imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t, \@img_xppi, \@img_yppi);
 
-	my ($imgs,@page_img,  @img_w, @img_h, @img_t);
-	$imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t);
+	unlink ($tmp_file) if (!$DEBUG);
 
 	for ( my $i=0; $i< $pages; $i++ ) {
 		my $pg = sprintf ("pg_%06d", $i+1);
@@ -333,25 +390,29 @@ sub ocr {
 			if (! defined $img_t[$i] ) {
 				move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf");
 				print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG;
-				exit 0;
+				exit -1;
 			}
 
-			print "\t\t${in_file}: ".(${i}+1)." / $pages:  $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i]\n" if $DEBUG;
+			print "\t\t${in_file}: ".(${i}+1)." / $pages:  $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i] " if $DEBUG;
+			print "(cropbox: $pg_crop_x1[$i] x $pg_crop_y1[$i] - $pg_crop_x2[$i] x $pg_crop_y2[$i])\n" if (defined $pg_crop_x1[$i] && $DEBUG);
+			print "\n" if ($DEBUG);
 
+			# Extract the page image: pdfimages -j by default (rgb), pdftoppm TIFF for stencil, pdfimages TIFF for gray, pdftoppm JPEG for any other type
 			undef $cmd;
 
-			if ($img_t[$i] eq "gray") {
-				$cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
+			# Use PDFIMAGES and JPEG by default
+			$cmd = "${PDFIMAGES} -j \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
+
+			if ($img_t[$i] eq "stencil") {
+				$cmd = "${PDFTOPPM} -tiff -tiffcompression deflate -scale-to-x $img_w[$i] -scale-to-y $img_h[$i]  \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
 			}
 
-			if ($img_t[$i] eq "rgb") {
-				$cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i]  \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
-				$pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM
+			if ($img_t[$i] eq "gray") {
+				$cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
 			}
 
-			if (!defined $cmd) {
-				$cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i]  \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
-				$pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM
+			if ($img_t[$i] !~ /gray|rgb|stencil/) {
+				$cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
 			}
 		
 			($exit,$cmd,@out,@err) = exec_cmd($cmd);
@@ -362,7 +423,13 @@ sub ocr {
                         };
 
 			# Process each resulting image for page pdf
-			my @images = ( find ( file => name =>  qr/${pg}.*\.(jpg|tif)/i , in => ${tmpdir} )) ;
+			my @images = ( find ( file => name =>  qr/${pg}.*\.(jpg|tif|tiff|jpeg|jp2|jb2|png)/i , in => ${tmpdir} )) ;
+
+			if (scalar @images == 0)  {
+				move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf");
+				print "\t\t${in_file}: ".(${i}+1)." / $pages: Page was not exported as a tesseract supported format -- not OCRing\n" if $DEBUG;
+				exit 0;
+			}
 
 			foreach my $image (@images) { 
 				print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG;
@@ -378,43 +445,65 @@ sub ocr {
 						print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n";
 					}
 				}
-	
-				# Check if page was rotated
-				if ($pg_r[$i]) {
-					print "\t\t\t${image} unrotate: $pg_r[$i] graus ".(${i}+1)." / $pages\n" if $DEBUG;
-					($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate $pg_r[$i] \"$image\"");
+
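+				# Check if page was rotated and extracted with pdftoppm. NOTE(review): $cmd is reassigned by every exec_cmd call in this loop, so confirm it still holds the extraction command here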
+				if ($cmd =~ /\Q$PDFTOPPM/ && $pg_r[$i]) {
+					print "\t\t\t${image} unrotate: -$pg_r[$i] degs ".(${i}+1)." / $pages\n" if $DEBUG;
+					($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate ". (360 - $pg_r[$i])." \"$image\"");
 					if ($DEBUG) { 
 						print "\t\t\t${image} -> $cmd: $exit\n";
 						print "\t\t\t\t$_" for @out ;
 						print "\t\t\t\t$_" for @err ;
 					};
 				}
-
+	
 				# Filter ppm images, if needed
 
 				# OCR ppm images to pdf pages
-				($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} \"${image}\" \"${image}\" pdf");
+				($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} -l por+eng \"${image}\" \"${image}\" pdf");
 				if ($DEBUG) { 
 					print "\t\t\t${image} -> $cmd: $exit\n";
 					print "\t\t\t\t$_" for @out ;
 					print "\t\t\t\t$_" for @err ;
 				};
+				unlink ("$image") if (!$DEBUG);
 
-				# Scale to fit pdf
-				($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf");
+				# Scale, crop and rotate to fit pdf
+				($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\"  \"${image}\".pdf -o \"${image}\"-cpdf.pdf");
 				if ($DEBUG) { 
 					print "\t\t\t${image} -> $cmd: $exit\n";
 					print "\t\t\t\t$_" for @out ;
 					print "\t\t\t\t$_" for @err ;
 				};
+				unlink ("$image.pdf") if (!$DEBUG);
 
+				if (defined $pg_crop_x1[$i]) {
+					# cpdf -crop takes "x y width height"; compute it into locals so the stored corner coordinates stay intact for the next image of this page
+					my ($crop_x, $crop_y, $crop_w, $crop_h) = (
+						($pg_crop_x1[$i]<$pg_crop_x2[$i]?$pg_crop_x1[$i]:$pg_crop_x2[$i]),
+						($pg_crop_y1[$i]<$pg_crop_y2[$i]?$pg_crop_y1[$i]:$pg_crop_y2[$i]),
+						abs($pg_crop_x2[$i]-$pg_crop_x1[$i]),abs($pg_crop_y2[$i]- $pg_crop_y1[$i])
+					);
+
+					($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -crop \"$crop_x $crop_y $crop_w $crop_h\" \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf");
+					if ($DEBUG) {
+						print "\t\t\t${image} -> $cmd: $exit\n";
+						print "\t\t\t\t$_" for @out ;
+						print "\t\t\t\t$_" for @err ;
+					};
+				}
+
+				if ($pg_r[$i]) {
+					($exit,$cmd, @out,@err) = exec_cmd( "${CPDF} -rotate $pg_r[$i] \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf");
+					if ($DEBUG) { 
+						print "\t\t\t${image} -> $cmd: $exit\n";
+						print "\t\t\t\t$_" for @out ;
+						print "\t\t\t\t$_" for @err ;
+					};
+				}
 
-				unlink ("${tmpdir}/${pg}.pdf") if (!$DEBUG);
-				unlink ("$image.pdf") if (!$DEBUG);
-				move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}.pdf.old") if ($DEBUG);
-				unlink ("$image") if (!$DEBUG);
 			}
-			exit 0;
+			exit 1;
 		}
 	}
 
@@ -427,28 +516,51 @@ sub ocr {
 
 	if (scalar @new_pages != $pages) {
 		print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG);
-		syslog ("info","OCR: $in_file, number of output pages differ") if (!$DEBUG);
+		syslog ("err","OCR: $in_file, number of output pages differ") if (!$DEBUG);
 		unlink "$in_file.$host.tmp";
+		make_path ($error_path) if ( ! -d $error_path);
 		move ("$in_file.$host.processing", $error_file);
-		exit (0);
+		exit (1);
 	}
 
-	# Merge resulting pdf pages to a single pdf
+	# Merge resulting pdf pages to a single pdf, convert to PDF/A and copy to output
 	make_path ($out_path) if ( ! -d $out_path);
 	unlink $out_file if ( -f $out_file );
-	($exit, $cmd, @out,@err) = exec_cmd("${PDFTK} \"${tmpdir}\"/pg_*-cpdf.pdf cat output \"${out_file}.tmp\" compress");
+
+	chdir (${tmpdir});
+	($exit, $cmd, @out,@err) = exec_cmd("${GS} -dQUIET -dBATCH -dNOPAUSE -dNOINTERPOLATE -dCompatibilityLevel=1.7 -dNumRenderingThreads=${MAX_PGS} -sDEVICE=pdfwrite -dAutoRotatePages=/None -sColorConversionStrategy=/RGB -sProcessColorModel=DeviceRGB -dAutoFilterColorImages=true -dAutoFilterGrayImages=true -dJPEGQ=95 -dPDFA=2 -dPDFACompatibilityPolicy=1 -sOutputFile=\"${tmp_file}\"  pg_*-cpdf.pdf ");
 	if ($DEBUG) {
 		print "\t\t${out_file} -> $cmd: $exit\n";
 	        print "\t\t\t$_" for @out ;
         	print "\t\t\t$_" for @err ;
 	};
+	if ($exit) {
+		unlink "$in_file.$host.tmp";
+		unlink $out_file;
+		make_path ($error_path) if ( ! -d $error_path);
+                move ("$in_file.$host.processing", $error_file);
+		print "\t\t${out_file} -> Error concatenating pages and converting to PDF/A (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG);
+                syslog ("err","OCR: $in_file, error concatenating pages and converting to PDF/A") if (!$DEBUG);
+                exit (1);
+        }
+	chdir ("/"); 
+
+	if (!copy (${tmp_file}, $out_file)) {
+                remove_tree ($tmpdir,{ error=> \my $dumb });
+                unlink ("$in_file.$host.tmp");
+		unlink $out_file;
+		make_path ($error_path) if ( ! -d $error_path);
+                move ("$in_file.$host.processing", $error_file);
+		print "Error: cannot copy temp file to $out_file \n" if $DEBUG;
+		syslog ("err","cannot copy temp file to $out_file") if !$DEBUG;	# "err" is the level name the other failure paths in this sub use
+		exit 1;
+        };
 
 	make_path ($proc_path) if ( ! -d $proc_path);
 	unlink $proc_file if ( -f $proc_file );
 	move ("$in_file.$host.processing", $proc_file);
 	move ("${out_file}.tmp", ${out_file});
 
-
 	# Remove temp dir
 	remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG);
 	unlink $tmp_file if (!$DEBUG);
@@ -471,7 +583,7 @@ sub is_ocred {
 }	
 
 sub get_pages {
-	my ($in_file, $w, $h, $r) = @_;
+	my ($in_file, $w, $h, $r, $x1, $y1, $x2, $y2) = @_;
 
 	my $pages=0;
 	my $i=0;
@@ -485,29 +597,35 @@ sub get_pages {
 		($dumb, $pages) = split / {1,}/  if ( $_ =~ /NumberOfPages:/ );
 		($dumb, $i )    = split / {1,}/  if ( $_ =~ /PageMediaNumber:/ );
 		($dumb, @$r[$i-1]) = split / {1,}/  if ( $_ =~ /PageMediaRotation:/ );
-		($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/  if ( $_ =~ /PageMediaDimensions:/ );
+		($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ );
+		($dumb, @$x1[$i-1], @$y1[$i-1], @$x2[$i-1], @$y2[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaCropRect:/ );
 	}
 
 	return $pages;
 }
 
 sub get_imgs {
-	my ($in_file, $page_img, $w, $h, $t) = @_;
-        my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc);
+	my ($in_file, $page_img, $w, $h, $t, $x_ppi, $y_ppi) = @_;
+        my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi );
 
 	my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\"");
+	$i = 0;
 
 	foreach my $line (@lines)  {
                 chomp $line;
 		$line =~ s/^ {1,}//;
-		if ( $line =~  /image|mask/ ) {
-			($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line;
+		if ( $line !~  /^page|^----/ ) {
+			($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi) = split / {1,}/,$line;
 			@$page_img[$page-1]=$i;
 			@$w[$page-1] = $width;
 			@$h[$page-1] = $height;
 			@$t[$page-1] = "rgb"; 	# Default is color
-			@$t[$page-1] = ( $comp == 3 || $bpc >  1 || $enc   eq "jpeg" || $color eq "-"    || $color eq "icc"  ? "rgb"  : @$t[$page-1]); 
 			@$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc   eq "ccitt"|| $color eq "gray" ||  $type eq "mask" ? "gray" : @$t[$page-1]); 
+			@$t[$page-1] = ( $comp == 3 || $bpc >  1 || $enc   eq "jpeg" || $color eq "-"    || $color eq "icc"  ? "rgb"  : @$t[$page-1]); 
+			@$t[$page-1] = ( $type eq "stencil" ? $type :  @$t[$page-1]);
+			@$t[$page-1] = ( $enc  eq "image"   ? $enc  :  @$t[$page-1]);
+			@$x_ppi[$page-1] = $xppi;
+			@$y_ppi[$page-1] = $yppi;
 		}
         }
 	return $i+1;
@@ -542,6 +660,19 @@ sub get_res {
 	return ($res_x,$res_y);
 }
 
+sub get_sign {
+	# Returns 1 when pdfsig prints at least one line starting with "Signature" for the given PDF, 0 otherwise
+	my ($in_file) = @_;
+
+	# stderr of pdfsig is discarded; only its stdout lines are inspected
+	for my $line (`${PDFSIG} \"${in_file}\"  2>/dev/null`) {
+		chomp $line;
+		# the first matching line is enough, no need to read the rest
+		return 1 if ( $line =~ /^Signature/ );
+	}
+	return 0;
+}
+
 sub is_locked_ex {
     my ($path) = @_;
 
diff --git a/workflow.pdf b/workflow.pdf
index 184c7a2..0bc27de 100644
Binary files a/workflow.pdf and b/workflow.pdf differ
diff --git a/workflow.vsd b/workflow.vsd
index b36e28c..17406f7 100644
Binary files a/workflow.vsd and b/workflow.vsd differ
--
libgit2 0.21.2