Commit d2b74559c9a3595de14ad8ce6ba843d92f505213

Authored by Nei Jobson da Costa Carneiro
1 parent 78ec197b

Pré versão 2.0 a ser liberada - PARA O CANTONI INCLUIR O Container Docker

INSTALL.txt
@@ -1,202 +0,0 @@ @@ -1,202 +0,0 @@
1 -# OCR Server 1.0.4b - (c) Agencia Nacional de Telecomunicacoees  
2 -#  
3 -# This script monitors a set of input directories for PDF files  
4 -# once a new file is detected, it is processes through tesseract OCR  
5 -# in order to generate a new file with a hidden searchable text layer  
6 -#  
7 -# It may be distributed under the conditions of the LGPL v2.1 license.  
8 -#  
9 -# Author: Guilherme Chehab  
10 -#  
11 -# Version History:  
12 -# 0.1 Initial single server version  
13 -# 0.2 Check if page already has the html hidden layer, if so, ignore it  
14 -# 0.3 Solved issues about various image enconding types  
15 -# 0.4 Added a postnormalization step to ensure all output pdf pages have  
16 -# the same size and orientations as the original files  
17 -# 0.5 Used input file renaming as a way to sync multiple parallel instances,  
18 -# that way, it is minimized the risk of same file being OCRed multiple times.  
19 -# 0.6 Added a default handler for unknown image encoding using jpeg encoding  
20 -# 0.7 Solved an issue with files with more than 1000 pages  
21 -# 1.0 First release version  
22 -# 1.0.1 Solving error when file has no images  
23 -# 1.0.2 Fix bug when counting cores for AMD processors  
24 -# 1.0.3 Added better image type detection  
25 -# 1.0.4 Fix: added ubuntu init script  
26 -# 1.0.4b Centos 6.9  
27 -#  
28 -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it  
29 -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them  
30 -# diferently but does not treat it adequately  
31 -# - Review poppler and cpdf install instructions  
32 -# - Add better handling of vectorized and non scanned pdf files  
33 -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)  
34 -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible  
35 -#  
36 -# Check software requirements on the comments bellow  
37 -#  
38 -# To configure input dirs change @BASE_DIRS and @SUB_DIRS variables  
39 -#  
40 -#  
41 -# O servidor OCR depende dos seguintes componentes:  
42 -# - Perl 5.10.1, com seguintes módulos:  
43 -# - File::Find::Rule  
44 -# - File::Basename  
45 -# - File::Copy  
46 -# - File::Path  
47 -# - File::Touch  
48 -# - Sys::Syslog  
49 -# - Sys::Hostname  
50 -# - IPC::Open3  
51 -# - IO::Select  
52 -# - POSIX  
53 -# - Tesseract-ocr 3.05, com dicionários inglês e português  
54 -# - Pdftk 2.02  
55 -# - Poppler-utils 0.42.0  
56 -# - Cpdf 2.1  
57 -# - ImageMagick 6.7.2-7  
58 -#  
59 -# Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema  
60 -# Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.  
61 -# Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.  
62 -#  
63 -## ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.  
64 -#  
65 -# Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':  
66 -#  
67 -# @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script  
68 -# @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro  
69 -# $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)  
70 -# $MAX_PGS: Número máximo de páginas que podem ser processadas simultanemante por arquivo de entrada (default: no. de CPUs)  
71 -# Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.  
72 -# Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.  
73 -# A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.  
74 -#  
75 -# Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.  
76 -#  
77 -#  
78 -# ----------------------- COMPILAÇÃO dos pré requisitos (obs.: os comandos de devem ser executados como root)  
79 -#  
80 -#  
81 -# Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS  
82 -#  
83 -# RedHat 6.7 e Centos 6.9:  
84 -yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip  
85 -yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel  
86 -cd /tmp  
87 -wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm  
88 -rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm  
89 -rm -f msttcore-fonts-2.0-3.noarch.rpm  
90 -  
91 -# Centos 6.9  
92 -# \_ autoconf-archive  
93 -wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm  
94 -rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm  
95 -rm autoconf-archive-2012.04.07-7.3.noarch.rpm  
96 -# \_ GCC 4.8  
97 -wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo  
98 -yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj  
99 -  
100 -# Ubuntu 14.04 Server:  
101 -apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14  
102 -apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev  
103 -apt-get install ttf-mscorefonts-installer  
104 -  
105 -# Ambas plataformas:  
106 -cd /usr/local/src  
107 -  
108 -for i in \  
109 - https://github.com/tesseract-ocr/langdata.git \  
110 - https://github.com/DanBloomberg/leptonica.git \  
111 - https://github.com/libav/libav.git \  
112 - https://github.com/tesseract-ocr/tessdata.git \  
113 - https://github.com/tesseract-ocr/tesseract.git \  
114 - git://git.freedesktop.org/git/poppler/poppler.git \  
115 - git://git.freedesktop.org/git/poppler/test.git \  
116 - https://github.com/Flameeyes/unpaper.git \  
117 - https://github.com/ocaml/ocaml.git \  
118 - https://gitlab.camlcity.org/gerd/lib-findlib.git \  
119 - https://github.com/johnwhitington/camlpdf.git \  
120 - https://github.com/johnwhitington/cpdf-source.git \  
121 -; do git clone $i; done  
122 -  
123 -wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip  
124 -unzip pdftk-2.02-src.zip  
125 -rm -f pdftk-2.02-src.zip  
126 -  
127 -# pdftk, versão 2.02 ou superior  
128 -cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..  
129 -  
130 -# Centos 6.9  
131 -# \_ Cria um novo shell usando o GCC 4.8 por default  
132 -scl enable devtoolset-2 bash  
133 -  
134 -# Tesseract, versão 3.05-dev ou superior  
135 -# Bibliotecas para o Tesseract: Leptonica e Libav  
136 -cd leptonica && ./autobuild && ./configure && make all install && cd ..  
137 -  
138 -# Para compilação do Tesseract após a compilação do leptonica  
139 -export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/  
140 -  
141 -cd libav && ./configure --enable-sram && make all install && cd ..  
142 -  
143 -# Tesseract  
144 -cd tesseract && ./autogen.sh && ./configure && make all install && cd ..  
145 -cp -avR tessdata/* /usr/local/share/tessdata/  
146 -  
147 -# cpdf, versão 2.1 ou superior  
148 -cd ocaml && ./configure && make world.opt && make install && cd ..  
149 -mkdir -p /usr/local/man/man5  
150 -# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente  
151 -cd lib-findlib && ./configure && make all && make install && cd ..  
152 -cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..  
153 -cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..  
154 -  
155 -# poppler-utils, versão 0.42.0 ou superior  
156 -cd poppler && ./autogen.sh && ./configure && make all install && cd ..  
157 -  
158 -# Centos 6.9  
159 -# \_ Termina o shell usando o GCC 4.8 por default  
160 -exit  
161 -  
162 -# ----------------------- INSTALAÇÃO (obs.: os comandos de devem ser executados como root)  
163 -  
164 -## Comandos adicionais para configuração do módulo:  
165 -  
166 -# Criação do usuário  
167 -adduser ocr  
168 -  
169 -# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional  
170 -cp ./usr/local/bin/ocr /usr/local/bin  
171 -  
172 -# Auto start (RedHat 6.7 e CentOs 6.9)  
173 -cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr  
174 -mv /etc  
175 -chkconfig --add ocr  
176 -chkconfig --level 2345 ocr on  
177 -  
178 -# Auto start (Ubuntu 14.04)  
179 -cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr  
180 -update-rd.d ocr defaults  
181 -  
182 -# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações  
183 -cd /home/ocr  
184 -tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr  
185 -su  
186 -  
187 -# Copie o pacote para os outros servidores e extraia com:  
188 -cd /  
189 -tar xovzf pkg-ocr.tgz  
190 -  
191 -# Instalando pré-requisitos RUNTIME em servidores adicionais  
192 -  
193 -# Redhat 6.7 e CentOS 6.9  
194 -yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp  
195 -yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext  
196 -  
197 -# Ubuntu 14.04  
198 -apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14  
199 -apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0  
200 -  
201 -# Inicie o serviço com  
202 -service ocr start  
README.md 0 → 100644
@@ -0,0 +1,240 @@ @@ -0,0 +1,240 @@
  1 +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes
  2 +
  3 +This script monitors a set of input directories for PDF files. Once a new file is detected, it is processed through Tesseract OCR in order to generate a new file with a hidden searchable text layer.
  4 +
  5 +It may be distributed under the conditions of the LGPL v2.1 license.
  6 +
  7 +Author: Guilherme Chehab
  8 +
  9 +## Version History:
  10 + - 0.1
  11 + - Initial single server version
  12 + - 0.2
  13 + - Check if page already has the html hidden layer, if so, ignore it
  14 + - 0.3
  15 + - Solved issues about various image encoding types
  16 + - 0.4
  17 + - Added a postnormalization step to ensure all output pdf pages have the same size and orientations as the original files
  18 + - 0.5
  19 + - Used input file renaming as a way to sync multiple parallel instances, that way, it is minimized the risk of same file being OCRed multiple times.
  20 + - 0.6
  21 + - Added a default handler for unknown image encoding using jpeg encoding
  22 + - 0.7
  23 + - Solved an issue with files with more than 1000 pages
  24 + - 1.0
  25 + - First release version
  26 + - 1.0.1 Solving error when file has no images
  27 + - 1.0.2 Fix bug when counting cores for AMD processors
  28 + - 1.0.3 Added better image type detection
  29 + - 1.0.4 Fix: added ubuntu init script
  30 + - 1.0.4b Add Centos 6.9 install instructions
  31 + - 2.0
  32 + - PDF/A output, and better compression with ghostscript
  33 + - Rewritten image extraction, processing and transformation process
  34 + - Check if input file is signed, in this case, does not change the file contents
  35 + - Added '-oem 0' option to tesseract (force legacy mode on tesseract 4)
  36 + - Use operating system packages by default
  37 + - Changed paths from external programs, instead of using full paths, uses first match from $PATH
  38 + - Check existence of external programs on path before running
  39 + - Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
  40 + - Fix: create subpaths on error folder
  41 + - Fix: trying to reduce overhead on temporary folder
  42 + - TODO:
  43 + - Change get_imgs and OCR processing to enable pages with more than one image -- it would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them differently but does not treat it adequately -- shall require better handling of the PDF's internal structure
  44 + - Review poppler and cpdf install instructions
  45 + - Add better handling of vectorized and non scanned pdf files
  46 + - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current scaling, cropping and rotation handlers
  47 + - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- added function to analyse image color histogram -> just need to add option to convert it to B&W.
  48 + - Move all parameters to config file
  49 + - Add some job control web interface
  50 + - Add end user interface to submit files through web
  51 + - Add check external programs version requirements before running
  52 + - BUGS:
  53 + - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than original, this is due to using pdftoppm instead of pdfimages
  54 +
  55 +## Check software requirements on the comments below
  56 +
  57 +To configure input dirs change @BASE_DIRS and @SUB_DIRS variables
  58 +
  59 +### O servidor OCR depende dos seguintes componentes:
  60 + - Perl 5.10.1, com seguintes módulos:
  61 + - File::Find::Rule
  62 + - File::Basename
  63 + - File::Copy
  64 + - File::Path
  65 + - File::Touch
  66 + - Sys::Syslog
  67 + - Sys::Hostname
  68 + - IPC::Open3
  69 + - IO::Select
  70 + - POSIX
  71 + - Tesseract-ocr 3.05, com dicionários inglês e português
  72 + - Pdftk 2.02
  73 + - Poppler-utils 0.42.0
  74 + - Cpdf 2.1
  75 + - ImageMagick 6.7.2-7
  76 + - Ghostscript 9.18
  77 +
  78 +Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes pode comprometer o correto funcionamento do sistema.
  79 +
  80 +Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.
  81 +
  82 +Esse arquivo contém informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.
  83 +
  84 +ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.
  85 +
  86 +### Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':
  87 +
  88 +- @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script
  89 +- @SUB_DIRS: Subdiretórios de entrada, saída, backup dos arquivos originais, temporário e de arquivos com erro
  90 +- $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)
  91 +- $MAX_PGS: Número máximo de páginas que podem ser processadas simultaneamente por arquivo de entrada (default: no. de CPUs)
  92 +
  93 +Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.
  94 +
  95 +Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.
  96 +
  97 +A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.
  98 +
  99 +Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.
  100 +
  101 +# COMPILAÇÃO dos pré requisitos (obs.: os comandos devem ser executados como root)
  102 +
  103 +Em servidor Ubuntu 16.04, os pacotes padrão (com exceção do CPDF, que não tem no repositório oficial)
  104 +são suficientes para executar o aplicativo, não havendo necessidade de compilar todos, assim é a arquitetura recomendada
  105 +
  106 +Quanto ao CPDF, é possível baixar a versão binária em: https://github.com/coherentgraphics/cpdf-binaries
  107 +
  108 +## Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS
  109 +
  110 +### RedHat 6.7 e Centos 6.9:
  111 + yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip
  112 + yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel libcurl-devel nss-devel
  113 + cd /tmp
  114 + wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm
  115 + rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm
  116 + rm -f msttcore-fonts-2.0-3.noarch.rpm
  117 +
  118 +### Centos 6.9
  119 +# \_ autoconf-archive
  120 + wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm
  121 + rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm
  122 + rm autoconf-archive-2012.04.07-7.3.noarch.rpm
  123 +# \_ GCC 4.8
  124 + wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
  125 + yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj
  126 +
  127 +# Ubuntu 14.04 Server:
  128 + apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14
  129 + apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libcurl-dev libnss3-dev
  130 + apt-get install ttf-mscorefonts-installer
  131 +
  132 +# Ambas plataformas:
  133 + cd /usr/local/src
  134 +
  135 + for i in \
  136 + https://github.com/tesseract-ocr/langdata.git \
  137 + https://github.com/DanBloomberg/leptonica.git \
  138 + https://github.com/libav/libav.git \
  139 + https://github.com/tesseract-ocr/tessdata.git \
  140 + https://github.com/tesseract-ocr/tesseract.git \
  141 + git://git.freedesktop.org/git/poppler/poppler.git \
  142 + git://git.freedesktop.org/git/poppler/test.git \
  143 + https://github.com/Flameeyes/unpaper.git \
  144 + https://github.com/ocaml/ocaml.git \
  145 + https://gitlab.camlcity.org/gerd/lib-findlib.git \
  146 + https://github.com/johnwhitington/camlpdf.git \
  147 + https://github.com/johnwhitington/cpdf-source.git \
  148 + http://git.ghostscript.com/ghostpdl.git \
  149 + ; do git clone $i; done
  150 +
  151 + wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
  152 + unzip pdftk-2.02-src.zip
  153 + rm -f pdftk-2.02-src.zip
  154 +
  155 +# pdftk, versão 2.02 ou superior
  156 +cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..
  157 +
  158 +# Ghostscript 9.18 ou superior
  159 +#wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz
  160 +#tar xvozf ghostscript-9.21.tar.gz
  161 +#rm -f ghostscript-9.21.tar.gz
  162 +#cd ghostscript-9.21
  163 +cd ghostpdl
  164 +./autogen.sh; ./configure
  165 +make all install
  166 +cd ..
  167 +
  168 +# Centos 6.9
  169 +# \_ Cria um novo shell usando o GCC 4.8 por default
  170 +scl enable devtoolset-2 bash
  171 +
  172 +# Tesseract, versão 3.05-dev ou superior
  173 +# Bibliotecas para o Tesseract: Leptonica e Libav
  174 +cd leptonica && ./autobuild && ./configure && make all install && cd ..
  175 +
  176 +# Para compilação do Tesseract após a compilação do leptonica
  177 +export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/
  178 +
  179 +cd libav && ./configure --enable-sram && make all install && cd ..
  180 +
  181 +# Tesseract
  182 +cd tesseract && ./autogen.sh && ./configure && make all install && cd ..
  183 +cp -avR tessdata/* /usr/local/share/tessdata/
  184 +
  185 +# cpdf, versão 2.1 ou superior
  186 +cd ocaml && ./configure && make world.opt && make install && cd ..
  187 +mkdir -p /usr/local/man/man5
  188 +# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente
  189 +cd lib-findlib && ./configure && make all && make install && cd ..
  190 +cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..
  191 +cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..
  192 +
  193 +# poppler-utils, versão 0.42.0 ou superior
  194 +cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && cd ..
  195 +
  196 +# Centos 6.9
  197 +# \_ Termina o shell usando o GCC 4.8 por default
  198 +exit
  199 +
  200 +# ----------------------- INSTALAÇÃO (obs.: os comandos devem ser executados como root)
  201 +
  202 +## Comandos adicionais para configuração do módulo:
  203 +
  204 +# Criação do usuário
  205 +adduser ocr
  206 +
  207 +# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional
  208 +cp ./usr/local/bin/ocr /usr/local/bin
  209 +
  210 +# Auto start (RedHat 6.7 e CentOs 6.9)
  211 +cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr
  212 +mv /etc
  213 +chkconfig --add ocr
  214 +chkconfig --level 2345 ocr on
  215 +
  216 +# Auto start (Ubuntu 14.04)
  217 +cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr
  218 +update-rc.d ocr defaults
  219 +
  220 +# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações
  221 +cd /home/ocr
  222 +tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr
  223 +su
  224 +
  225 +# Copie o pacote para os outros servidores e extraia com:
  226 +cd /
  227 +tar xovzf pkg-ocr.tgz
  228 +
  229 +# Instalando pré-requisitos RUNTIME em servidores adicionais
  230 +
  231 +# Redhat 6.7 e CentOS 6.9
  232 +yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp ghostscript
  233 +yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext
  234 +
  235 +# Ubuntu 14.04
  236 +apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14
  237 +apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 ghostscript
  238 +
  239 +# Inicie o serviço com
  240 +service ocr start
usr/local/bin/ocr
1 -#! /usr/bin/perl -w 1 +#!/usr/bin/perl -w
2 # 2 #
3 -# OCR Server 1.0.4 - (c) Agencia Nacional de Telecomunicacoes 3 +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes
4 # 4 #
5 # This script monitors a set of input directories for PDF files 5 # This script monitors a set of input directories for PDF files
6 # once a new file is detected, it is processes through tesseract OCR 6 # once a new file is detected, it is processes through tesseract OCR
@@ -24,15 +24,38 @@ @@ -24,15 +24,38 @@
24 # 1.0.1 Solving error when file has no images 24 # 1.0.1 Solving error when file has no images
25 # 1.0.2 Fix bug when counting cores for AMD processors 25 # 1.0.2 Fix bug when counting cores for AMD processors
26 # 1.0.3 Added better image type detection 26 # 1.0.3 Added better image type detection
27 -# 1.0.4 Fix: added ubuntu init script 27 +# 1.0.4 Fix: added ubuntu init script
  28 +# 1.0.4b Add Centos 6.9 install instructions
  29 +# 2.0 PDF/A output, and better compression with ghostscript --> for this to work, Tesseract 4.0 is
  30 +# strongly recommended
  31 +# Rewritten image extration, processing and transformations process
  32 +# Check if input file is signed, in this case, does not change the file contents
  33 +# Added '-oem 0' option to tesseract (force legacy mode on tesseract 4)
  34 +# Use operating system packges by default
  35 +# Changed paths from external programs, instead of using full paths, uses first match from $PATH
  36 +# Check existence of external programs on path before running
  37 +# Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
  38 +# Fix: create subpaths on error folder
  39 +# Fix: trying to reduce overhead on temporary folder
28 # 40 #
29 # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it 41 # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it
30 -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them  
31 -# diferently but does not treat it adequately 42 +# would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them
  43 +# diferently but does not treat it adequately -- shall require better pdf´s internal structure handling
32 # - Review poppler and cpdf install instructions 44 # - Review poppler and cpdf install instructions
33 # - Add better handling of vectorized and non scanned pdf files 45 # - Add better handling of vectorized and non scanned pdf files
34 -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)  
35 -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible 46 +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current
  47 +# scalling, cropping and rotation handlers
  48 +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible --
  49 +# added function to analyse image color histogram -> just need to add option to convert it to B&W.
  50 +# - Move all parameters to config file
  51 +# - Add some job control web interface
  52 +# - Add end user interface to submit files through web
  53 +# - Add check external programs version requirements before running
  54 +#
  55 +# BUGS: - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than
  56 +# original, this is due to using pdftoppm instead of pdfimages
  57 +# - Although not properly a BUG, in the new version, the addition of a step to convert to PDF/A and other evolutions
  58 +# increased significantly the time to OCR a page, from a mean time of 1 secs/page to 3 secs/page on a 16 core server
36 # 59 #
37 # Check software requirements on the comments bellow 60 # Check software requirements on the comments bellow
38 # 61 #
@@ -54,8 +77,8 @@ use Sys::Hostname; @@ -54,8 +77,8 @@ use Sys::Hostname;
54 use IPC::Open3; 77 use IPC::Open3;
55 use IO::Select; 78 use IO::Select;
56 79
57 -my $DEBUG = 0;  
58 -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); 80 +my $DEBUG = 2;
  81 +my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo | grep -e '^processor' | wc -l`);
59 my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; 82 my $MAX_FILES = ( !$DEBUG ? 2 : 1) ;
60 83
61 my $USER = 'ocr'; 84 my $USER = 'ocr';
@@ -63,23 +86,28 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca @@ -63,23 +86,28 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca
63 86
64 # Command dependencies 87 # Command dependencies
65 88
66 -# depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher  
67 -my $TESSERACT = '/usr/local/bin/tesseract -l por+eng'; 89 +# depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher -- for pdf/a Tesseract 4.0 is recomended
  90 +my $TESSERACT = 'tesseract --oem 0'; # if Tesseract => 4.0
  91 +#my $TESSERACT = 'tesseract'; # if Tesseract < 4.0
68 92
69 # Depends on pdftk 2.02 or higher 93 # Depends on pdftk 2.02 or higher
70 -my $PDFTK = '/usr/local/bin/pdftk'; 94 +my $PDFTK = 'pdftk';
71 95
72 # Depends on poppler-utils 0.42.0 or higher 96 # Depends on poppler-utils 0.42.0 or higher
73 -#my $PDINFO = '/usr/local/bin/pdfinfo';  
74 -my $PDFFONTS = '/usr/local/bin/pdffonts';  
75 -my $PDFIMAGES = '/usr/local/bin/pdfimages';  
76 -my $PDFTOPPM = '/usr/local/bin/pdftoppm'; 97 +my $PDFFONTS = 'pdffonts';
  98 +my $PDFIMAGES = 'pdfimages';
  99 +my $PDFTOPPM = 'pdftoppm';
  100 +my $PDFUNITE = 'pdfunite';
  101 +my $PDFSIG = 'pdfsig';
77 102
78 # Depends on cpdf 2.1 or higher 103 # Depends on cpdf 2.1 or higher
79 -my $CPDF = '/usr/local/bin/cpdf'; 104 +my $CPDF = 'cpdf';
  105 +
  106 +# Depends on Ghostscript 9.18
  107 +my $GS = 'gs';
80 108
81 ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner 109 ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner
82 -my $CONVERT = '/usr/bin/convert'; 110 +my $CONVERT = 'convert';
83 111
84 # If it is needed further filtering 112 # If it is needed further filtering
85 #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; 113 #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 ';
@@ -90,12 +118,14 @@ my @BASE_DIRS = ( &#39;/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/&#39;, @@ -90,12 +118,14 @@ my @BASE_DIRS = ( &#39;/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/&#39;,
90 my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' ); 118 my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' );
91 119
92 @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); 120 @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2);
93 -%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); 121 +%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG==2);
94 122
95 # Safeguard in case cpuinfo has not correctly identified the number of CPUs 123 # Safeguard in case cpuinfo has not correctly identified the number of CPUs
96 $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; 124 $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS;
97 125
98 -$ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; 126 +$ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin';
  127 +$ENV{'IFS'} = '\t\n';
  128 +
99 my ($host) = split/\./,hostname; 129 my ($host) = split/\./,hostname;
100 130
101 use vars qw/*name *dir *prune/; 131 use vars qw/*name *dir *prune/;
@@ -107,14 +137,15 @@ sub main; @@ -107,14 +137,15 @@ sub main;
107 sub get_pages; 137 sub get_pages;
108 sub get_rotation; 138 sub get_rotation;
109 sub get_res; 139 sub get_res;
110 -sub is_ocred;  
111 sub is_locked_ex; 140 sub is_locked_ex;
112 141
113 142
114 my $expr = 'use POSIX qw(setsid)'; 143 my $expr = 'use POSIX qw(setsid)';
115 144
116 my ($dumb1, $dumb2, $uid) = getpwnam ($USER); 145 my ($dumb1, $dumb2, $uid) = getpwnam ($USER);
117 -setuid ($uid) or warn "Cant set uid $uid"; 146 +if (defined $uid) {
  147 + setuid ($uid) or warn "Cant set uid $uid";
  148 +}
118 149
119 $SIG{__DIE__} = 'DEFAULT'; 150 $SIG{__DIE__} = 'DEFAULT';
120 $SIG{__WARN__} = \&die_when_called; 151 $SIG{__WARN__} = \&die_when_called;
@@ -126,6 +157,11 @@ if ($@) { @@ -126,6 +157,11 @@ if ($@) {
126 chdir('/') or die "$0: cannot chdir '/': $!\n"; 157 chdir('/') or die "$0: cannot chdir '/': $!\n";
127 open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n"; 158 open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n";
128 159
  160 +foreach my $exec ( $TESSERACT, $PDFTK, $PDFFONTS, $PDFIMAGES, $PDFSIG, $CPDF, $GS, $CONVERT) {
  161 + die "Error: $exec not found on path: $ENV{PATH}, check dependencies\n" if ( `which $exec | wc -l ` == 0);
  162 +}
  163 +
  164 +
129 foreach my $DIR (@BASE_DIRS) { 165 foreach my $DIR (@BASE_DIRS) {
130 166
131 defined(my $pid = fork) or die "$0: cannot fork: $!\n"; 167 defined(my $pid = fork) or die "$0: cannot fork: $!\n";
@@ -135,7 +171,7 @@ foreach my $DIR (@BASE_DIRS) { @@ -135,7 +171,7 @@ foreach my $DIR (@BASE_DIRS) {
135 main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR}); 171 main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR});
136 exit 0; 172 exit 0;
137 last; 173 last;
138 - } 174 + }
139 } 175 }
140 176
141 exit 0; 177 exit 0;
@@ -157,7 +193,7 @@ sub main { @@ -157,7 +193,7 @@ sub main {
157 # remove .tmp file 193 # remove .tmp file
158 unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) ); 194 unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) );
159 195
160 - # Rename files that were in 'processig' back 196 + # Rename files that were in 'processing' state back
161 foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) { 197 foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) {
162 my $old_name = $file; 198 my $old_name = $file;
163 $old_name =~ s/\.${host}\.processing$//g; 199 $old_name =~ s/\.${host}\.processing$//g;
@@ -177,12 +213,14 @@ sub main { @@ -177,12 +213,14 @@ sub main {
177 # Main loop 213 # Main loop
178 while ( 1 ) { 214 while ( 1 ) {
179 select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced 215 select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced
  216 +
180 $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} )); 217 $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} ));
181 print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in; 218 print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in;
182 $count = scalar keys %files_in; 219 $count = scalar keys %files_in;
183 - foreach my $file (keys %files_in) {  
184 220
185 - next if ( glob ("$file.*.tmp")); 221 + foreach my $file (sort { ((-f $a) ? (stat $a)[9] : 0) <=> ((-f $b) ? (stat $b)[9] : 0)} keys %files_in ) {
  222 +
  223 + next if ( glob ("\"$file.*.tmp\""));
186 224
187 select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds 225 select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds
188 next if (!defined $files_in{$file}); # continue only if it is still valid 226 next if (!defined $files_in{$file}); # continue only if it is still valid
@@ -255,7 +293,7 @@ sub ocr { @@ -255,7 +293,7 @@ sub ocr {
255 remove_tree ($tmpdir,{ error=> \my $dumb }); 293 remove_tree ($tmpdir,{ error=> \my $dumb });
256 unlink ("$in_file.$host.tmp"); 294 unlink ("$in_file.$host.tmp");
257 move ( "$in_file.$host.processing", $in_file); 295 move ( "$in_file.$host.processing", $in_file);
258 - exit 0; 296 + exit 1;
259 }; 297 };
260 298
261 my $out_path = $in_path; 299 my $out_path = $in_path;
@@ -271,7 +309,7 @@ sub ocr { @@ -271,7 +309,7 @@ sub ocr {
271 my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); 309 my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: "");
272 310
273 print "\twritting to $out_file\n" if $DEBUG; 311 print "\twritting to $out_file\n" if $DEBUG;
274 - 312 +
275 my $stime = time; 313 my $stime = time;
276 my %pids; 314 my %pids;
277 315
@@ -291,8 +329,26 @@ sub ocr { @@ -291,8 +329,26 @@ sub ocr {
291 remove_tree ($tmpdir,{ error=> \my $dumb }); 329 remove_tree ($tmpdir,{ error=> \my $dumb });
292 unlink ("$in_file.$host.tmp"); 330 unlink ("$in_file.$host.tmp");
293 move ( "$in_file.$host.processing", $in_file); 331 move ( "$in_file.$host.processing", $in_file);
  332 + print "Error: cannot copy $in_file to temp dir \n" if $DEBUG;
  333 + syslog ("error","cannot copy $in_file to temp dir") if !$DEBUG;
  334 + exit 1;
294 }; 335 };
295 336
  337 + # Check if file was signed
  338 + if (get_sign($tmp_file)) {
  339 + if (!copy ("$in_file.$host.processing", $proc_file)) {
  340 + remove_tree ($tmpdir,{ error=> \my $dumb });
  341 + unlink ("$in_file.$host.tmp");
  342 + move ( "$in_file.$host.processing", $in_file);
  343 + };
  344 + move ("$in_file.$host.processing", $out_file);
  345 + unlink ("$in_file.$host.tmp");
  346 + print "OCR processed: $in_file not OCRed due to having a signature within" if $DEBUG;
  347 + syslog ("info","OCR processed: $in_file not OCRed due to having a signature within") if !$DEBUG;
  348 +
  349 + exit 0;
  350 + }
  351 +
296 # Extract pages 352 # Extract pages
297 ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf"); 353 ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf");
298 if ($DEBUG) { 354 if ($DEBUG) {
@@ -301,12 +357,13 @@ sub ocr { @@ -301,12 +357,13 @@ sub ocr {
301 print "\t\t\t$_" for @err ; 357 print "\t\t\t$_" for @err ;
302 }; 358 };
303 359
  360 + my ($pages, @pg_w, @pg_h, @pg_r, @pg_crop_x1, @pg_crop_y1, @pg_crop_x2, @pg_crop_y2);
  361 + $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r, \@pg_crop_x1, \@pg_crop_y1, \@pg_crop_x2, \@pg_crop_y2);
304 362
305 - my ($pages, @pg_w, @pg_h, @pg_r);  
306 - $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r); 363 + my ($imgs,@page_img, @img_w, @img_h, @img_t, @img_xppi, @img_yppi);
  364 + $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t, \@img_xppi, \@img_yppi);
307 365
308 - my ($imgs,@page_img, @img_w, @img_h, @img_t);  
309 - $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t); 366 + unlink ($tmp_file) if (!$DEBUG);
310 367
311 for ( my $i=0; $i< $pages; $i++ ) { 368 for ( my $i=0; $i< $pages; $i++ ) {
312 my $pg = sprintf ("pg_%06d", $i+1); 369 my $pg = sprintf ("pg_%06d", $i+1);
@@ -333,25 +390,29 @@ sub ocr { @@ -333,25 +390,29 @@ sub ocr {
333 if (! defined $img_t[$i] ) { 390 if (! defined $img_t[$i] ) {
334 move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); 391 move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf");
335 print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG; 392 print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG;
336 - exit 0; 393 + exit -1;
337 } 394 }
338 395
339 - print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i]\n" if $DEBUG; 396 + print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i] " if $DEBUG;
  397 + print "(cropbox: $pg_crop_x1[$i] x $pg_crop_y1[$i] - $pg_crop_x2[$i] x $pg_crop_y2[$i])\n" if (defined $pg_crop_x1[$i] && $DEBUG);
  398 + print "\n" if ($DEBUG);
340 399
  400 + # Extract images from page; the extraction tool and format flag (-j, -tiff or -jpeg) are selected below according to the detected image type
341 undef $cmd; 401 undef $cmd;
342 402
343 - if ($img_t[$i] eq "gray") {  
344 - $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; 403 + # Use PDFIMAGES and JPEG by default
  404 + $cmd = "${PDFIMAGES} -j \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
  405 +
  406 + if ($img_t[$i] eq "stencil") {
  407 + $cmd = "${PDFTOPPM} -tiff -tiffcompression deflate -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
345 } 408 }
346 409
347 - if ($img_t[$i] eq "rgb") {  
348 - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";  
349 - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM 410 + if ($img_t[$i] eq "gray") {
  411 + $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
350 } 412 }
351 413
352 - if (!defined $cmd) {  
353 - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";  
354 - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM 414 + if ($img_t[$i] !~ /gray|rgb|stencil/) {
  415 + $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
355 } 416 }
356 417
357 ($exit,$cmd,@out,@err) = exec_cmd($cmd); 418 ($exit,$cmd,@out,@err) = exec_cmd($cmd);
@@ -362,7 +423,13 @@ sub ocr { @@ -362,7 +423,13 @@ sub ocr {
362 }; 423 };
363 424
364 # Process each resulting image for page pdf 425 # Process each resulting image for page pdf
365 - my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif)/i , in => ${tmpdir} )) ; 426 + my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif|tiff|jpeg|jp2|jb2|png)/i , in => ${tmpdir} )) ;
  427 +
  428 + if (scalar @images == 0) {
  429 + move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf");
  430 + print "\t\t${in_file}: ".(${i}+1)." / $pages: Page was not exported as a tesseract supported format -- not OCRing\n" if $DEBUG;
  431 + exit 0;
  432 + }
366 433
367 foreach my $image (@images) { 434 foreach my $image (@images) {
368 print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; 435 print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG;
@@ -378,43 +445,65 @@ sub ocr { @@ -378,43 +445,65 @@ sub ocr {
378 print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; 445 print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n";
379 } 446 }
380 } 447 }
381 -  
382 - # Check if page was rotated  
383 - if ($pg_r[$i]) {  
384 - print "\t\t\t${image} unrotate: $pg_r[$i] graus ".(${i}+1)." / $pages\n" if $DEBUG;  
385 - ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate $pg_r[$i] \"$image\""); 448 +
  449 + # Check if page was rotated and extracted with pdftoppm
  450 + if ($cmd =~ /\Q$PDFTOPPM/ && $pg_r[$i]) {
  451 + print "\t\t\t${image} unrotate: -$pg_r[$i] degs ".(${i}+1)." / $pages\n" if $DEBUG;
  452 + ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate ". (360 - $pg_r[$i])." \"$image\"");
386 if ($DEBUG) { 453 if ($DEBUG) {
387 print "\t\t\t${image} -> $cmd: $exit\n"; 454 print "\t\t\t${image} -> $cmd: $exit\n";
388 print "\t\t\t\t$_" for @out ; 455 print "\t\t\t\t$_" for @out ;
389 print "\t\t\t\t$_" for @err ; 456 print "\t\t\t\t$_" for @err ;
390 }; 457 };
391 } 458 }
392 - 459 +
393 # Filter ppm images, if needed 460 # Filter ppm images, if needed
394 461
395 # OCR ppm images to pdf pages 462 # OCR ppm images to pdf pages
396 - ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} \"${image}\" \"${image}\" pdf"); 463 + ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} -l por+eng \"${image}\" \"${image}\" pdf");
397 if ($DEBUG) { 464 if ($DEBUG) {
398 print "\t\t\t${image} -> $cmd: $exit\n"; 465 print "\t\t\t${image} -> $cmd: $exit\n";
399 print "\t\t\t\t$_" for @out ; 466 print "\t\t\t\t$_" for @out ;
400 print "\t\t\t\t$_" for @err ; 467 print "\t\t\t\t$_" for @err ;
401 }; 468 };
  469 + unlink ("$image") if (!$DEBUG);
402 470
403 - # Scale to fit pdf  
404 - ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); 471 + # Scale, crop and rotate to fit pdf
  472 + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf");
405 if ($DEBUG) { 473 if ($DEBUG) {
406 print "\t\t\t${image} -> $cmd: $exit\n"; 474 print "\t\t\t${image} -> $cmd: $exit\n";
407 print "\t\t\t\t$_" for @out ; 475 print "\t\t\t\t$_" for @out ;
408 print "\t\t\t\t$_" for @err ; 476 print "\t\t\t\t$_" for @err ;
409 }; 477 };
  478 + unlink ("$image.pdf") if (!$DEBUG);
410 479
  480 + if (defined $pg_crop_x1[$i]) {
  481 + # adjust cropbox
  482 + ($pg_crop_x1[$i], $pg_crop_y1[$i],$pg_crop_x2[$i],$pg_crop_y2[$i]) = (
  483 + ($pg_crop_x1[$i]<$pg_crop_x2[$i]?$pg_crop_x1[$i]:$pg_crop_x2[$i]),
  484 + ($pg_crop_y1[$i]<$pg_crop_y2[$i]?$pg_crop_y1[$i]:$pg_crop_y2[$i]),
  485 + abs($pg_crop_x2[$i]-$pg_crop_x1[$i]),abs($pg_crop_y2[$i]- $pg_crop_y1[$i])
  486 + );
  487 +
  488 + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -crop \"$pg_crop_x1[$i] $pg_crop_y1[$i] $pg_crop_x2[$i] $pg_crop_y2[$i]\" \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf");
  489 + if ($DEBUG) {
  490 + print "\t\t\t${image} -> $cmd: $exit\n";
  491 + print "\t\t\t\t$_" for @out ;
  492 + print "\t\t\t\t$_" for @err ;
  493 + };
  494 + }
  495 +
  496 + if ($pg_r[$i]) {
  497 + ($exit,$cmd, @out,@err) = exec_cmd( "${CPDF} -rotate $pg_r[$i] \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf");
  498 + if ($DEBUG) {
  499 + print "\t\t\t${image} -> $cmd: $exit\n";
  500 + print "\t\t\t\t$_" for @out ;
  501 + print "\t\t\t\t$_" for @err ;
  502 + };
  503 + }
411 504
412 - unlink ("${tmpdir}/${pg}.pdf") if (!$DEBUG);  
413 - unlink ("$image.pdf") if (!$DEBUG);  
414 - move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}.pdf.old") if ($DEBUG);  
415 - unlink ("$image") if (!$DEBUG);  
416 } 505 }
417 - exit 0; 506 + exit 1;
418 } 507 }
419 } 508 }
420 509
@@ -427,28 +516,51 @@ sub ocr { @@ -427,28 +516,51 @@ sub ocr {
427 516
428 if (scalar @new_pages != $pages) { 517 if (scalar @new_pages != $pages) {
429 print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); 518 print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG);
430 - syslog ("info","OCR: $in_file, number of output pages differ") if (!$DEBUG); 519 + syslog ("err","OCR: $in_file, number of output pages differ") if (!$DEBUG);
431 unlink "$in_file.$host.tmp"; 520 unlink "$in_file.$host.tmp";
  521 + make_path ($error_path) if ( ! -d $error_path);
432 move ("$in_file.$host.processing", $error_file); 522 move ("$in_file.$host.processing", $error_file);
433 - exit (0); 523 + exit (1);
434 } 524 }
435 525
436 - # Merge resulting pdf pages to a single pdf 526 + # Merge resulting pdf pages to a single pdf, convert to PDF/A and copy to output
437 make_path ($out_path) if ( ! -d $out_path); 527 make_path ($out_path) if ( ! -d $out_path);
438 unlink $out_file if ( -f $out_file ); 528 unlink $out_file if ( -f $out_file );
439 - ($exit, $cmd, @out,@err) = exec_cmd("${PDFTK} \"${tmpdir}\"/pg_*-cpdf.pdf cat output \"${out_file}.tmp\" compress"); 529 +
  530 + chdir (${tmpdir});
  531 + ($exit, $cmd, @out,@err) = exec_cmd("${GS} -dQUIET -dBATCH -dNOPAUSE -dNOINTERPOLATE -dCompatibilityLevel=1.7 -dNumRenderingThreads=${MAX_PGS} -sDEVICE=pdfwrite -dAutoRotatePages=/None -sColorConversionStrategy=/RGB -sProcessColorModel=DeviceRGB -dAutoFilterColorImages=true -dAutoFilterGrayImages=true -dJPEGQ=95 -dPDFA=2 -dPDFACompatibilityPolicy=1 -sOutputFile=\"${tmp_file}\" pg_*-cpdf.pdf ");
440 if ($DEBUG) { 532 if ($DEBUG) {
441 print "\t\t${out_file} -> $cmd: $exit\n"; 533 print "\t\t${out_file} -> $cmd: $exit\n";
442 print "\t\t\t$_" for @out ; 534 print "\t\t\t$_" for @out ;
443 print "\t\t\t$_" for @err ; 535 print "\t\t\t$_" for @err ;
444 }; 536 };
  537 + if ($exit) {
  538 + unlink "$in_file.$host.tmp";
  539 + unlink $out_file;
  540 + make_path ($error_path) if ( ! -d $error_path);
  541 + move ("$in_file.$host.processing", $error_file);
  542 + print "\t\t${out_file} -> Error concatenating pages and converting to PDF/A (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG);
  543 + syslog ("err","OCR: $in_file, error concatenating pages and converting to PDF/A") if (!$DEBUG);
  544 + exit (1);
  545 + }
  546 + chdir ("/");
  547 +
  548 + if (!copy (${tmp_file}, $out_file)) {
  549 + remove_tree ($tmpdir,{ error=> \my $dumb });
  550 + unlink ("$in_file.$host.tmp");
  551 + unlink $out_file;
  552 + make_path ($error_path) if ( ! -d $error_path);
  553 + move ("$in_file.$host.processing", $error_file);
  554 + print "Error: cannot copy temp file to $out_file \n" if $DEBUG;
  555 + syslog ("error","cannot copy temp file to $out_file") if !$DEBUG;
  556 + exit 1;
  557 + };
445 558
446 make_path ($proc_path) if ( ! -d $proc_path); 559 make_path ($proc_path) if ( ! -d $proc_path);
447 unlink $proc_file if ( -f $proc_file ); 560 unlink $proc_file if ( -f $proc_file );
448 move ("$in_file.$host.processing", $proc_file); 561 move ("$in_file.$host.processing", $proc_file);
449 move ("${out_file}.tmp", ${out_file}); 562 move ("${out_file}.tmp", ${out_file});
450 563
451 -  
452 # Remove temp dir 564 # Remove temp dir
453 remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG); 565 remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG);
454 unlink $tmp_file if (!$DEBUG); 566 unlink $tmp_file if (!$DEBUG);
@@ -471,7 +583,7 @@ sub is_ocred { @@ -471,7 +583,7 @@ sub is_ocred {
471 } 583 }
472 584
473 sub get_pages { 585 sub get_pages {
474 - my ($in_file, $w, $h, $r) = @_; 586 + my ($in_file, $w, $h, $r, $x1, $y1, $x2, $y2) = @_;
475 587
476 my $pages=0; 588 my $pages=0;
477 my $i=0; 589 my $i=0;
@@ -485,29 +597,35 @@ sub get_pages { @@ -485,29 +597,35 @@ sub get_pages {
485 ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ ); 597 ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ );
486 ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ ); 598 ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ );
487 ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ ); 599 ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ );
488 - ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); 600 + ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ );
  601 + ($dumb, @$x1[$i-1], @$y1[$i-1], @$x2[$i-1], @$y2[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaCropRect:/ );
489 } 602 }
490 603
491 return $pages; 604 return $pages;
492 } 605 }
493 606
494 sub get_imgs { 607 sub get_imgs {
495 - my ($in_file, $page_img, $w, $h, $t) = @_;  
496 - my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc); 608 + my ($in_file, $page_img, $w, $h, $t, $x_ppi, $y_ppi) = @_;
  609 + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi );
497 610
498 my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); 611 my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\"");
  612 + $i = 0;
499 613
500 foreach my $line (@lines) { 614 foreach my $line (@lines) {
501 chomp $line; 615 chomp $line;
502 $line =~ s/^ {1,}//; 616 $line =~ s/^ {1,}//;
503 - if ( $line =~ /image|mask/ ) {  
504 - ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line; 617 + if ( $line !~ /^page|^----/ ) {
  618 + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi) = split / {1,}/,$line;
505 @$page_img[$page-1]=$i; 619 @$page_img[$page-1]=$i;
506 @$w[$page-1] = $width; 620 @$w[$page-1] = $width;
507 @$h[$page-1] = $height; 621 @$h[$page-1] = $height;
508 @$t[$page-1] = "rgb"; # Default is color 622 @$t[$page-1] = "rgb"; # Default is color
509 - @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]);  
510 @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); 623 @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]);
  624 + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]);
  625 + @$t[$page-1] = ( $type eq "stencil" ? $type : @$t[$page-1]);
  626 + @$t[$page-1] = ( $enc eq "image" ? $enc : @$t[$page-1]);
  627 + @$x_ppi[$page-1] = $xppi;
  628 + @$y_ppi[$page-1] = $yppi;
511 } 629 }
512 } 630 }
513 return $i+1; 631 return $i+1;
@@ -542,6 +660,19 @@ sub get_res { @@ -542,6 +660,19 @@ sub get_res {
542 return ($res_x,$res_y); 660 return ($res_x,$res_y);
543 } 661 }
544 662
  663 +sub get_sign {
  664 + my ($in_file) = @_;
  665 + my @lines = `${PDFSIG} \"${in_file}\" 2>/dev/null`;
  666 +
  667 + foreach (@lines) {
  668 + chomp;
  669 + if ( $_ =~ /^Signature/ ) {
  670 + return 1;
  671 + }
  672 + }
  673 + return 0;
  674 +}
  675 +
545 sub is_locked_ex { 676 sub is_locked_ex {
546 my ($path) = @_; 677 my ($path) = @_;
547 678
workflow.pdf
No preview for this file type
workflow.vsd
No preview for this file type