Commit d2b74559c9a3595de14ad8ce6ba843d92f505213
1 parent
78ec197b
Exists in
master
and in
1 other branch
Pré versão 2.0 a ser liberada - PARA O CANTONI INCLUIR O Container Docker
Showing
5 changed files
with
440 additions
and
271 deletions
Show diff stats
INSTALL.txt
... | ... | @@ -1,202 +0,0 @@ |
1 | -# OCR Server 1.0.4b - (c) Agencia Nacional de Telecomunicacoees | |
2 | -# | |
3 | -# This script monitors a set of input directories for PDF files | |
4 | -# once a new file is detected, it is processes through tesseract OCR | |
5 | -# in order to generate a new file with a hidden searchable text layer | |
6 | -# | |
7 | -# It may be distributed under the conditions of the LGPL v2.1 license. | |
8 | -# | |
9 | -# Author: Guilherme Chehab | |
10 | -# | |
11 | -# Version History: | |
12 | -# 0.1 Initial single server version | |
13 | -# 0.2 Check if page already has the html hidden layer, if so, ignore it | |
14 | -# 0.3 Solved issues about various image enconding types | |
15 | -# 0.4 Added a postnormalization step to ensure all output pdf pages have | |
16 | -# the same size and orientations as the original files | |
17 | -# 0.5 Used input file renaming as a way to sync multiple parallel instances, | |
18 | -# that way, it is minimized the risk of same file being OCRed multiple times. | |
19 | -# 0.6 Added a default handler for unknown image encoding using jpeg encoding | |
20 | -# 0.7 Solved an issue with files with more than 1000 pages | |
21 | -# 1.0 First release version | |
22 | -# 1.0.1 Solving error when file has no images | |
23 | -# 1.0.2 Fix bug when counting cores for AMD processors | |
24 | -# 1.0.3 Added better image type detection | |
25 | -# 1.0.4 Fix: added ubuntu init script | |
26 | -# 1.0.4b Centos 6.9 | |
27 | -# | |
28 | -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it | |
29 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | |
30 | -# diferently but does not treat it adequately | |
31 | -# - Review poppler and cpdf install instructions | |
32 | -# - Add better handling of vectorized and non scanned pdf files | |
33 | -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | |
34 | -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible | |
35 | -# | |
36 | -# Check software requirements on the comments bellow | |
37 | -# | |
38 | -# To configure input dirs change @BASE_DIRS and @SUB_DIRS variables | |
39 | -# | |
40 | -# | |
41 | -# O servidor OCR depende dos seguintes componentes: | |
42 | -# - Perl 5.10.1, com seguintes módulos: | |
43 | -# - File::Find::Rule | |
44 | -# - File::Basename | |
45 | -# - File::Copy | |
46 | -# - File::Path | |
47 | -# - File::Touch | |
48 | -# - Sys::Syslog | |
49 | -# - Sys::Hostname | |
50 | -# - IPC::Open3 | |
51 | -# - IO::Select | |
52 | -# - POSIX | |
53 | -# - Tesseract-ocr 3.05, com dicionários inglês e português | |
54 | -# - Pdftk 2.02 | |
55 | -# - Poppler-utils 0.42.0 | |
56 | -# - Cpdf 2.1 | |
57 | -# - ImageMagick 6.7.2-7 | |
58 | -# | |
59 | -# Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema | |
60 | -# Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento. | |
61 | -# Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes. | |
62 | -# | |
63 | -## ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado. | |
64 | -# | |
65 | -# Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr': | |
66 | -# | |
67 | -# @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script | |
68 | -# @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro | |
69 | -# $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2) | |
70 | -# $MAX_PGS: Número máximo de páginas que podem ser processadas simultanemante por arquivo de entrada (default: no. de CPUs) | |
71 | -# Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS. | |
72 | -# Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos. | |
73 | -# A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página. | |
74 | -# | |
75 | -# Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS. | |
76 | -# | |
77 | -# | |
78 | -# ----------------------- COMPILAÇÃO dos pré requisitos (obs.: os comandos de devem ser executados como root) | |
79 | -# | |
80 | -# | |
81 | -# Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS | |
82 | -# | |
83 | -# RedHat 6.7 e Centos 6.9: | |
84 | -yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip | |
85 | -yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel | |
86 | -cd /tmp | |
87 | -wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm | |
88 | -rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm | |
89 | -rm -f msttcore-fonts-2.0-3.noarch.rpm | |
90 | - | |
91 | -# Centos 6.9 | |
92 | -# \_ autoconf-archive | |
93 | -wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm | |
94 | -rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm | |
95 | -rm autoconf-archive-2012.04.07-7.3.noarch.rpm | |
96 | -# \_ GCC 4.8 | |
97 | -wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo | |
98 | -yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj | |
99 | - | |
100 | -# Ubuntu 14.04 Server: | |
101 | -apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 | |
102 | -apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev | |
103 | -apt-get install ttf-mscorefonts-installer | |
104 | - | |
105 | -# Ambas plataformas: | |
106 | -cd /usr/local/src | |
107 | - | |
108 | -for i in \ | |
109 | - https://github.com/tesseract-ocr/langdata.git \ | |
110 | - https://github.com/DanBloomberg/leptonica.git \ | |
111 | - https://github.com/libav/libav.git \ | |
112 | - https://github.com/tesseract-ocr/tessdata.git \ | |
113 | - https://github.com/tesseract-ocr/tesseract.git \ | |
114 | - git://git.freedesktop.org/git/poppler/poppler.git \ | |
115 | - git://git.freedesktop.org/git/poppler/test.git \ | |
116 | - https://github.com/Flameeyes/unpaper.git \ | |
117 | - https://github.com/ocaml/ocaml.git \ | |
118 | - https://gitlab.camlcity.org/gerd/lib-findlib.git \ | |
119 | - https://github.com/johnwhitington/camlpdf.git \ | |
120 | - https://github.com/johnwhitington/cpdf-source.git \ | |
121 | -; do git clone $i; done | |
122 | - | |
123 | -wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip | |
124 | -unzip pdftk-2.02-src.zip | |
125 | -rm -f pdftk-2.02-src.zip | |
126 | - | |
127 | -# pdftk, versão 2.02 ou superior | |
128 | -cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../.. | |
129 | - | |
130 | -# Centos 6.9 | |
131 | -# \_ Cria um novo shell usando o GCC 4.8 por default | |
132 | -scl enable devtoolset-2 bash | |
133 | - | |
134 | -# Tesseract, versão 3.05-dev ou superior | |
135 | -# Bibliotecas para o Tesseract: Leptonica e Libav | |
136 | -cd leptonica && ./autobuild && ./configure && make all install && cd .. | |
137 | - | |
138 | -# Para compilação do Tesseract após a compilação do leptonica | |
139 | -export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ | |
140 | - | |
141 | -cd libav && ./configure --enable-sram && make all install && cd .. | |
142 | - | |
143 | -# Tesseract | |
144 | -cd tesseract && ./autogen.sh && ./configure && make all install && cd .. | |
145 | -cp -avR tessdata/* /usr/local/share/tessdata/ | |
146 | - | |
147 | -# cpdf, versão 2.1 ou superior | |
148 | -cd ocaml && ./configure && make world.opt && make install && cd .. | |
149 | -mkdir -p /usr/local/man/man5 | |
150 | -# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente | |
151 | -cd lib-findlib && ./configure && make all && make install && cd .. | |
152 | -cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd .. | |
153 | -cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd .. | |
154 | - | |
155 | -# poppler-utils, versão 0.42.0 ou superior | |
156 | -cd poppler && ./autogen.sh && ./configure && make all install && cd .. | |
157 | - | |
158 | -# Centos 6.9 | |
159 | -# \_ Termina o shell usando o GCC 4.8 por default | |
160 | -exit | |
161 | - | |
162 | -# ----------------------- INSTALAÇÃO (obs.: os comandos de devem ser executados como root) | |
163 | - | |
164 | -## Comandos adicionais para configuração do módulo: | |
165 | - | |
166 | -# Criação do usuário | |
167 | -adduser ocr | |
168 | - | |
169 | -# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional | |
170 | -cp ./usr/local/bin/ocr /usr/local/bin | |
171 | - | |
172 | -# Auto start (RedHat 6.7 e CentOs 6.9) | |
173 | -cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr | |
174 | -mv /etc | |
175 | -chkconfig --add ocr | |
176 | -chkconfig --level 2345 ocr on | |
177 | - | |
178 | -# Auto start (Ubuntu 14.04) | |
179 | -cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr | |
180 | -update-rd.d ocr defaults | |
181 | - | |
182 | -# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações | |
183 | -cd /home/ocr | |
184 | -tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr | |
185 | -su | |
186 | - | |
187 | -# Copie o pacote para os outros servidores e extraia com: | |
188 | -cd / | |
189 | -tar xovzf pkg-ocr.tgz | |
190 | - | |
191 | -# Instalando pré-requisitos RUNTIME em servidores adicionais | |
192 | - | |
193 | -# Redhat 6.7 e CentOS 6.9 | |
194 | -yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp | |
195 | -yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext | |
196 | - | |
197 | -# Ubuntu 14.04 | |
198 | -apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 | |
199 | -apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 | |
200 | - | |
201 | -# Inicie o serviço com | |
202 | -service ocr start |
... | ... | @@ -0,0 +1,240 @@ |
1 | +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes | |
2 | + | |
3 | +This script monitors a set of input directories for PDF files. Once a new file is detected, it is processed through Tesseract OCR in order to generate a new file with a hidden searchable text layer. | |
4 | + | |
5 | +It may be distributed under the conditions of the LGPL v2.1 license. | |
6 | + | |
7 | +Author: Guilherme Chehab | |
8 | + | |
9 | +## Version History: | |
10 | + - 0.1 | |
11 | + - Initial single server version | |
12 | + - 0.2 | |
13 | + - Check if page already has the html hidden layer, if so, ignore it | |
14 | + - 0.3 | |
15 | + - Solved issues about various image encoding types | |
16 | + - 0.4 | |
17 | + - Added a postnormalization step to ensure all output pdf pages have the same size and orientations as the original files | |
18 | + - 0.5 | |
19 | + - Used input file renaming as a way to sync multiple parallel instances, that way, it is minimized the risk of same file being OCRed multiple times. | |
20 | + - 0.6 | |
21 | + - Added a default handler for unknown image encoding using jpeg encoding | |
22 | + - 0.7 | |
23 | + - Solved an issue with files with more than 1000 pages | |
24 | + - 1.0 | |
25 | + - First release version | |
26 | + - 1.0.1 Solving error when file has no images | |
27 | + - 1.0.2 Fix bug when counting cores for AMD processors | |
28 | + - 1.0.3 Added better image type detection | |
29 | + - 1.0.4 Fix: added ubuntu init script | |
30 | + - 1.0.4b Add Centos 6.9 install instructions | |
31 | + - 2.0 | |
32 | + - PDF/A output, and better compression with ghostscript | |
33 | + - Rewritten image extraction, processing and transformation process | |
34 | + - Check if input file is signed; in this case, the file contents are not changed | |
35 | + - Added '--oem 0' option to tesseract (force legacy mode on tesseract 4) | |
36 | + - Use operating system packages by default | |
37 | + - Changed paths from external programs, instead of using full paths, uses first match from $PATH | |
38 | + - Check existence of external programs on path before running | |
39 | + - Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings | |
40 | + - Fix: create subpaths on error folder | |
41 | + - Fix: trying to reduce overhead on temporary folder | |
42 | + - TODO: | |
43 | + - Change get_imgs and OCR processing to support pages with more than one image -- this does not work on previous versions, which assumed #pages = #imgs. Version 1.0.1 counts them differently but does not handle them adequately -- this will require better handling of the PDF's internal structure | |
44 | + - Review poppler and cpdf install instructions | |
45 | + - Add better handling of vectorized and non scanned pdf files | |
46 | + - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current scaling, cropping and rotation handlers | |
47 | + - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- added function to analyse image color histogram -> just need to add option to convert it to B&W. | |
48 | + - Move all parameters to config file | |
49 | + - Add some job control web interface | |
50 | + - Add end user interface to submit files through web | |
51 | + - Add check external programs version requirements before running | |
52 | + - BUGS: | |
53 | + - When the image is of type stencil or encoding image, cropping information is lost and the page is rendered differently from the original; this is due to using pdftoppm instead of pdfimages | |
54 | + | |
55 | +## Check software requirements in the comments below | |
56 | + | |
57 | +To configure input dirs change @BASE_DIRS and @SUB_DIRS variables | |
58 | + | |
59 | +### O servidor OCR depende dos seguintes componentes: | |
60 | + - Perl 5.10.1, com seguintes módulos: | |
61 | + - File::Find::Rule | |
62 | + - File::Basename | |
63 | + - File::Copy | |
64 | + - File::Path | |
65 | + - File::Touch | |
66 | + - Sys::Syslog | |
67 | + - Sys::Hostname | |
68 | + - IPC::Open3 | |
69 | + - IO::Select | |
70 | + - POSIX | |
71 | + - Tesseract-ocr 3.05, com dicionários inglês e português | |
72 | + - Pdftk 2.02 | |
73 | + - Poppler-utils 0.42.0 | |
74 | + - Cpdf 2.1 | |
75 | + - ImageMagick 6.7.2-7 | |
76 | + - Ghostscript 9.18 | |
77 | + | |
78 | +Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes pode comprometer o correto funcionamento do sistema. | |
79 | + | |
80 | +Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento. | |
81 | + | |
82 | +Esse arquivo contém informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes. | |
83 | + | |
84 | +ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado. | |
85 | + | |
86 | +### Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr': | |
87 | + | |
88 | +- @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script | |
89 | +- @SUB_DIRS: Subdiretórios de entrada, saída, backup dos arquivos originais, temporário e de arquivos com erro | |
90 | +- $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2) | |
91 | +- $MAX_PGS: Número máximo de páginas que podem ser processadas simultaneamente por arquivo de entrada (default: no. de CPUs) | |
92 | + | |
93 | +Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS. | |
94 | + | |
95 | +Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos. | |
96 | + | |
97 | +A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página. | |
98 | + | |
99 | +Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS. | |
100 | + | |
101 | +# COMPILAÇÃO dos pré requisitos (obs.: os comandos devem ser executados como root) | |
102 | + | |
103 | +Em servidor Ubuntu 16.04, os pacotes padrão (com exceção do CPDF, que não tem no repositório oficial) | |
104 | +são suficientes para executar o aplicativo, não havendo necessidade de compilar todos, assim é a arquitetura recomendada | |
105 | + | |
106 | +Quanto ao CPDF, é possível baixar a versão binária em: https://github.com/coherentgraphics/cpdf-binaries | |
107 | + | |
108 | +## Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS | |
109 | + | |
110 | +### RedHat 6.7 e Centos 6.9: | |
111 | + yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip | |
112 | + yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel libcurl-devel nss-devel | |
113 | + cd /tmp | |
114 | + wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm | |
115 | + rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm | |
116 | + rm -f msttcore-fonts-2.0-3.noarch.rpm | |
117 | + | |
118 | +### Centos 6.9 | |
119 | +# \_ autoconf-archive | |
120 | + wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm | |
121 | + rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm | |
122 | + rm autoconf-archive-2012.04.07-7.3.noarch.rpm | |
123 | +# \_ GCC 4.8 | |
124 | + wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo | |
125 | + yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj | |
126 | + | |
127 | +# Ubuntu 14.04 Server: | |
128 | + apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 | |
129 | + apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libcurl-dev libnss3-dev | |
130 | + apt-get install ttf-mscorefonts-installer | |
131 | + | |
132 | +# Ambas plataformas: | |
133 | + cd /usr/local/src | |
134 | + | |
135 | + for i in \ | |
136 | + https://github.com/tesseract-ocr/langdata.git \ | |
137 | + https://github.com/DanBloomberg/leptonica.git \ | |
138 | + https://github.com/libav/libav.git \ | |
139 | + https://github.com/tesseract-ocr/tessdata.git \ | |
140 | + https://github.com/tesseract-ocr/tesseract.git \ | |
141 | + git://git.freedesktop.org/git/poppler/poppler.git \ | |
142 | + git://git.freedesktop.org/git/poppler/test.git \ | |
143 | + https://github.com/Flameeyes/unpaper.git \ | |
144 | + https://github.com/ocaml/ocaml.git \ | |
145 | + https://gitlab.camlcity.org/gerd/lib-findlib.git \ | |
146 | + https://github.com/johnwhitington/camlpdf.git \ | |
147 | + https://github.com/johnwhitington/cpdf-source.git \ | |
148 | + http://git.ghostscript.com/ghostpdl.git \ | |
149 | + ; do git clone $i; done | |
150 | + | |
151 | + wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip | |
152 | + unzip pdftk-2.02-src.zip | |
153 | + rm -f pdftk-2.02-src.zip | |
154 | + | |
155 | +# pdftk, versão 2.02 ou superior | |
156 | +cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../.. | |
157 | + | |
158 | +# Ghostscript 9.18 ou superior | |
159 | +#wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz | |
160 | +#tar xvozf ghostscript-9.21.tar.gz | |
161 | +#rm -f ghostscript-9.21.tar.gz | |
162 | +#cd ghostscript-9.21 | |
163 | +cd ghostpdl | |
164 | +./autogen.sh; ./configure | |
165 | +make all install | |
166 | +cd .. | |
167 | + | |
168 | +# Centos 6.9 | |
169 | +# \_ Cria um novo shell usando o GCC 4.8 por default | |
170 | +scl enable devtoolset-2 bash | |
171 | + | |
172 | +# Tesseract, versão 3.05-dev ou superior | |
173 | +# Bibliotecas para o Tesseract: Leptonica e Libav | |
174 | +cd leptonica && ./autobuild && ./configure && make all install && cd .. | |
175 | + | |
176 | +# Para compilação do Tesseract após a compilação do leptonica | |
177 | +export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ | |
178 | + | |
179 | +cd libav && ./configure --enable-sram && make all install && cd .. | |
180 | + | |
181 | +# Tesseract | |
182 | +cd tesseract && ./autogen.sh && ./configure && make all install && cd .. | |
183 | +cp -avR tessdata/* /usr/local/share/tessdata/ | |
184 | + | |
185 | +# cpdf, versão 2.1 ou superior | |
186 | +cd ocaml && ./configure && make world.opt && make install && cd .. | |
187 | +mkdir -p /usr/local/man/man5 | |
188 | +# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente | |
189 | +cd lib-findlib && ./configure && make all && make install && cd .. | |
190 | +cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd .. | |
191 | +cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd .. | |
192 | + | |
193 | +# poppler-utils, versão 0.42.0 ou superior | |
194 | +cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && cd .. | |
195 | + | |
196 | +# Centos 6.9 | |
197 | +# \_ Termina o shell usando o GCC 4.8 por default | |
198 | +exit | |
199 | + | |
200 | +# ----------------------- INSTALAÇÃO (obs.: os comandos devem ser executados como root) | |
201 | + | |
202 | +## Comandos adicionais para configuração do módulo: | |
203 | + | |
204 | +# Criação do usuário | |
205 | +adduser ocr | |
206 | + | |
207 | +# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional | |
208 | +cp ./usr/local/bin/ocr /usr/local/bin | |
209 | + | |
210 | +# Auto start (RedHat 6.7 e CentOs 6.9) | |
211 | +cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr | |
212 | +mv /etc | |
213 | +chkconfig --add ocr | |
214 | +chkconfig --level 2345 ocr on | |
215 | + | |
216 | +# Auto start (Ubuntu 14.04) | |
217 | +cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr | |
218 | +update-rc.d ocr defaults | |
219 | + | |
220 | +# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações | |
221 | +cd /home/ocr | |
222 | +tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr | |
223 | +su | |
224 | + | |
225 | +# Copie o pacote para os outros servidores e extraia com: | |
226 | +cd / | |
227 | +tar xovzf pkg-ocr.tgz | |
228 | + | |
229 | +# Instalando pré-requisitos RUNTIME em servidores adicionais | |
230 | + | |
231 | +# Redhat 6.7 e CentOS 6.9 | |
232 | +yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp ghostscript | |
233 | +yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext | |
234 | + | |
235 | +# Ubuntu 14.04 | |
236 | +apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 | |
237 | +apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 ghostscript | |
238 | + | |
239 | +# Inicie o serviço com | |
240 | +service ocr start | ... | ... |
usr/local/bin/ocr
1 | -#! /usr/bin/perl -w | |
1 | +#!/usr/bin/perl -w | |
2 | 2 | # |
3 | -# OCR Server 1.0.4 - (c) Agencia Nacional de Telecomunicacoes | |
3 | +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes | |
4 | 4 | # |
5 | 5 | # This script monitors a set of input directories for PDF files |
6 | 6 | # once a new file is detected, it is processes through tesseract OCR |
... | ... | @@ -24,15 +24,38 @@ |
24 | 24 | # 1.0.1 Solving error when file has no images |
25 | 25 | # 1.0.2 Fix bug when counting cores for AMD processors |
26 | 26 | # 1.0.3 Added better image type detection |
27 | -# 1.0.4 Fix: added ubuntu init script | |
27 | +# 1.0.4 Fix: added ubuntu init script | |
28 | +# 1.0.4b Add Centos 6.9 install instructions | |
29 | +# 2.0 PDF/A output, and better compression with ghostscript --> for this to work, Tesseract 4.0 is | |
30 | +# strongly recommended | |
31 | +# Rewritten image extraction, processing and transformation process | |
32 | +# Check if input file is signed; in this case, the file contents are not changed | |
33 | +# Added '--oem 0' option to tesseract (force legacy mode on tesseract 4) | |
34 | +# Use operating system packages by default | |
35 | +# Changed paths from external programs, instead of using full paths, uses first match from $PATH | |
36 | +# Check existence of external programs on path before running | |
37 | +# Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings | |
38 | +# Fix: create subpaths on error folder | |
39 | +# Fix: trying to reduce overhead on temporary folder | |
28 | 40 | # |
29 | 41 | # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it |
30 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | |
31 | -# diferently but does not treat it adequately | |
42 | +# would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them | |
43 | +# differently but does not handle them adequately -- this will require better handling of the PDF's internal structure | |
32 | 44 | # - Review poppler and cpdf install instructions |
33 | 45 | # - Add better handling of vectorized and non scanned pdf files |
34 | -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | |
35 | -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible | |
46 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current | |
47 | +# scaling, cropping and rotation handlers | |
48 | +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- | |
49 | +# added function to analyse image color histogram -> just need to add option to convert it to B&W. | |
50 | +# - Move all parameters to config file | |
51 | +# - Add some job control web interface | |
52 | +# - Add end user interface to submit files through web | |
53 | +# - Add check external programs version requirements before running | |
54 | +# | |
55 | +# BUGS: - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than | |
56 | +# original, this is due to using pdftoppm instead of pdfimages | |
57 | +# - Although not strictly a BUG, in the new version the addition of a step to convert to PDF/A and other changes | |
58 | +# significantly increased the time to OCR a page, from a mean time of 1 sec/page to 3 secs/page on a 16-core server | |
36 | 59 | # |
37 | 60 | # Check software requirements in the comments below |
38 | 61 | # |
... | ... | @@ -54,8 +77,8 @@ use Sys::Hostname; |
54 | 77 | use IPC::Open3; |
55 | 78 | use IO::Select; |
56 | 79 | |
57 | -my $DEBUG = 0; | |
58 | -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); | |
80 | +my $DEBUG = 2; | |
81 | +my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); | |
59 | 82 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; |
60 | 83 | |
61 | 84 | my $USER = 'ocr'; |
... | ... | @@ -63,23 +86,28 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca |
63 | 86 | |
64 | 87 | # Command dependencies |
65 | 88 | |
66 | -# depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher | |
67 | -my $TESSERACT = '/usr/local/bin/tesseract -l por+eng'; | |
89 | +# depends on tesseract-ocr and tesseract-ocr-por 3.05-dev or higher -- for PDF/A, Tesseract 4.0 is recommended | |
90 | +my $TESSERACT = 'tesseract --oem 0'; # if Tesseract => 4.0 | |
91 | +#my $TESSERACT = 'tesseract'; # if Tesseract < 4.0 | |
68 | 92 | |
69 | 93 | # Depends on pdftk 2.02 or higher |
70 | -my $PDFTK = '/usr/local/bin/pdftk'; | |
94 | +my $PDFTK = 'pdftk'; | |
71 | 95 | |
72 | 96 | # Depends on poppler-utils 0.42.0 or higher |
73 | -#my $PDINFO = '/usr/local/bin/pdfinfo'; | |
74 | -my $PDFFONTS = '/usr/local/bin/pdffonts'; | |
75 | -my $PDFIMAGES = '/usr/local/bin/pdfimages'; | |
76 | -my $PDFTOPPM = '/usr/local/bin/pdftoppm'; | |
97 | +my $PDFFONTS = 'pdffonts'; | |
98 | +my $PDFIMAGES = 'pdfimages'; | |
99 | +my $PDFTOPPM = 'pdftoppm'; | |
100 | +my $PDFUNITE = 'pdfunite'; | |
101 | +my $PDFSIG = 'pdfsig'; | |
77 | 102 | |
78 | 103 | # Depends on cpdf 2.1 or higher |
79 | -my $CPDF = '/usr/local/bin/cpdf'; | |
104 | +my $CPDF = 'cpdf'; | |
105 | + | |
106 | +# Depends on Ghostscript 9.18 | |
107 | +my $GS = 'gs'; | |
80 | 108 | |
81 | 109 | ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner |
82 | -my $CONVERT = '/usr/bin/convert'; | |
110 | +my $CONVERT = 'convert'; | |
83 | 111 | |
84 | 112 | # If further filtering is needed |
85 | 113 | #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; |
... | ... | @@ -90,12 +118,14 @@ my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', |
90 | 118 | my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' ); |
91 | 119 | |
92 | 120 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); |
93 | -%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); | |
121 | +%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG==2); | |
94 | 122 | |
95 | 123 | # Safeguard in case cpuinfo has not correctly identified the number of CPUs |
96 | 124 | $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; |
97 | 125 | |
98 | -$ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; | |
126 | +$ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin'; | |
127 | +$ENV{'IFS'} = '\t\n'; | |
128 | + | |
99 | 129 | my ($host) = split/\./,hostname; |
100 | 130 | |
101 | 131 | use vars qw/*name *dir *prune/; |
... | ... | @@ -107,14 +137,15 @@ sub main; |
107 | 137 | sub get_pages; |
108 | 138 | sub get_rotation; |
109 | 139 | sub get_res; |
110 | -sub is_ocred; | |
111 | 140 | sub is_locked_ex; |
112 | 141 | |
113 | 142 | |
114 | 143 | my $expr = 'use POSIX qw(setsid)'; |
115 | 144 | |
116 | 145 | my ($dumb1, $dumb2, $uid) = getpwnam ($USER); |
117 | -setuid ($uid) or warn "Cant set uid $uid"; | |
146 | +if (defined $uid) { | |
147 | + setuid ($uid) or warn "Cant set uid $uid"; | |
148 | +} | |
118 | 149 | |
119 | 150 | $SIG{__DIE__} = 'DEFAULT'; |
120 | 151 | $SIG{__WARN__} = \&die_when_called; |
... | ... | @@ -126,6 +157,11 @@ if ($@) { |
126 | 157 | chdir('/') or die "$0: cannot chdir '/': $!\n"; |
127 | 158 | open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n"; |
128 | 159 | |
160 | +foreach my $exec ( $TESSERACT, $PDFTK, $PDFFONTS, $PDFIMAGES, $PDFSIG, $CPDF, $GS, $CONVERT) { | |
161 | + die "Error: $exec not found on path: $ENV{PATH}, check dependencies\n" if ( `which $exec | wc -l ` == 0); | |
162 | +} | |
163 | + | |
164 | + | |
129 | 165 | foreach my $DIR (@BASE_DIRS) { |
130 | 166 | |
131 | 167 | defined(my $pid = fork) or die "$0: cannot fork: $!\n"; |
... | ... | @@ -135,7 +171,7 @@ foreach my $DIR (@BASE_DIRS) { |
135 | 171 | main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR}); |
136 | 172 | exit 0; |
137 | 173 | last; |
138 | - } | |
174 | + } | |
139 | 175 | } |
140 | 176 | |
141 | 177 | exit 0; |
... | ... | @@ -157,7 +193,7 @@ sub main { |
157 | 193 | # remove .tmp file |
158 | 194 | unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) ); |
159 | 195 | |
160 | - # Rename files that were in 'processig' back | |
196 | + # Rename files that were in 'processing' state back | |
161 | 197 | foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) { |
162 | 198 | my $old_name = $file; |
163 | 199 | $old_name =~ s/\.${host}\.processing$//g; |
... | ... | @@ -177,12 +213,14 @@ sub main { |
177 | 213 | # Main loop |
178 | 214 | while ( 1 ) { |
179 | 215 | select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced |
216 | + | |
180 | 217 | $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} )); |
181 | 218 | print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in; |
182 | 219 | $count = scalar keys %files_in; |
183 | - foreach my $file (keys %files_in) { | |
184 | 220 | |
185 | - next if ( glob ("$file.*.tmp")); | |
221 | + foreach my $file (sort { ((-f $a) ? (stat $a)[9] : 0) <=> ((-f $b) ? (stat $b)[9] : 0)} keys %files_in ) { | |
222 | + | |
223 | + next if ( glob ("\"$file.*.tmp\"")); | |
186 | 224 | |
187 | 225 | select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds |
188 | 226 | next if (!defined $files_in{$file}); # continue only if it is still valid |
... | ... | @@ -255,7 +293,7 @@ sub ocr { |
255 | 293 | remove_tree ($tmpdir,{ error=> \my $dumb }); |
256 | 294 | unlink ("$in_file.$host.tmp"); |
257 | 295 | move ( "$in_file.$host.processing", $in_file); |
258 | - exit 0; | |
296 | + exit 1; | |
259 | 297 | }; |
260 | 298 | |
261 | 299 | my $out_path = $in_path; |
... | ... | @@ -271,7 +309,7 @@ sub ocr { |
271 | 309 | my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); |
272 | 310 | |
273 | 311 | print "\twritting to $out_file\n" if $DEBUG; |
274 | - | |
312 | + | |
275 | 313 | my $stime = time; |
276 | 314 | my %pids; |
277 | 315 | |
... | ... | @@ -291,8 +329,26 @@ sub ocr { |
291 | 329 | remove_tree ($tmpdir,{ error=> \my $dumb }); |
292 | 330 | unlink ("$in_file.$host.tmp"); |
293 | 331 | move ( "$in_file.$host.processing", $in_file); |
332 | + print "Error: cannot copy $in_file to temp dir \n" if $DEBUG; | |
333 | + syslog ("error","cannot copy $in_file to temp dir") if !$DEBUG; | |
334 | + exit 1; | |
294 | 335 | }; |
295 | 336 | |
337 | + # Check if file was signed | |
338 | + if (get_sign($tmp_file)) { | |
339 | + if (!copy ("$in_file.$host.processing", $proc_file)) { | |
340 | + remove_tree ($tmpdir,{ error=> \my $dumb }); | |
341 | + unlink ("$in_file.$host.tmp"); | |
342 | + move ( "$in_file.$host.processing", $in_file); | |
343 | + }; | |
344 | + move ("$in_file.$host.processing", $out_file); | |
345 | + unlink ("$in_file.$host.tmp"); | |
346 | + print "OCR processed: $in_file not OCRed due to having a signature within" if $DEBUG; | |
347 | + syslog ("info","OCR processed: $in_file not OCRed due to having a signature within") if !$DEBUG; | |
348 | + | |
349 | + exit 0; | |
350 | + } | |
351 | + | |
296 | 352 | # Extract pages |
297 | 353 | ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf"); |
298 | 354 | if ($DEBUG) { |
... | ... | @@ -301,12 +357,13 @@ sub ocr { |
301 | 357 | print "\t\t\t$_" for @err ; |
302 | 358 | }; |
303 | 359 | |
360 | + my ($pages, @pg_w, @pg_h, @pg_r, @pg_crop_x1, @pg_crop_y1, @pg_crop_x2, @pg_crop_y2); | |
361 | + $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r, \@pg_crop_x1, \@pg_crop_y1, \@pg_crop_x2, \@pg_crop_y2); | |
304 | 362 | |
305 | - my ($pages, @pg_w, @pg_h, @pg_r); | |
306 | - $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r); | |
363 | + my ($imgs,@page_img, @img_w, @img_h, @img_t, @img_xppi, @img_yppi); | |
364 | + $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t, \@img_xppi, \@img_yppi); | |
307 | 365 | |
308 | - my ($imgs,@page_img, @img_w, @img_h, @img_t); | |
309 | - $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t); | |
366 | + unlink ($tmp_file) if (!$DEBUG); | |
310 | 367 | |
311 | 368 | for ( my $i=0; $i< $pages; $i++ ) { |
312 | 369 | my $pg = sprintf ("pg_%06d", $i+1); |
... | ... | @@ -333,25 +390,29 @@ sub ocr { |
333 | 390 | if (! defined $img_t[$i] ) { |
334 | 391 | move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); |
335 | 392 | print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG; |
336 | - exit 0; | |
393 | + exit -1; | |
337 | 394 | } |
338 | 395 | |
339 | - print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i]\n" if $DEBUG; | |
396 | + print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i] " if $DEBUG; | |
397 | + print "(cropbox: $pg_crop_x1[$i] x $pg_crop_y1[$i] - $pg_crop_x2[$i] x $pg_crop_y2[$i])\n" if (defined $pg_crop_x1[$i] && $DEBUG); | |
398 | + print "\n" if ($DEBUG); | |
340 | 399 | |
400 | + # Extract images from page, since 2.0 uses png lossless format regardless of original format or depth | |
341 | 401 | undef $cmd; |
342 | 402 | |
343 | - if ($img_t[$i] eq "gray") { | |
344 | - $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
403 | + # Use PDFIMAGES and JPEG by default | |
404 | + $cmd = "${PDFIMAGES} -j \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
405 | + | |
406 | + if ($img_t[$i] eq "stencil") { | |
407 | + $cmd = "${PDFTOPPM} -tiff -tiffcompression deflate -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
345 | 408 | } |
346 | 409 | |
347 | - if ($img_t[$i] eq "rgb") { | |
348 | - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
349 | - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM | |
410 | + if ($img_t[$i] eq "gray") { | |
411 | + $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
350 | 412 | } |
351 | 413 | |
352 | - if (!defined $cmd) { | |
353 | - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
354 | - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM | |
414 | + if ($img_t[$i] !~ /gray|rgb|stencil/) { | |
415 | + $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
355 | 416 | } |
356 | 417 | |
357 | 418 | ($exit,$cmd,@out,@err) = exec_cmd($cmd); |
... | ... | @@ -362,7 +423,13 @@ sub ocr { |
362 | 423 | }; |
363 | 424 | |
364 | 425 | # Process each resulting image for page pdf |
365 | - my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif)/i , in => ${tmpdir} )) ; | |
426 | + my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif|tiff|jpeg|jp2|jb2|png)/i , in => ${tmpdir} )) ; | |
427 | + | |
428 | + if (scalar @images == 0) { | |
429 | + move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); | |
430 | + print "\t\t${in_file}: ".(${i}+1)." / $pages: Page was not exported as a tesseract supported format -- not OCRing\n" if $DEBUG; | |
431 | + exit 0; | |
432 | + } | |
366 | 433 | |
367 | 434 | foreach my $image (@images) { |
368 | 435 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; |
... | ... | @@ -378,43 +445,65 @@ sub ocr { |
378 | 445 | print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; |
379 | 446 | } |
380 | 447 | } |
381 | - | |
382 | - # Check if page was rotated | |
383 | - if ($pg_r[$i]) { | |
384 | - print "\t\t\t${image} unrotate: $pg_r[$i] graus ".(${i}+1)." / $pages\n" if $DEBUG; | |
385 | - ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate $pg_r[$i] \"$image\""); | |
448 | + | |
449 | + # Check if page was rotated and extracted with pdftoppm | |
450 | + if ($cmd =~ /\Q$PDFTOPPM/ && $pg_r[$i]) { | |
451 | + print "\t\t\t${image} unrotate: -$pg_r[$i] degs ".(${i}+1)." / $pages\n" if $DEBUG; | |
452 | + ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate ". (360 - $pg_r[$i])." \"$image\""); | |
386 | 453 | if ($DEBUG) { |
387 | 454 | print "\t\t\t${image} -> $cmd: $exit\n"; |
388 | 455 | print "\t\t\t\t$_" for @out ; |
389 | 456 | print "\t\t\t\t$_" for @err ; |
390 | 457 | }; |
391 | 458 | } |
392 | - | |
459 | + | |
393 | 460 | # Filter ppm images, if needed |
394 | 461 | |
395 | 462 | # OCR ppm images to pdf pages |
396 | - ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} \"${image}\" \"${image}\" pdf"); | |
463 | + ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} -l por+eng \"${image}\" \"${image}\" pdf"); | |
397 | 464 | if ($DEBUG) { |
398 | 465 | print "\t\t\t${image} -> $cmd: $exit\n"; |
399 | 466 | print "\t\t\t\t$_" for @out ; |
400 | 467 | print "\t\t\t\t$_" for @err ; |
401 | 468 | }; |
469 | + unlink ("$image") if (!$DEBUG); | |
402 | 470 | |
403 | - # Scale to fit pdf | |
404 | - ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); | |
471 | + # Scale, crop and rotate to fit pdf | |
472 | + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); | |
405 | 473 | if ($DEBUG) { |
406 | 474 | print "\t\t\t${image} -> $cmd: $exit\n"; |
407 | 475 | print "\t\t\t\t$_" for @out ; |
408 | 476 | print "\t\t\t\t$_" for @err ; |
409 | 477 | }; |
478 | + unlink ("$image.pdf") if (!$DEBUG); | |
410 | 479 | |
480 | + if (defined $pg_crop_x1[$i]) { | |
481 | + # adjust cropbox | |
482 | + ($pg_crop_x1[$i], $pg_crop_y1[$i],$pg_crop_x2[$i],$pg_crop_y2[$i]) = ( | |
483 | + ($pg_crop_x1[$i]<$pg_crop_x2[$i]?$pg_crop_x1[$i]:$pg_crop_x2[$i]), | |
484 | + ($pg_crop_y1[$i]<$pg_crop_y2[$i]?$pg_crop_y1[$i]:$pg_crop_y2[$i]), | |
485 | + abs($pg_crop_x2[$i]-$pg_crop_x1[$i]),abs($pg_crop_y2[$i]- $pg_crop_y1[$i]) | |
486 | + ); | |
487 | + | |
488 | + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -crop \"$pg_crop_x1[$i] $pg_crop_y1[$i] $pg_crop_x2[$i] $pg_crop_y2[$i]\" \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf"); | |
489 | + if ($DEBUG) { | |
490 | + print "\t\t\t${image} -> $cmd: $exit\n"; | |
491 | + print "\t\t\t\t$_" for @out ; | |
492 | + print "\t\t\t\t$_" for @err ; | |
493 | + }; | |
494 | + } | |
495 | + | |
496 | + if ($pg_r[$i]) { | |
497 | + ($exit,$cmd, @out,@err) = exec_cmd( "${CPDF} -rotate $pg_r[$i] \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf"); | |
498 | + if ($DEBUG) { | |
499 | + print "\t\t\t${image} -> $cmd: $exit\n"; | |
500 | + print "\t\t\t\t$_" for @out ; | |
501 | + print "\t\t\t\t$_" for @err ; | |
502 | + }; | |
503 | + } | |
411 | 504 | |
412 | - unlink ("${tmpdir}/${pg}.pdf") if (!$DEBUG); | |
413 | - unlink ("$image.pdf") if (!$DEBUG); | |
414 | - move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}.pdf.old") if ($DEBUG); | |
415 | - unlink ("$image") if (!$DEBUG); | |
416 | 505 | } |
417 | - exit 0; | |
506 | + exit 1; | |
418 | 507 | } |
419 | 508 | } |
420 | 509 | |
... | ... | @@ -427,28 +516,51 @@ sub ocr { |
427 | 516 | |
428 | 517 | if (scalar @new_pages != $pages) { |
429 | 518 | print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); |
430 | - syslog ("info","OCR: $in_file, number of output pages differ") if (!$DEBUG); | |
519 | + syslog ("err","OCR: $in_file, number of output pages differ") if (!$DEBUG); | |
431 | 520 | unlink "$in_file.$host.tmp"; |
521 | + make_path ($error_path) if ( ! -d $error_path); | |
432 | 522 | move ("$in_file.$host.processing", $error_file); |
433 | - exit (0); | |
523 | + exit (1); | |
434 | 524 | } |
435 | 525 | |
436 | - # Merge resulting pdf pages to a single pdf | |
526 | + # Merge resulting pdf pages to a single pdf, convert to PDF/A and copy to output | |
437 | 527 | make_path ($out_path) if ( ! -d $out_path); |
438 | 528 | unlink $out_file if ( -f $out_file ); |
439 | - ($exit, $cmd, @out,@err) = exec_cmd("${PDFTK} \"${tmpdir}\"/pg_*-cpdf.pdf cat output \"${out_file}.tmp\" compress"); | |
529 | + | |
530 | + chdir (${tmpdir}); | |
531 | + ($exit, $cmd, @out,@err) = exec_cmd("${GS} -dQUIET -dBATCH -dNOPAUSE -dNOINTERPOLATE -dCompatibilityLevel=1.7 -dNumRenderingThreads=${MAX_PGS} -sDEVICE=pdfwrite -dAutoRotatePages=/None -sColorConversionStrategy=/RGB -sProcessColorModel=DeviceRGB -dAutoFilterColorImages=true -dAutoFilterGrayImages=true -dJPEGQ=95 -dPDFA=2 -dPDFACompatibilityPolicy=1 -sOutputFile=\"${tmp_file}\" pg_*-cpdf.pdf "); | |
440 | 532 | if ($DEBUG) { |
441 | 533 | print "\t\t${out_file} -> $cmd: $exit\n"; |
442 | 534 | print "\t\t\t$_" for @out ; |
443 | 535 | print "\t\t\t$_" for @err ; |
444 | 536 | }; |
537 | + if ($exit) { | |
538 | + unlink "$in_file.$host.tmp"; | |
539 | + unlink $out_file; | |
540 | + make_path ($error_path) if ( ! -d $error_path); | |
541 | + move ("$in_file.$host.processing", $error_file); | |
542 | + print "\t\t${out_file} -> Error concatenating pages and converting to PDF/A (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); | |
543 | + syslog ("err","OCR: $in_file, error concatenating pages and converting to PDF/A") if (!$DEBUG); | |
544 | + exit (1); | |
545 | + } | |
546 | + chdir ("/"); | |
547 | + | |
548 | + if (!copy (${tmp_file}, $out_file)) { | |
549 | + remove_tree ($tmpdir,{ error=> \my $dumb }); | |
550 | + unlink ("$in_file.$host.tmp"); | |
551 | + unlink $out_file; | |
552 | + make_path ($error_path) if ( ! -d $error_path); | |
553 | + move ("$in_file.$host.processing", $error_file); | |
554 | + print "Error: cannot copy temp file to $out_file \n" if $DEBUG; | |
555 | + syslog ("error","cannot copy temp file to $out_file") if !$DEBUG; | |
556 | + exit 1; | |
557 | + }; | |
445 | 558 | |
446 | 559 | make_path ($proc_path) if ( ! -d $proc_path); |
447 | 560 | unlink $proc_file if ( -f $proc_file ); |
448 | 561 | move ("$in_file.$host.processing", $proc_file); |
449 | 562 | move ("${out_file}.tmp", ${out_file}); |
450 | 563 | |
451 | - | |
452 | 564 | # Remove temp dir |
453 | 565 | remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG); |
454 | 566 | unlink $tmp_file if (!$DEBUG); |
... | ... | @@ -471,7 +583,7 @@ sub is_ocred { |
471 | 583 | } |
472 | 584 | |
473 | 585 | sub get_pages { |
474 | - my ($in_file, $w, $h, $r) = @_; | |
586 | + my ($in_file, $w, $h, $r, $x1, $y1, $x2, $y2) = @_; | |
475 | 587 | |
476 | 588 | my $pages=0; |
477 | 589 | my $i=0; |
... | ... | @@ -485,29 +597,35 @@ sub get_pages { |
485 | 597 | ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ ); |
486 | 598 | ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ ); |
487 | 599 | ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ ); |
488 | - ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); | |
600 | + ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); | |
601 | + ($dumb, @$x1[$i-1], @$y1[$i-1], @$x2[$i-1], @$y2[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaCropRect:/ ); | |
489 | 602 | } |
490 | 603 | |
491 | 604 | return $pages; |
492 | 605 | } |
493 | 606 | |
494 | 607 | sub get_imgs { |
495 | - my ($in_file, $page_img, $w, $h, $t) = @_; | |
496 | - my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc); | |
608 | + my ($in_file, $page_img, $w, $h, $t, $x_ppi, $y_ppi) = @_; | |
609 | + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi ); | |
497 | 610 | |
498 | 611 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); |
612 | + $i = 0; | |
499 | 613 | |
500 | 614 | foreach my $line (@lines) { |
501 | 615 | chomp $line; |
502 | 616 | $line =~ s/^ {1,}//; |
503 | - if ( $line =~ /image|mask/ ) { | |
504 | - ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line; | |
617 | + if ( $line !~ /^page|^----/ ) { | |
618 | + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi) = split / {1,}/,$line; | |
505 | 619 | @$page_img[$page-1]=$i; |
506 | 620 | @$w[$page-1] = $width; |
507 | 621 | @$h[$page-1] = $height; |
508 | 622 | @$t[$page-1] = "rgb"; # Default is color |
509 | - @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | |
510 | 623 | @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); |
624 | + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | |
625 | + @$t[$page-1] = ( $type eq "stencil" ? $type : @$t[$page-1]); | |
626 | + @$t[$page-1] = ( $enc eq "image" ? $enc : @$t[$page-1]); | |
627 | + @$x_ppi[$page-1] = $xppi; | |
628 | + @$y_ppi[$page-1] = $yppi; | |
511 | 629 | } |
512 | 630 | } |
513 | 631 | return $i+1; |
... | ... | @@ -542,6 +660,19 @@ sub get_res { |
542 | 660 | return ($res_x,$res_y); |
543 | 661 | } |
544 | 662 | |
663 | +sub get_sign { | |
664 | + my ($in_file) = @_; | |
665 | + my @lines = `${PDFSIG} \"${in_file}\" 2>/dev/null`; | |
666 | + | |
667 | + foreach (@lines) { | |
668 | + chomp; | |
669 | + if ( $_ =~ /^Signature/ ) { | |
670 | + return 1; | |
671 | + } | |
672 | + } | |
673 | + return 0; | |
674 | +} | |
675 | + | |
545 | 676 | sub is_locked_ex { |
546 | 677 | my ($path) = @_; |
547 | 678 | ... | ... |
workflow.pdf
No preview for this file type
workflow.vsd
No preview for this file type