Commit d2b74559c9a3595de14ad8ce6ba843d92f505213
1 parent
78ec197b
Exists in
master
and in
1 other branch
Pré versão 2.0 a ser liberada - PARA O CANTONI INCLUIR O Container Docker
Showing
5 changed files
with
440 additions
and
271 deletions
Show diff stats
INSTALL.txt
@@ -1,202 +0,0 @@ | @@ -1,202 +0,0 @@ | ||
1 | -# OCR Server 1.0.4b - (c) Agencia Nacional de Telecomunicacoees | ||
2 | -# | ||
3 | -# This script monitors a set of input directories for PDF files | ||
4 | -# once a new file is detected, it is processes through tesseract OCR | ||
5 | -# in order to generate a new file with a hidden searchable text layer | ||
6 | -# | ||
7 | -# It may be distributed under the conditions of the LGPL v2.1 license. | ||
8 | -# | ||
9 | -# Author: Guilherme Chehab | ||
10 | -# | ||
11 | -# Version History: | ||
12 | -# 0.1 Initial single server version | ||
13 | -# 0.2 Check if page already has the html hidden layer, if so, ignore it | ||
14 | -# 0.3 Solved issues about various image enconding types | ||
15 | -# 0.4 Added a postnormalization step to ensure all output pdf pages have | ||
16 | -# the same size and orientations as the original files | ||
17 | -# 0.5 Used input file renaming as a way to sync multiple parallel instances, | ||
18 | -# that way, it is minimized the risk of same file being OCRed multiple times. | ||
19 | -# 0.6 Added a default handler for unknown image encoding using jpeg encoding | ||
20 | -# 0.7 Solved an issue with files with more than 1000 pages | ||
21 | -# 1.0 First release version | ||
22 | -# 1.0.1 Solving error when file has no images | ||
23 | -# 1.0.2 Fix bug when counting cores for AMD processors | ||
24 | -# 1.0.3 Added better image type detection | ||
25 | -# 1.0.4 Fix: added ubuntu init script | ||
26 | -# 1.0.4b Centos 6.9 | ||
27 | -# | ||
28 | -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it | ||
29 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | ||
30 | -# diferently but does not treat it adequately | ||
31 | -# - Review poppler and cpdf install instructions | ||
32 | -# - Add better handling of vectorized and non scanned pdf files | ||
33 | -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | ||
34 | -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible | ||
35 | -# | ||
36 | -# Check software requirements on the comments bellow | ||
37 | -# | ||
38 | -# To configure input dirs change @BASE_DIRS and @SUB_DIRS variables | ||
39 | -# | ||
40 | -# | ||
41 | -# O servidor OCR depende dos seguintes componentes: | ||
42 | -# - Perl 5.10.1, com seguintes módulos: | ||
43 | -# - File::Find::Rule | ||
44 | -# - File::Basename | ||
45 | -# - File::Copy | ||
46 | -# - File::Path | ||
47 | -# - File::Touch | ||
48 | -# - Sys::Syslog | ||
49 | -# - Sys::Hostname | ||
50 | -# - IPC::Open3 | ||
51 | -# - IO::Select | ||
52 | -# - POSIX | ||
53 | -# - Tesseract-ocr 3.05, com dicionários inglês e português | ||
54 | -# - Pdftk 2.02 | ||
55 | -# - Poppler-utils 0.42.0 | ||
56 | -# - Cpdf 2.1 | ||
57 | -# - ImageMagick 6.7.2-7 | ||
58 | -# | ||
59 | -# Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema | ||
60 | -# Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento. | ||
61 | -# Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes. | ||
62 | -# | ||
63 | -## ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado. | ||
64 | -# | ||
65 | -# Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr': | ||
66 | -# | ||
67 | -# @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script | ||
68 | -# @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro | ||
69 | -# $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2) | ||
70 | -# $MAX_PGS: Número máximo de páginas que podem ser processadas simultanemante por arquivo de entrada (default: no. de CPUs) | ||
71 | -# Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS. | ||
72 | -# Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos. | ||
73 | -# A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página. | ||
74 | -# | ||
75 | -# Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS. | ||
76 | -# | ||
77 | -# | ||
78 | -# ----------------------- COMPILAÇÃO dos pré requisitos (obs.: os comandos de devem ser executados como root) | ||
79 | -# | ||
80 | -# | ||
81 | -# Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS | ||
82 | -# | ||
83 | -# RedHat 6.7 e Centos 6.9: | ||
84 | -yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip | ||
85 | -yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel | ||
86 | -cd /tmp | ||
87 | -wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm | ||
88 | -rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm | ||
89 | -rm -f msttcore-fonts-2.0-3.noarch.rpm | ||
90 | - | ||
91 | -# Centos 6.9 | ||
92 | -# \_ autoconf-archive | ||
93 | -wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
94 | -rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
95 | -rm autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
96 | -# \_ GCC 4.8 | ||
97 | -wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo | ||
98 | -yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj | ||
99 | - | ||
100 | -# Ubuntu 14.04 Server: | ||
101 | -apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 | ||
102 | -apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev | ||
103 | -apt-get install ttf-mscorefonts-installer | ||
104 | - | ||
105 | -# Ambas plataformas: | ||
106 | -cd /usr/local/src | ||
107 | - | ||
108 | -for i in \ | ||
109 | - https://github.com/tesseract-ocr/langdata.git \ | ||
110 | - https://github.com/DanBloomberg/leptonica.git \ | ||
111 | - https://github.com/libav/libav.git \ | ||
112 | - https://github.com/tesseract-ocr/tessdata.git \ | ||
113 | - https://github.com/tesseract-ocr/tesseract.git \ | ||
114 | - git://git.freedesktop.org/git/poppler/poppler.git \ | ||
115 | - git://git.freedesktop.org/git/poppler/test.git \ | ||
116 | - https://github.com/Flameeyes/unpaper.git \ | ||
117 | - https://github.com/ocaml/ocaml.git \ | ||
118 | - https://gitlab.camlcity.org/gerd/lib-findlib.git \ | ||
119 | - https://github.com/johnwhitington/camlpdf.git \ | ||
120 | - https://github.com/johnwhitington/cpdf-source.git \ | ||
121 | -; do git clone $i; done | ||
122 | - | ||
123 | -wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip | ||
124 | -unzip pdftk-2.02-src.zip | ||
125 | -rm -f pdftk-2.02-src.zip | ||
126 | - | ||
127 | -# pdftk, versão 2.02 ou superior | ||
128 | -cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../.. | ||
129 | - | ||
130 | -# Centos 6.9 | ||
131 | -# \_ Cria um novo shell usando o GCC 4.8 por default | ||
132 | -scl enable devtoolset-2 bash | ||
133 | - | ||
134 | -# Tesseract, versão 3.05-dev ou superior | ||
135 | -# Bibliotecas para o Tesseract: Leptonica e Libav | ||
136 | -cd leptonica && ./autobuild && ./configure && make all install && cd .. | ||
137 | - | ||
138 | -# Para compilação do Tesseract após a compilação do leptonica | ||
139 | -export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ | ||
140 | - | ||
141 | -cd libav && ./configure --enable-sram && make all install && cd .. | ||
142 | - | ||
143 | -# Tesseract | ||
144 | -cd tesseract && ./autogen.sh && ./configure && make all install && cd .. | ||
145 | -cp -avR tessdata/* /usr/local/share/tessdata/ | ||
146 | - | ||
147 | -# cpdf, versão 2.1 ou superior | ||
148 | -cd ocaml && ./configure && make world.opt && make install && cd .. | ||
149 | -mkdir -p /usr/local/man/man5 | ||
150 | -# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente | ||
151 | -cd lib-findlib && ./configure && make all && make install && cd .. | ||
152 | -cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd .. | ||
153 | -cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd .. | ||
154 | - | ||
155 | -# poppler-utils, versão 0.42.0 ou superior | ||
156 | -cd poppler && ./autogen.sh && ./configure && make all install && cd .. | ||
157 | - | ||
158 | -# Centos 6.9 | ||
159 | -# \_ Termina o shell usando o GCC 4.8 por default | ||
160 | -exit | ||
161 | - | ||
162 | -# ----------------------- INSTALAÇÃO (obs.: os comandos de devem ser executados como root) | ||
163 | - | ||
164 | -## Comandos adicionais para configuração do módulo: | ||
165 | - | ||
166 | -# Criação do usuário | ||
167 | -adduser ocr | ||
168 | - | ||
169 | -# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional | ||
170 | -cp ./usr/local/bin/ocr /usr/local/bin | ||
171 | - | ||
172 | -# Auto start (RedHat 6.7 e CentOs 6.9) | ||
173 | -cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr | ||
174 | -mv /etc | ||
175 | -chkconfig --add ocr | ||
176 | -chkconfig --level 2345 ocr on | ||
177 | - | ||
178 | -# Auto start (Ubuntu 14.04) | ||
179 | -cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr | ||
180 | -update-rd.d ocr defaults | ||
181 | - | ||
182 | -# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações | ||
183 | -cd /home/ocr | ||
184 | -tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr | ||
185 | -su | ||
186 | - | ||
187 | -# Copie o pacote para os outros servidores e extraia com: | ||
188 | -cd / | ||
189 | -tar xovzf pkg-ocr.tgz | ||
190 | - | ||
191 | -# Instalando pré-requisitos RUNTIME em servidores adicionais | ||
192 | - | ||
193 | -# Redhat 6.7 e CentOS 6.9 | ||
194 | -yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp | ||
195 | -yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext | ||
196 | - | ||
197 | -# Ubuntu 14.04 | ||
198 | -apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 | ||
199 | -apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 | ||
200 | - | ||
201 | -# Inicie o serviço com | ||
202 | -service ocr start |
@@ -0,0 +1,240 @@ | @@ -0,0 +1,240 @@ | ||
1 | +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes | ||
2 | + | ||
3 | +This script monitors a set of input directories for PDF files; once a new file is detected, it is processed through tesseract OCR in order to generate a new file with a hidden searchable text layer | ||
4 | + | ||
5 | +It may be distributed under the conditions of the LGPL v2.1 license. | ||
6 | + | ||
7 | +Author: Guilherme Chehab | ||
8 | + | ||
9 | +## Version History: | ||
10 | + - 0.1 | ||
11 | + - Initial single server version | ||
12 | + - 0.2 | ||
13 | + - Check if page already has the html hidden layer, if so, ignore it | ||
14 | + - 0.3 | ||
15 | + - Solved issues about various image encoding types | ||
16 | + - 0.4 | ||
17 | + - Added a postnormalization step to ensure all output pdf pages have the same size and orientations as the original files | ||
18 | + - 0.5 | ||
19 | + - Used input file renaming as a way to sync multiple parallel instances, that way, it is minimized the risk of same file being OCRed multiple times. | ||
20 | + - 0.6 | ||
21 | + - Added a default handler for unknown image encoding using jpeg encoding | ||
22 | + - 0.7 | ||
23 | + - Solved an issue with files with more than 1000 pages | ||
24 | + - 1.0 | ||
25 | + - First release version | ||
26 | + - 1.0.1 Solving error when file has no images | ||
27 | + - 1.0.2 Fix bug when counting cores for AMD processors | ||
28 | + - 1.0.3 Added better image type detection | ||
29 | + - 1.0.4 Fix: added ubuntu init script | ||
30 | + - 1.0.4b Add Centos 6.9 install instructions | ||
31 | + - 2.0 | ||
32 | + - PDF/A output, and better compression with ghostscript | ||
33 | + - Rewritten image extraction, processing and transformations process | ||
34 | + - Check if input file is signed, in this case, does not change the file contents | ||
35 | + - Added '--oem 0' option to tesseract (force legacy mode on tesseract 4) | ||
36 | + - Use operating system packages by default | ||
37 | + - Changed paths from external programs, instead of using full paths, uses first match from $PATH | ||
38 | + - Check existence of external programs on path before running | ||
39 | + - Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings | ||
40 | + - Fix: create subpaths on error folder | ||
41 | + - Fix: trying to reduce overhead on temporary folder | ||
42 | + - TODO: | ||
43 | + - Changes get_imgs and OCR processing to enable pages with more than one image -- it would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them differently but does not treat it adequately -- shall require better handling of the PDF's internal structure | ||
44 | + - Review poppler and cpdf install instructions | ||
45 | + - Add better handling of vectorized and non scanned pdf files | ||
46 | + - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current scaling, cropping and rotation handlers | ||
47 | + - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- added function to analyse image color histogram -> just need to add option to convert it to B&W. | ||
48 | + - Move all parameters to config file | ||
49 | + - Add some job control web interface | ||
50 | + - Add end user interface to submit files through web | ||
51 | + - Add check external programs version requirements before running | ||
52 | + - BUGS: | ||
53 | + - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than original, this is due to using pdftoppm instead of pdfimages | ||
54 | + | ||
55 | +## Check software requirements in the comments below | ||
56 | + | ||
57 | +To configure input dirs change @BASE_DIRS and @SUB_DIRS variables | ||
58 | + | ||
59 | +### O servidor OCR depende dos seguintes componentes: | ||
60 | + - Perl 5.10.1, com seguintes módulos: | ||
61 | + - File::Find::Rule | ||
62 | + - File::Basename | ||
63 | + - File::Copy | ||
64 | + - File::Path | ||
65 | + - File::Touch | ||
66 | + - Sys::Syslog | ||
67 | + - Sys::Hostname | ||
68 | + - IPC::Open3 | ||
69 | + - IO::Select | ||
70 | + - POSIX | ||
71 | + - Tesseract-ocr 3.05, com dicionários inglês e português | ||
72 | + - Pdftk 2.02 | ||
73 | + - Poppler-utils 0.42.0 | ||
74 | + - Cpdf 2.1 | ||
75 | + - ImageMagick 6.7.2-7 | ||
76 | + - Ghostscript 9.18 | ||
77 | + | ||
78 | +Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes pode comprometer o correto funcionamento do sistema. | ||
79 | + | ||
80 | +Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento. | ||
81 | + | ||
82 | +Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes. | ||
83 | + | ||
84 | +ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado. | ||
85 | + | ||
86 | +### Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr': | ||
87 | + | ||
88 | +- @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script | ||
89 | +- @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro | ||
90 | +- $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2) | ||
91 | +- $MAX_PGS: Número máximo de páginas que podem ser processadas simultaneamente por arquivo de entrada (default: no. de CPUs) | ||
92 | + | ||
93 | +Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS. | ||
94 | + | ||
95 | +Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos. | ||
96 | + | ||
97 | +A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página. | ||
98 | + | ||
99 | +Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS. | ||
100 | + | ||
101 | +# COMPILAÇÃO dos pré requisitos (obs.: os comandos devem ser executados como root) | ||
102 | + | ||
103 | +Em servidor Ubuntu 16.04, os pacotes padrão (com exceção do CPDF, que não tem no repositório oficial) | ||
104 | +são suficientes para executar o aplicativo, não havendo necessidade de compilar todos, assim é a arquitetura recomendada | ||
105 | + | ||
106 | +Quanto ao CPDF, é possível baixar a versão binária em: https://github.com/coherentgraphics/cpdf-binaries | ||
107 | + | ||
108 | +## Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS | ||
109 | + | ||
110 | +### RedHat 6.7 e Centos 6.9: | ||
111 | + yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip | ||
112 | + yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel libcurl-devel nss-devel | ||
113 | + cd /tmp | ||
114 | + wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm | ||
115 | + rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm | ||
116 | + rm -f msttcore-fonts-2.0-3.noarch.rpm | ||
117 | + | ||
118 | +### Centos 6.9 | ||
119 | +# \_ autoconf-archive | ||
120 | + wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
121 | + rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
122 | + rm autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
123 | +# \_ GCC 4.8 | ||
124 | + wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo | ||
125 | + yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj | ||
126 | + | ||
127 | +# Ubuntu 14.04 Server: | ||
128 | + apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 | ||
129 | + apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libcurl-dev libnss3-dev | ||
130 | + apt-get install ttf-mscorefonts-installer | ||
131 | + | ||
132 | +# Ambas plataformas: | ||
133 | + cd /usr/local/src | ||
134 | + | ||
135 | + for i in \ | ||
136 | + https://github.com/tesseract-ocr/langdata.git \ | ||
137 | + https://github.com/DanBloomberg/leptonica.git \ | ||
138 | + https://github.com/libav/libav.git \ | ||
139 | + https://github.com/tesseract-ocr/tessdata.git \ | ||
140 | + https://github.com/tesseract-ocr/tesseract.git \ | ||
141 | + git://git.freedesktop.org/git/poppler/poppler.git \ | ||
142 | + git://git.freedesktop.org/git/poppler/test.git \ | ||
143 | + https://github.com/Flameeyes/unpaper.git \ | ||
144 | + https://github.com/ocaml/ocaml.git \ | ||
145 | + https://gitlab.camlcity.org/gerd/lib-findlib.git \ | ||
146 | + https://github.com/johnwhitington/camlpdf.git \ | ||
147 | + https://github.com/johnwhitington/cpdf-source.git \ | ||
148 | + http://git.ghostscript.com/ghostpdl.git \ | ||
149 | + ; do git clone $i; done | ||
150 | + | ||
151 | + wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip | ||
152 | + unzip pdftk-2.02-src.zip | ||
153 | + rm -f pdftk-2.02-src.zip | ||
154 | + | ||
155 | +# pdftk, versão 2.02 ou superior | ||
156 | +cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../.. | ||
157 | + | ||
158 | +# Ghostscript 9.18 ou superior | ||
159 | +#wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz | ||
160 | +#tar xvozf ghostscript-9.18.tar.gz | ||
161 | +#rm -f ghostscript-9.18.tar.gz | ||
162 | +#cd ghostscript-9.18 | ||
163 | +cd ghostpdl | ||
164 | +./autogen.sh; ./configure | ||
165 | +make all install | ||
166 | +cd .. | ||
167 | + | ||
168 | +# Centos 6.9 | ||
169 | +# \_ Cria um novo shell usando o GCC 4.8 por default | ||
170 | +scl enable devtoolset-2 bash | ||
171 | + | ||
172 | +# Tesseract, versão 3.05-dev ou superior | ||
173 | +# Bibliotecas para o Tesseract: Leptonica e Libav | ||
174 | +cd leptonica && ./autobuild && ./configure && make all install && cd .. | ||
175 | + | ||
176 | +# Para compilação do Tesseract após a compilação do leptonica | ||
177 | +export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ | ||
178 | + | ||
179 | +cd libav && ./configure --enable-sram && make all install && cd .. | ||
180 | + | ||
181 | +# Tesseract | ||
182 | +cd tesseract && ./autogen.sh && ./configure && make all install && cd .. | ||
183 | +cp -avR tessdata/* /usr/local/share/tessdata/ | ||
184 | + | ||
185 | +# cpdf, versão 2.1 ou superior | ||
186 | +cd ocaml && ./configure && make world.opt && make install && cd .. | ||
187 | +mkdir -p /usr/local/man/man5 | ||
188 | +# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente | ||
189 | +cd lib-findlib && ./configure && make all && make install && cd .. | ||
190 | +cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd .. | ||
191 | +cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd .. | ||
192 | + | ||
193 | +# poppler-utils, versão 0.42.0 ou superior | ||
194 | +cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && cd .. | ||
195 | + | ||
196 | +# Centos 6.9 | ||
197 | +# \_ Termina o shell usando o GCC 4.8 por default | ||
198 | +exit | ||
199 | + | ||
200 | +# ----------------------- INSTALAÇÃO (obs.: os comandos devem ser executados como root) | ||
201 | + | ||
202 | +## Comandos adicionais para configuração do módulo: | ||
203 | + | ||
204 | +# Criação do usuário | ||
205 | +adduser ocr | ||
206 | + | ||
207 | +# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional | ||
208 | +cp ./usr/local/bin/ocr /usr/local/bin | ||
209 | + | ||
210 | +# Auto start (RedHat 6.7 e CentOs 6.9) | ||
211 | +cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr | ||
212 | +mv /etc | ||
213 | +chkconfig --add ocr | ||
214 | +chkconfig --level 2345 ocr on | ||
215 | + | ||
216 | +# Auto start (Ubuntu 14.04) | ||
217 | +cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr | ||
218 | +update-rc.d ocr defaults | ||
219 | + | ||
220 | +# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações | ||
221 | +cd /home/ocr | ||
222 | +tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr | ||
223 | +su | ||
224 | + | ||
225 | +# Copie o pacote para os outros servidores e extraia com: | ||
226 | +cd / | ||
227 | +tar xovzf pkg-ocr.tgz | ||
228 | + | ||
229 | +# Instalando pré-requisitos RUNTIME em servidores adicionais | ||
230 | + | ||
231 | +# Redhat 6.7 e CentOS 6.9 | ||
232 | +yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp ghostscript | ||
233 | +yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext | ||
234 | + | ||
235 | +# Ubuntu 14.04 | ||
236 | +apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 | ||
237 | +apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 ghostscript | ||
238 | + | ||
239 | +# Inicie o serviço com | ||
240 | +service ocr start |
usr/local/bin/ocr
1 | -#! /usr/bin/perl -w | 1 | +#!/usr/bin/perl -w |
2 | # | 2 | # |
3 | -# OCR Server 1.0.4 - (c) Agencia Nacional de Telecomunicacoes | 3 | +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes |
4 | # | 4 | # |
5 | # This script monitors a set of input directories for PDF files | 5 | # This script monitors a set of input directories for PDF files |
6 | # once a new file is detected, it is processes through tesseract OCR | 6 | # once a new file is detected, it is processes through tesseract OCR |
@@ -24,15 +24,38 @@ | @@ -24,15 +24,38 @@ | ||
24 | # 1.0.1 Solving error when file has no images | 24 | # 1.0.1 Solving error when file has no images |
25 | # 1.0.2 Fix bug when counting cores for AMD processors | 25 | # 1.0.2 Fix bug when counting cores for AMD processors |
26 | # 1.0.3 Added better image type detection | 26 | # 1.0.3 Added better image type detection |
27 | -# 1.0.4 Fix: added ubuntu init script | 27 | +# 1.0.4 Fix: added ubuntu init script |
28 | +# 1.0.4b Add Centos 6.9 install instructions | ||
29 | +# 2.0 PDF/A output, and better compression with ghostscript --> for this to work, Tesseract 4.0 is | ||
30 | +# strongly recommended | ||
31 | +# Rewritten image extraction, processing and transformations process | ||
32 | +# Check if input file is signed, in this case, does not change the file contents | ||
33 | +# Added '--oem 0' option to tesseract (force legacy mode on tesseract 4) | ||
34 | +# Use operating system packages by default | ||
35 | +# Changed paths from external programs, instead of using full paths, uses first match from $PATH | ||
36 | +# Check existence of external programs on path before running | ||
37 | +# Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings | ||
38 | +# Fix: create subpaths on error folder | ||
39 | +# Fix: trying to reduce overhead on temporary folder | ||
28 | # | 40 | # |
29 | # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it | 41 | # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it |
30 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | ||
31 | -# diferently but does not treat it adequately | 42 | +# would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them |
43 | +# differently but does not treat it adequately -- shall require better handling of the PDF's internal structure | ||
32 | # - Review poppler and cpdf install instructions | 44 | # - Review poppler and cpdf install instructions |
33 | # - Add better handling of vectorized and non scanned pdf files | 45 | # - Add better handling of vectorized and non scanned pdf files |
34 | -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | ||
35 | -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible | 46 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current |
47 | +# scaling, cropping and rotation handlers | ||
48 | +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- | ||
49 | +# added function to analyse image color histogram -> just need to add option to convert it to B&W. | ||
50 | +# - Move all parameters to config file | ||
51 | +# - Add some job control web interface | ||
52 | +# - Add end user interface to submit files through web | ||
53 | +# - Add check external programs version requirements before running | ||
54 | +# | ||
55 | +# BUGS: - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than | ||
56 | +# original, this is due to using pdftoppm instead of pdfimages | ||
57 | +# - Although not properly a BUG, in the new version, the addition of a step to convert to PDF/A and other evolutions | ||
58 | +# significantly increased the time to OCR a page, from a mean time of 1 sec/page to 3 secs/page on a 16-core server | ||
36 | # | 59 | # |
37 | # Check software requirements on the comments bellow | 60 | # Check software requirements on the comments bellow |
38 | # | 61 | # |
@@ -54,8 +77,8 @@ use Sys::Hostname; | @@ -54,8 +77,8 @@ use Sys::Hostname; | ||
54 | use IPC::Open3; | 77 | use IPC::Open3; |
55 | use IO::Select; | 78 | use IO::Select; |
56 | 79 | ||
57 | -my $DEBUG = 0; | ||
58 | -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); | 80 | +my $DEBUG = 2; |
81 | +my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); | ||
59 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; | 82 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; |
60 | 83 | ||
61 | my $USER = 'ocr'; | 84 | my $USER = 'ocr'; |
@@ -63,23 +86,28 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca | @@ -63,23 +86,28 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca | ||
63 | 86 | ||
64 | # Command dependencies | 87 | # Command dependencies |
65 | 88 | ||
66 | -# depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher | ||
67 | -my $TESSERACT = '/usr/local/bin/tesseract -l por+eng'; | 89 | +# depends on tesseract-ocr and tesseract-ocr-por 3.05-dev or higher -- for pdf/a Tesseract 4.0 is recommended |
90 | +my $TESSERACT = 'tesseract --oem 0'; # if Tesseract => 4.0 | ||
91 | +#my $TESSERACT = 'tesseract'; # if Tesseract < 4.0 | ||
68 | 92 | ||
69 | # Depends on pdftk 2.02 or higher | 93 | # Depends on pdftk 2.02 or higher |
70 | -my $PDFTK = '/usr/local/bin/pdftk'; | 94 | +my $PDFTK = 'pdftk'; |
71 | 95 | ||
72 | # Depends on poppler-utils 0.42.0 or higher | 96 | # Depends on poppler-utils 0.42.0 or higher |
73 | -#my $PDINFO = '/usr/local/bin/pdfinfo'; | ||
74 | -my $PDFFONTS = '/usr/local/bin/pdffonts'; | ||
75 | -my $PDFIMAGES = '/usr/local/bin/pdfimages'; | ||
76 | -my $PDFTOPPM = '/usr/local/bin/pdftoppm'; | 97 | +my $PDFFONTS = 'pdffonts'; |
98 | +my $PDFIMAGES = 'pdfimages'; | ||
99 | +my $PDFTOPPM = 'pdftoppm'; | ||
100 | +my $PDFUNITE = 'pdfunite'; | ||
101 | +my $PDFSIG = 'pdfsig'; | ||
77 | 102 | ||
78 | # Depends on cpdf 2.1 or higher | 103 | # Depends on cpdf 2.1 or higher |
79 | -my $CPDF = '/usr/local/bin/cpdf'; | 104 | +my $CPDF = 'cpdf'; |
105 | + | ||
106 | +# Depends on Ghostscript 9.18 | ||
107 | +my $GS = 'gs'; | ||
80 | 108 | ||
81 | ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner | 109 | ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner |
82 | -my $CONVERT = '/usr/bin/convert'; | 110 | +my $CONVERT = 'convert'; |
83 | 111 | ||
84 | # If it is needed further filtering | 112 | # If it is needed further filtering |
85 | #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; | 113 | #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; |
@@ -90,12 +118,14 @@ my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', | @@ -90,12 +118,14 @@ my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', | ||
90 | my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' ); | 118 | my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' ); |
91 | 119 | ||
92 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); | 120 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); |
93 | -%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); | 121 | +%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG==2); |
94 | 122 | ||
95 | # Safeguard in case cpuinfo has not correctly identified the number of CPUs | 123 | # Safeguard in case cpuinfo has not correctly identified the number of CPUs |
96 | $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; | 124 | $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; |
97 | 125 | ||
98 | -$ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; | 126 | +$ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin'; |
127 | +$ENV{'IFS'} = '\t\n'; | ||
128 | + | ||
99 | my ($host) = split/\./,hostname; | 129 | my ($host) = split/\./,hostname; |
100 | 130 | ||
101 | use vars qw/*name *dir *prune/; | 131 | use vars qw/*name *dir *prune/; |
@@ -107,14 +137,15 @@ sub main; | @@ -107,14 +137,15 @@ sub main; | ||
107 | sub get_pages; | 137 | sub get_pages; |
108 | sub get_rotation; | 138 | sub get_rotation; |
109 | sub get_res; | 139 | sub get_res; |
110 | -sub is_ocred; | ||
111 | sub is_locked_ex; | 140 | sub is_locked_ex; |
112 | 141 | ||
113 | 142 | ||
114 | my $expr = 'use POSIX qw(setsid)'; | 143 | my $expr = 'use POSIX qw(setsid)'; |
115 | 144 | ||
116 | my ($dumb1, $dumb2, $uid) = getpwnam ($USER); | 145 | my ($dumb1, $dumb2, $uid) = getpwnam ($USER); |
117 | -setuid ($uid) or warn "Cant set uid $uid"; | 146 | +if (defined $uid) { |
147 | + setuid ($uid) or warn "Cant set uid $uid"; | ||
148 | +} | ||
118 | 149 | ||
119 | $SIG{__DIE__} = 'DEFAULT'; | 150 | $SIG{__DIE__} = 'DEFAULT'; |
120 | $SIG{__WARN__} = \&die_when_called; | 151 | $SIG{__WARN__} = \&die_when_called; |
@@ -126,6 +157,11 @@ if ($@) { | @@ -126,6 +157,11 @@ if ($@) { | ||
126 | chdir('/') or die "$0: cannot chdir '/': $!\n"; | 157 | chdir('/') or die "$0: cannot chdir '/': $!\n"; |
127 | open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n"; | 158 | open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n"; |
128 | 159 | ||
160 | +foreach my $exec ( $TESSERACT, $PDFTK, $PDFFONTS, $PDFIMAGES, $PDFSIG, $CPDF, $GS, $CONVERT) { | ||
161 | + die "Error: $exec not found on path: $ENV{PATH}, check dependencies\n" if ( `which $exec | wc -l ` == 0); | ||
162 | +} | ||
163 | + | ||
164 | + | ||
129 | foreach my $DIR (@BASE_DIRS) { | 165 | foreach my $DIR (@BASE_DIRS) { |
130 | 166 | ||
131 | defined(my $pid = fork) or die "$0: cannot fork: $!\n"; | 167 | defined(my $pid = fork) or die "$0: cannot fork: $!\n"; |
@@ -135,7 +171,7 @@ foreach my $DIR (@BASE_DIRS) { | @@ -135,7 +171,7 @@ foreach my $DIR (@BASE_DIRS) { | ||
135 | main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR}); | 171 | main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR}); |
136 | exit 0; | 172 | exit 0; |
137 | last; | 173 | last; |
138 | - } | 174 | + } |
139 | } | 175 | } |
140 | 176 | ||
141 | exit 0; | 177 | exit 0; |
@@ -157,7 +193,7 @@ sub main { | @@ -157,7 +193,7 @@ sub main { | ||
157 | # remove .tmp file | 193 | # remove .tmp file |
158 | unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) ); | 194 | unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) ); |
159 | 195 | ||
160 | - # Rename files that were in 'processig' back | 196 | + # Rename files that were in 'processing' state back |
161 | foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) { | 197 | foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) { |
162 | my $old_name = $file; | 198 | my $old_name = $file; |
163 | $old_name =~ s/\.${host}\.processing$//g; | 199 | $old_name =~ s/\.${host}\.processing$//g; |
@@ -177,12 +213,14 @@ sub main { | @@ -177,12 +213,14 @@ sub main { | ||
177 | # Main loop | 213 | # Main loop |
178 | while ( 1 ) { | 214 | while ( 1 ) { |
179 | select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced | 215 | select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced |
216 | + | ||
180 | $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} )); | 217 | $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} )); |
181 | print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in; | 218 | print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in; |
182 | $count = scalar keys %files_in; | 219 | $count = scalar keys %files_in; |
183 | - foreach my $file (keys %files_in) { | ||
184 | 220 | ||
185 | - next if ( glob ("$file.*.tmp")); | 221 | + foreach my $file (sort { ((-f $a) ? (stat $a)[9] : 0) <=> ((-f $b) ? (stat $b)[9] : 0)} keys %files_in ) { |
222 | + | ||
223 | + next if ( glob ("\"$file.*.tmp\"")); | ||
186 | 224 | ||
187 | select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds | 225 | select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds |
188 | next if (!defined $files_in{$file}); # continue only if it is still valid | 226 | next if (!defined $files_in{$file}); # continue only if it is still valid |
@@ -255,7 +293,7 @@ sub ocr { | @@ -255,7 +293,7 @@ sub ocr { | ||
255 | remove_tree ($tmpdir,{ error=> \my $dumb }); | 293 | remove_tree ($tmpdir,{ error=> \my $dumb }); |
256 | unlink ("$in_file.$host.tmp"); | 294 | unlink ("$in_file.$host.tmp"); |
257 | move ( "$in_file.$host.processing", $in_file); | 295 | move ( "$in_file.$host.processing", $in_file); |
258 | - exit 0; | 296 | + exit 1; |
259 | }; | 297 | }; |
260 | 298 | ||
261 | my $out_path = $in_path; | 299 | my $out_path = $in_path; |
@@ -271,7 +309,7 @@ sub ocr { | @@ -271,7 +309,7 @@ sub ocr { | ||
271 | my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); | 309 | my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); |
272 | 310 | ||
273 | print "\twritting to $out_file\n" if $DEBUG; | 311 | print "\twritting to $out_file\n" if $DEBUG; |
274 | - | 312 | + |
275 | my $stime = time; | 313 | my $stime = time; |
276 | my %pids; | 314 | my %pids; |
277 | 315 | ||
@@ -291,8 +329,26 @@ sub ocr { | @@ -291,8 +329,26 @@ sub ocr { | ||
291 | remove_tree ($tmpdir,{ error=> \my $dumb }); | 329 | remove_tree ($tmpdir,{ error=> \my $dumb }); |
292 | unlink ("$in_file.$host.tmp"); | 330 | unlink ("$in_file.$host.tmp"); |
293 | move ( "$in_file.$host.processing", $in_file); | 331 | move ( "$in_file.$host.processing", $in_file); |
332 | + print "Error: cannot copy $in_file to temp dir \n" if $DEBUG; | ||
333 | + syslog ("error","cannot copy $in_file to temp dir") if !$DEBUG; | ||
334 | + exit 1; | ||
294 | }; | 335 | }; |
295 | 336 | ||
337 | + # Check if file was signed | ||
338 | + if (get_sign($tmp_file)) { | ||
339 | + if (!copy ("$in_file.$host.processing", $proc_file)) { | ||
340 | + remove_tree ($tmpdir,{ error=> \my $dumb }); | ||
341 | + unlink ("$in_file.$host.tmp"); | ||
342 | + move ( "$in_file.$host.processing", $in_file); | ||
343 | + }; | ||
344 | + move ("$in_file.$host.processing", $out_file); | ||
345 | + unlink ("$in_file.$host.tmp"); | ||
346 | + print "OCR processed: $in_file not OCRed due to having a signature within" if $DEBUG; | ||
347 | + syslog ("info","OCR processed: $in_file not OCRed due to having a signature within") if !$DEBUG; | ||
348 | + | ||
349 | + exit 0; | ||
350 | + } | ||
351 | + | ||
296 | # Extract pages | 352 | # Extract pages |
297 | ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf"); | 353 | ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf"); |
298 | if ($DEBUG) { | 354 | if ($DEBUG) { |
@@ -301,12 +357,13 @@ sub ocr { | @@ -301,12 +357,13 @@ sub ocr { | ||
301 | print "\t\t\t$_" for @err ; | 357 | print "\t\t\t$_" for @err ; |
302 | }; | 358 | }; |
303 | 359 | ||
360 | + my ($pages, @pg_w, @pg_h, @pg_r, @pg_crop_x1, @pg_crop_y1, @pg_crop_x2, @pg_crop_y2); | ||
361 | + $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r, \@pg_crop_x1, \@pg_crop_y1, \@pg_crop_x2, \@pg_crop_y2); | ||
304 | 362 | ||
305 | - my ($pages, @pg_w, @pg_h, @pg_r); | ||
306 | - $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r); | 363 | + my ($imgs,@page_img, @img_w, @img_h, @img_t, @img_xppi, @img_yppi); |
364 | + $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t, \@img_xppi, \@img_yppi); | ||
307 | 365 | ||
308 | - my ($imgs,@page_img, @img_w, @img_h, @img_t); | ||
309 | - $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t); | 366 | + unlink ($tmp_file) if (!$DEBUG); |
310 | 367 | ||
311 | for ( my $i=0; $i< $pages; $i++ ) { | 368 | for ( my $i=0; $i< $pages; $i++ ) { |
312 | my $pg = sprintf ("pg_%06d", $i+1); | 369 | my $pg = sprintf ("pg_%06d", $i+1); |
@@ -333,25 +390,29 @@ sub ocr { | @@ -333,25 +390,29 @@ sub ocr { | ||
333 | if (! defined $img_t[$i] ) { | 390 | if (! defined $img_t[$i] ) { |
334 | move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); | 391 | move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); |
335 | print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG; | 392 | print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG; |
336 | - exit 0; | 393 | + exit -1; |
337 | } | 394 | } |
338 | 395 | ||
339 | - print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i]\n" if $DEBUG; | 396 | + print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i] " if $DEBUG; |
397 | + print "(cropbox: $pg_crop_x1[$i] x $pg_crop_y1[$i] - $pg_crop_x2[$i] x $pg_crop_y2[$i])\n" if (defined $pg_crop_x1[$i] && $DEBUG); | ||
398 | + print "\n" if ($DEBUG); | ||
340 | 399 | ||
400 | + # Extract images from page, since 2.0 uses png lossless format regardless of original format or depth | ||
341 | undef $cmd; | 401 | undef $cmd; |
342 | 402 | ||
343 | - if ($img_t[$i] eq "gray") { | ||
344 | - $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | 403 | + # Use PDFIMAGES and JPEG by default |
404 | + $cmd = "${PDFIMAGES} -j \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
405 | + | ||
406 | + if ($img_t[$i] eq "stencil") { | ||
407 | + $cmd = "${PDFTOPPM} -tiff -tiffcompression deflate -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
345 | } | 408 | } |
346 | 409 | ||
347 | - if ($img_t[$i] eq "rgb") { | ||
348 | - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
349 | - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM | 410 | + if ($img_t[$i] eq "gray") { |
411 | + $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
350 | } | 412 | } |
351 | 413 | ||
352 | - if (!defined $cmd) { | ||
353 | - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
354 | - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM | 414 | + if ($img_t[$i] !~ /gray|rgb|stencil/) { |
415 | + $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
355 | } | 416 | } |
356 | 417 | ||
357 | ($exit,$cmd,@out,@err) = exec_cmd($cmd); | 418 | ($exit,$cmd,@out,@err) = exec_cmd($cmd); |
@@ -362,7 +423,13 @@ sub ocr { | @@ -362,7 +423,13 @@ sub ocr { | ||
362 | }; | 423 | }; |
363 | 424 | ||
364 | # Process each resulting image for page pdf | 425 | # Process each resulting image for page pdf |
365 | - my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif)/i , in => ${tmpdir} )) ; | 426 | + my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif|tiff|jpeg|jp2|jb2|png)/i , in => ${tmpdir} )) ; |
427 | + | ||
428 | + if (scalar @images == 0) { | ||
429 | + move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); | ||
430 | + print "\t\t${in_file}: ".(${i}+1)." / $pages: Page was not exported as a tesseract supported format -- not OCRing\n" if $DEBUG; | ||
431 | + exit 0; | ||
432 | + } | ||
366 | 433 | ||
367 | foreach my $image (@images) { | 434 | foreach my $image (@images) { |
368 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; | 435 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; |
@@ -378,43 +445,65 @@ sub ocr { | @@ -378,43 +445,65 @@ sub ocr { | ||
378 | print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; | 445 | print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; |
379 | } | 446 | } |
380 | } | 447 | } |
381 | - | ||
382 | - # Check if page was rotated | ||
383 | - if ($pg_r[$i]) { | ||
384 | - print "\t\t\t${image} unrotate: $pg_r[$i] graus ".(${i}+1)." / $pages\n" if $DEBUG; | ||
385 | - ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate $pg_r[$i] \"$image\""); | 448 | + |
449 | + # Check if page was rotated and extracted with pdftoppm | ||
450 | + if ($cmd =~ /\Q$PDFTOPPM/ && $pg_r[$i]) { | ||
451 | + print "\t\t\t${image} unrotate: -$pg_r[$i] degs ".(${i}+1)." / $pages\n" if $DEBUG; | ||
452 | + ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate ". (360 - $pg_r[$i])." \"$image\""); | ||
386 | if ($DEBUG) { | 453 | if ($DEBUG) { |
387 | print "\t\t\t${image} -> $cmd: $exit\n"; | 454 | print "\t\t\t${image} -> $cmd: $exit\n"; |
388 | print "\t\t\t\t$_" for @out ; | 455 | print "\t\t\t\t$_" for @out ; |
389 | print "\t\t\t\t$_" for @err ; | 456 | print "\t\t\t\t$_" for @err ; |
390 | }; | 457 | }; |
391 | } | 458 | } |
392 | - | 459 | + |
393 | # Filter ppm images, if needed | 460 | # Filter ppm images, if needed |
394 | 461 | ||
395 | # OCR ppm images to pdf pages | 462 | # OCR ppm images to pdf pages |
396 | - ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} \"${image}\" \"${image}\" pdf"); | 463 | + ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} -l por+eng \"${image}\" \"${image}\" pdf"); |
397 | if ($DEBUG) { | 464 | if ($DEBUG) { |
398 | print "\t\t\t${image} -> $cmd: $exit\n"; | 465 | print "\t\t\t${image} -> $cmd: $exit\n"; |
399 | print "\t\t\t\t$_" for @out ; | 466 | print "\t\t\t\t$_" for @out ; |
400 | print "\t\t\t\t$_" for @err ; | 467 | print "\t\t\t\t$_" for @err ; |
401 | }; | 468 | }; |
469 | + unlink ("$image") if (!$DEBUG); | ||
402 | 470 | ||
403 | - # Scale to fit pdf | ||
404 | - ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); | 471 | + # Scale, crop and rotate to fit pdf |
472 | + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); | ||
405 | if ($DEBUG) { | 473 | if ($DEBUG) { |
406 | print "\t\t\t${image} -> $cmd: $exit\n"; | 474 | print "\t\t\t${image} -> $cmd: $exit\n"; |
407 | print "\t\t\t\t$_" for @out ; | 475 | print "\t\t\t\t$_" for @out ; |
408 | print "\t\t\t\t$_" for @err ; | 476 | print "\t\t\t\t$_" for @err ; |
409 | }; | 477 | }; |
478 | + unlink ("$image.pdf") if (!$DEBUG); | ||
410 | 479 | ||
480 | + if (defined $pg_crop_x1[$i]) { | ||
481 | + # adjust cropbox | ||
482 | + ($pg_crop_x1[$i], $pg_crop_y1[$i],$pg_crop_x2[$i],$pg_crop_y2[$i]) = ( | ||
483 | + ($pg_crop_x1[$i]<$pg_crop_x2[$i]?$pg_crop_x1[$i]:$pg_crop_x2[$i]), | ||
484 | + ($pg_crop_y1[$i]<$pg_crop_y2[$i]?$pg_crop_y1[$i]:$pg_crop_y2[$i]), | ||
485 | + abs($pg_crop_x2[$i]-$pg_crop_x1[$i]),abs($pg_crop_y2[$i]- $pg_crop_y1[$i]) | ||
486 | + ); | ||
487 | + | ||
488 | + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -crop \"$pg_crop_x1[$i] $pg_crop_y1[$i] $pg_crop_x2[$i] $pg_crop_y2[$i]\" \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf"); | ||
489 | + if ($DEBUG) { | ||
490 | + print "\t\t\t${image} -> $cmd: $exit\n"; | ||
491 | + print "\t\t\t\t$_" for @out ; | ||
492 | + print "\t\t\t\t$_" for @err ; | ||
493 | + }; | ||
494 | + } | ||
495 | + | ||
496 | + if ($pg_r[$i]) { | ||
497 | + ($exit,$cmd, @out,@err) = exec_cmd( "${CPDF} -rotate $pg_r[$i] \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf"); | ||
498 | + if ($DEBUG) { | ||
499 | + print "\t\t\t${image} -> $cmd: $exit\n"; | ||
500 | + print "\t\t\t\t$_" for @out ; | ||
501 | + print "\t\t\t\t$_" for @err ; | ||
502 | + }; | ||
503 | + } | ||
411 | 504 | ||
412 | - unlink ("${tmpdir}/${pg}.pdf") if (!$DEBUG); | ||
413 | - unlink ("$image.pdf") if (!$DEBUG); | ||
414 | - move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}.pdf.old") if ($DEBUG); | ||
415 | - unlink ("$image") if (!$DEBUG); | ||
416 | } | 505 | } |
417 | - exit 0; | 506 | + exit 1; |
418 | } | 507 | } |
419 | } | 508 | } |
420 | 509 | ||
@@ -427,28 +516,51 @@ sub ocr { | @@ -427,28 +516,51 @@ sub ocr { | ||
427 | 516 | ||
428 | if (scalar @new_pages != $pages) { | 517 | if (scalar @new_pages != $pages) { |
429 | print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); | 518 | print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); |
430 | - syslog ("info","OCR: $in_file, number of output pages differ") if (!$DEBUG); | 519 | + syslog ("err","OCR: $in_file, number of output pages differ") if (!$DEBUG); |
431 | unlink "$in_file.$host.tmp"; | 520 | unlink "$in_file.$host.tmp"; |
521 | + make_path ($error_path) if ( ! -d $error_path); | ||
432 | move ("$in_file.$host.processing", $error_file); | 522 | move ("$in_file.$host.processing", $error_file); |
433 | - exit (0); | 523 | + exit (1); |
434 | } | 524 | } |
435 | 525 | ||
436 | - # Merge resulting pdf pages to a single pdf | 526 | + # Merge resulting pdf pages to a single pdf, convert to PDF/A and copy to output |
437 | make_path ($out_path) if ( ! -d $out_path); | 527 | make_path ($out_path) if ( ! -d $out_path); |
438 | unlink $out_file if ( -f $out_file ); | 528 | unlink $out_file if ( -f $out_file ); |
439 | - ($exit, $cmd, @out,@err) = exec_cmd("${PDFTK} \"${tmpdir}\"/pg_*-cpdf.pdf cat output \"${out_file}.tmp\" compress"); | 529 | + |
530 | + chdir (${tmpdir}); | ||
531 | + ($exit, $cmd, @out,@err) = exec_cmd("${GS} -dQUIET -dBATCH -dNOPAUSE -dNOINTERPOLATE -dCompatibilityLevel=1.7 -dNumRenderingThreads=${MAX_PGS} -sDEVICE=pdfwrite -dAutoRotatePages=/None -sColorConversionStrategy=/RGB -sProcessColorModel=DeviceRGB -dAutoFilterColorImages=true -dAutoFilterGrayImages=true -dJPEGQ=95 -dPDFA=2 -dPDFACompatibilityPolicy=1 -sOutputFile=\"${tmp_file}\" pg_*-cpdf.pdf "); | ||
440 | if ($DEBUG) { | 532 | if ($DEBUG) { |
441 | print "\t\t${out_file} -> $cmd: $exit\n"; | 533 | print "\t\t${out_file} -> $cmd: $exit\n"; |
442 | print "\t\t\t$_" for @out ; | 534 | print "\t\t\t$_" for @out ; |
443 | print "\t\t\t$_" for @err ; | 535 | print "\t\t\t$_" for @err ; |
444 | }; | 536 | }; |
537 | + if ($exit) { | ||
538 | + unlink "$in_file.$host.tmp"; | ||
539 | + unlink $out_file; | ||
540 | + make_path ($error_path) if ( ! -d $error_path); | ||
541 | + move ("$in_file.$host.processing", $error_file); | ||
542 | + print "\t\t${out_file} -> Error concatenating pages and converting to PDF/A (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); | ||
543 | + syslog ("err","OCR: $in_file, error concatenating pages and converting to PDF/A") if (!$DEBUG); | ||
544 | + exit (1); | ||
545 | + } | ||
546 | + chdir ("/"); | ||
547 | + | ||
548 | + if (!copy (${tmp_file}, $out_file)) { | ||
549 | + remove_tree ($tmpdir,{ error=> \my $dumb }); | ||
550 | + unlink ("$in_file.$host.tmp"); | ||
551 | + unlink $out_file; | ||
552 | + make_path ($error_path) if ( ! -d $error_path); | ||
553 | + move ("$in_file.$host.processing", $error_file); | ||
554 | + print "Error: cannot copy temp file to $out_file \n" if $DEBUG; | ||
555 | + syslog ("error","cannot copy temp file to $out_file") if !$DEBUG; | ||
556 | + exit 1; | ||
557 | + }; | ||
445 | 558 | ||
446 | make_path ($proc_path) if ( ! -d $proc_path); | 559 | make_path ($proc_path) if ( ! -d $proc_path); |
447 | unlink $proc_file if ( -f $proc_file ); | 560 | unlink $proc_file if ( -f $proc_file ); |
448 | move ("$in_file.$host.processing", $proc_file); | 561 | move ("$in_file.$host.processing", $proc_file); |
449 | move ("${out_file}.tmp", ${out_file}); | 562 | move ("${out_file}.tmp", ${out_file}); |
450 | 563 | ||
451 | - | ||
452 | # Remove temp dir | 564 | # Remove temp dir |
453 | remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG); | 565 | remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG); |
454 | unlink $tmp_file if (!$DEBUG); | 566 | unlink $tmp_file if (!$DEBUG); |
@@ -471,7 +583,7 @@ sub is_ocred { | @@ -471,7 +583,7 @@ sub is_ocred { | ||
471 | } | 583 | } |
472 | 584 | ||
473 | sub get_pages { | 585 | sub get_pages { |
474 | - my ($in_file, $w, $h, $r) = @_; | 586 | + my ($in_file, $w, $h, $r, $x1, $y1, $x2, $y2) = @_; |
475 | 587 | ||
476 | my $pages=0; | 588 | my $pages=0; |
477 | my $i=0; | 589 | my $i=0; |
@@ -485,29 +597,35 @@ sub get_pages { | @@ -485,29 +597,35 @@ sub get_pages { | ||
485 | ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ ); | 597 | ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ ); |
486 | ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ ); | 598 | ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ ); |
487 | ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ ); | 599 | ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ ); |
488 | - ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); | 600 | + ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); |
601 | + ($dumb, @$x1[$i-1], @$y1[$i-1], @$x2[$i-1], @$y2[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaCropRect:/ ); | ||
489 | } | 602 | } |
490 | 603 | ||
491 | return $pages; | 604 | return $pages; |
492 | } | 605 | } |
493 | 606 | ||
494 | sub get_imgs { | 607 | sub get_imgs { |
495 | - my ($in_file, $page_img, $w, $h, $t) = @_; | ||
496 | - my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc); | 608 | + my ($in_file, $page_img, $w, $h, $t, $x_ppi, $y_ppi) = @_; |
609 | + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi ); | ||
497 | 610 | ||
498 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); | 611 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); |
612 | + $i = 0; | ||
499 | 613 | ||
500 | foreach my $line (@lines) { | 614 | foreach my $line (@lines) { |
501 | chomp $line; | 615 | chomp $line; |
502 | $line =~ s/^ {1,}//; | 616 | $line =~ s/^ {1,}//; |
503 | - if ( $line =~ /image|mask/ ) { | ||
504 | - ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line; | 617 | + if ( $line !~ /^page|^----/ ) { |
618 | + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi) = split / {1,}/,$line; | ||
505 | @$page_img[$page-1]=$i; | 619 | @$page_img[$page-1]=$i; |
506 | @$w[$page-1] = $width; | 620 | @$w[$page-1] = $width; |
507 | @$h[$page-1] = $height; | 621 | @$h[$page-1] = $height; |
508 | @$t[$page-1] = "rgb"; # Default is color | 622 | @$t[$page-1] = "rgb"; # Default is color |
509 | - @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | ||
510 | @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); | 623 | @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); |
624 | + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | ||
625 | + @$t[$page-1] = ( $type eq "stencil" ? $type : @$t[$page-1]); | ||
626 | + @$t[$page-1] = ( $enc eq "image" ? $enc : @$t[$page-1]); | ||
627 | + @$x_ppi[$page-1] = $xppi; | ||
628 | + @$y_ppi[$page-1] = $yppi; | ||
511 | } | 629 | } |
512 | } | 630 | } |
513 | return $i+1; | 631 | return $i+1; |
@@ -542,6 +660,19 @@ sub get_res { | @@ -542,6 +660,19 @@ sub get_res { | ||
542 | return ($res_x,$res_y); | 660 | return ($res_x,$res_y); |
543 | } | 661 | } |
544 | 662 | ||
663 | +sub get_sign { | ||
664 | + my ($in_file) = @_; | ||
665 | + my @lines = `${PDFSIG} \"${in_file}\" 2>/dev/null`; | ||
666 | + | ||
667 | + foreach (@lines) { | ||
668 | + chomp; | ||
669 | + if ( $_ =~ /^Signature/ ) { | ||
670 | + return 1; | ||
671 | + } | ||
672 | + } | ||
673 | + return 0; | ||
674 | +} | ||
675 | + | ||
545 | sub is_locked_ex { | 676 | sub is_locked_ex { |
546 | my ($path) = @_; | 677 | my ($path) = @_; |
547 | 678 |
workflow.pdf
No preview for this file type
workflow.vsd
No preview for this file type