Commit f02dd8e43c99cc7ed8312d8ace23cc29f208aebb
Exists in
master
Merge branch 'Pre_versao_2.0' into 'master'
Pre versao 2.0 final para gerar Tag 2.0 Final See merge request !4
Showing
7 changed files
with
583 additions
and
272 deletions
Show diff stats
... | ... | @@ -0,0 +1,103 @@ |
1 | + | |
2 | +FROM ubuntu:14.04 | |
3 | + | |
4 | +# Cópia de arquivos do projeto OCR-SERVER | |
5 | +COPY usr/local/bin/ocr /usr/local/bin/ocr | |
6 | +COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr | |
7 | +COPY entrypoint.sh /entrypoint.sh | |
8 | + | |
9 | +WORKDIR /tmp | |
10 | + | |
11 | +# Instalação dos pacotes pré-requisitos do ocr-server 2 | |
12 | +RUN apt-get -y update && \ | |
13 | + apt-get -y install build-essential cmake libtool yasm pkg-config subversion git libgcj14 apt-utils \ | |
14 | + curl libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev \ | |
15 | + zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libnss3-dev \ | |
16 | + wget cabextract xfonts-utils perl automake autoconf-archive libcurl4-gnutls-dev unzip libgcj14 \ | |
17 | + libfile-find-rule-perl libfile-find-rule-perl-perl imagemagick gettext unpaper libtiff5 libpng12-0 \ | |
18 | + libjpeg-turbo8 libpango1.0-0 libcairo2 fontconfig libwebp5 libfontconfig1 libgettextpo0 pkg-config gcc gcj-jdk \ | |
19 | + rsyslog libsys-syslog-perl && \ | |
20 | + apt-get -y clean all | |
21 | + | |
22 | +RUN wget -O mscorefonts.deb http://ftp.us.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.4+nmu1_all.deb && \ | |
23 | + dpkg -i mscorefonts.deb && \ | |
24 | + rm mscorefonts.deb | |
25 | + | |
26 | +# Instalação do Perl 5.1 e demais módulos | |
27 | +RUN perl -MCPAN -e 'install File::Touch' | |
28 | +RUN perl -MCPAN -e 'install File::Find::Rule;' | |
29 | +RUN perl -MCPAN -e 'install File::Touch;' | |
30 | +RUN perl -MCPAN -e 'install Sys::Syslog;' | |
31 | +RUN perl -MCPAN -e 'install IPC::Open3;' | |
32 | +RUN perl -MCPAN -e 'install IO::Select;' | |
33 | + | |
34 | +# Tesseract-ocr 3.05, com dicionários inglês e português | |
35 | +# Bibliotecas para o Tesseract: Leptonica | |
36 | +RUN git clone https://github.com/DanBloomberg/leptonica.git && \ | |
37 | + cd leptonica && ./autobuild && ./configure && make all install && \ | |
38 | + rm -rf ../leptonica | |
39 | + | |
40 | +# Bibliotecas para o Tesseract: Libav | |
41 | +RUN git clone https://github.com/libav/libav.git && \ | |
42 | + export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ && \ | |
43 | + cd libav && ./configure --enable-sram && make all install && \ | |
44 | + rm -rf ../libav | |
45 | + | |
46 | +# Tesseract 3.05.01 | |
47 | +RUN git clone https://github.com/tesseract-ocr/tesseract.git && \ | |
48 | + cd tesseract && ./autogen.sh && ./configure && make all install && \ | |
49 | + rm -rf ../tesseract | |
50 | + | |
51 | +RUN wget https://github.com/tesseract-ocr/tessdata/blob/master/eng.traineddata?raw=true -O /usr/local/share/tessdata/eng.traineddata && \ | |
52 | + wget https://github.com/tesseract-ocr/tessdata/blob/master/por.traineddata?raw=true -O /usr/local/share/tessdata/por.traineddata && \ | |
53 | + wget https://github.com/tesseract-ocr/tessdata/blob/master/osd.traineddata?raw=true -O /usr/local/share/tessdata/osd.traineddata | |
54 | + | |
55 | +# Poppler 0.56 | |
56 | +RUN git clone -b poppler-0.56 https://anongit.freedesktop.org/git/poppler/poppler.git && \ | |
57 | + cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && \ | |
58 | + rm -rf ../poppler | |
59 | + | |
60 | +# pdftk, versão 2.02 ou superior | |
61 | +RUN wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip && \ | |
62 | + unzip pdftk-2.02-src.zip && rm -f pdftk-2.02-src.zip && \ | |
63 | + cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && \ | |
64 | + rm -rf ../pdftk-2.02-dist | |
65 | + | |
66 | +# Ghostscript 9.18 ou superior | |
67 | +RUN wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz && \ | |
68 | + tar xvozf ghostscript-9.18.tar.gz && rm -f ghostscript-9.18.tar.gz && \ | |
69 | + cd ghostscript-9.18 && ls && ./autogen.sh; ./configure && make all install && \ | |
70 | + rm -rf ../ghostscript-9.18 | |
71 | + | |
72 | +# CPDF Linux Intel 64 bits v 2.2 | |
73 | +RUN git clone https://github.com/coherentgraphics/cpdf-binaries.git && \ | |
74 | + cp cpdf-binaries/Linux-Intel-64bit/cpdf /usr/bin | |
75 | + | |
76 | +# Atualização das configurações do ld | |
77 | +RUN ldconfig | |
78 | + | |
79 | +RUN useradd -m ocr | |
80 | + | |
81 | +RUN chmod +x /usr/local/bin/ocr && \ | |
82 | + chmod +x /etc/init.d/ocr && \ | |
83 | + update-rc.d ocr defaults | |
84 | + | |
85 | +RUN mkdir /var/ocr-server/ && \ | |
86 | + mkdir -p /var/ocr-server/Entrada && \ | |
87 | + mkdir -p /var/ocr-server/Saida && \ | |
88 | + mkdir -p /var/ocr-server/Originais_Processados && \ | |
89 | + mkdir -p /var/ocr-server/Erro && \ | |
90 | + chmod +x /entrypoint.sh | |
91 | + | |
92 | +RUN mkdir -p /tmp/ocr_dev/ && \ | |
93 | + mkdir -p /tmp/ocr_dev/Entrada && \ | |
94 | + mkdir -p /tmp/ocr_dev/Saida && \ | |
95 | + mkdir -p /tmp/ocr_dev/Originais_Processados && \ | |
96 | + mkdir -p /tmp/ocr_dev/Erro && \ | |
97 | + chmod -R 777 /tmp/ocr_dev | |
98 | + | |
99 | +WORKDIR / | |
100 | + | |
101 | +VOLUME /var/ocr-server/ | |
102 | + | |
103 | +CMD ["bash", "/entrypoint.sh"] | |
0 | 104 | \ No newline at end of file | ... | ... |
INSTALL.txt
... | ... | @@ -1,202 +0,0 @@ |
1 | -# OCR Server 1.0.4b - (c) Agencia Nacional de Telecomunicacoees | |
2 | -# | |
3 | -# This script monitors a set of input directories for PDF files | |
4 | -# once a new file is detected, it is processes through tesseract OCR | |
5 | -# in order to generate a new file with a hidden searchable text layer | |
6 | -# | |
7 | -# It may be distributed under the conditions of the LGPL v2.1 license. | |
8 | -# | |
9 | -# Author: Guilherme Chehab | |
10 | -# | |
11 | -# Version History: | |
12 | -# 0.1 Initial single server version | |
13 | -# 0.2 Check if page already has the html hidden layer, if so, ignore it | |
14 | -# 0.3 Solved issues about various image enconding types | |
15 | -# 0.4 Added a postnormalization step to ensure all output pdf pages have | |
16 | -# the same size and orientations as the original files | |
17 | -# 0.5 Used input file renaming as a way to sync multiple parallel instances, | |
18 | -# that way, it is minimized the risk of same file being OCRed multiple times. | |
19 | -# 0.6 Added a default handler for unknown image encoding using jpeg encoding | |
20 | -# 0.7 Solved an issue with files with more than 1000 pages | |
21 | -# 1.0 First release version | |
22 | -# 1.0.1 Solving error when file has no images | |
23 | -# 1.0.2 Fix bug when counting cores for AMD processors | |
24 | -# 1.0.3 Added better image type detection | |
25 | -# 1.0.4 Fix: added ubuntu init script | |
26 | -# 1.0.4b Centos 6.9 | |
27 | -# | |
28 | -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it | |
29 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | |
30 | -# diferently but does not treat it adequately | |
31 | -# - Review poppler and cpdf install instructions | |
32 | -# - Add better handling of vectorized and non scanned pdf files | |
33 | -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | |
34 | -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible | |
35 | -# | |
36 | -# Check software requirements on the comments bellow | |
37 | -# | |
38 | -# To configure input dirs change @BASE_DIRS and @SUB_DIRS variables | |
39 | -# | |
40 | -# | |
41 | -# O servidor OCR depende dos seguintes componentes: | |
42 | -# - Perl 5.10.1, com seguintes módulos: | |
43 | -# - File::Find::Rule | |
44 | -# - File::Basename | |
45 | -# - File::Copy | |
46 | -# - File::Path | |
47 | -# - File::Touch | |
48 | -# - Sys::Syslog | |
49 | -# - Sys::Hostname | |
50 | -# - IPC::Open3 | |
51 | -# - IO::Select | |
52 | -# - POSIX | |
53 | -# - Tesseract-ocr 3.05, com dicionários inglês e português | |
54 | -# - Pdftk 2.02 | |
55 | -# - Poppler-utils 0.42.0 | |
56 | -# - Cpdf 2.1 | |
57 | -# - ImageMagick 6.7.2-7 | |
58 | -# | |
59 | -# Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema | |
60 | -# Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento. | |
61 | -# Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes. | |
62 | -# | |
63 | -## ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado. | |
64 | -# | |
65 | -# Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr': | |
66 | -# | |
67 | -# @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script | |
68 | -# @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro | |
69 | -# $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2) | |
70 | -# $MAX_PGS: Número máximo de páginas que podem ser processadas simultanemante por arquivo de entrada (default: no. de CPUs) | |
71 | -# Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS. | |
72 | -# Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos. | |
73 | -# A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página. | |
74 | -# | |
75 | -# Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS. | |
76 | -# | |
77 | -# | |
78 | -# ----------------------- COMPILAÇÃO dos pré requisitos (obs.: os comandos de devem ser executados como root) | |
79 | -# | |
80 | -# | |
81 | -# Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS | |
82 | -# | |
83 | -# RedHat 6.7 e Centos 6.9: | |
84 | -yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip | |
85 | -yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel | |
86 | -cd /tmp | |
87 | -wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm | |
88 | -rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm | |
89 | -rm -f msttcore-fonts-2.0-3.noarch.rpm | |
90 | - | |
91 | -# Centos 6.9 | |
92 | -# \_ autoconf-archive | |
93 | -wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm | |
94 | -rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm | |
95 | -rm autoconf-archive-2012.04.07-7.3.noarch.rpm | |
96 | -# \_ GCC 4.8 | |
97 | -wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo | |
98 | -yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj | |
99 | - | |
100 | -# Ubuntu 14.04 Server: | |
101 | -apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 | |
102 | -apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev | |
103 | -apt-get install ttf-mscorefonts-installer | |
104 | - | |
105 | -# Ambas plataformas: | |
106 | -cd /usr/local/src | |
107 | - | |
108 | -for i in \ | |
109 | - https://github.com/tesseract-ocr/langdata.git \ | |
110 | - https://github.com/DanBloomberg/leptonica.git \ | |
111 | - https://github.com/libav/libav.git \ | |
112 | - https://github.com/tesseract-ocr/tessdata.git \ | |
113 | - https://github.com/tesseract-ocr/tesseract.git \ | |
114 | - git://git.freedesktop.org/git/poppler/poppler.git \ | |
115 | - git://git.freedesktop.org/git/poppler/test.git \ | |
116 | - https://github.com/Flameeyes/unpaper.git \ | |
117 | - https://github.com/ocaml/ocaml.git \ | |
118 | - https://gitlab.camlcity.org/gerd/lib-findlib.git \ | |
119 | - https://github.com/johnwhitington/camlpdf.git \ | |
120 | - https://github.com/johnwhitington/cpdf-source.git \ | |
121 | -; do git clone $i; done | |
122 | - | |
123 | -wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip | |
124 | -unzip pdftk-2.02-src.zip | |
125 | -rm -f pdftk-2.02-src.zip | |
126 | - | |
127 | -# pdftk, versão 2.02 ou superior | |
128 | -cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../.. | |
129 | - | |
130 | -# Centos 6.9 | |
131 | -# \_ Cria um novo shell usando o GCC 4.8 por default | |
132 | -scl enable devtoolset-2 bash | |
133 | - | |
134 | -# Tesseract, versão 3.05-dev ou superior | |
135 | -# Bibliotecas para o Tesseract: Leptonica e Libav | |
136 | -cd leptonica && ./autobuild && ./configure && make all install && cd .. | |
137 | - | |
138 | -# Para compilação do Tesseract após a compilação do leptonica | |
139 | -export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ | |
140 | - | |
141 | -cd libav && ./configure --enable-sram && make all install && cd .. | |
142 | - | |
143 | -# Tesseract | |
144 | -cd tesseract && ./autogen.sh && ./configure && make all install && cd .. | |
145 | -cp -avR tessdata/* /usr/local/share/tessdata/ | |
146 | - | |
147 | -# cpdf, versão 2.1 ou superior | |
148 | -cd ocaml && ./configure && make world.opt && make install && cd .. | |
149 | -mkdir -p /usr/local/man/man5 | |
150 | -# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente | |
151 | -cd lib-findlib && ./configure && make all && make install && cd .. | |
152 | -cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd .. | |
153 | -cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd .. | |
154 | - | |
155 | -# poppler-utils, versão 0.42.0 ou superior | |
156 | -cd poppler && ./autogen.sh && ./configure && make all install && cd .. | |
157 | - | |
158 | -# Centos 6.9 | |
159 | -# \_ Termina o shell usando o GCC 4.8 por default | |
160 | -exit | |
161 | - | |
162 | -# ----------------------- INSTALAÇÃO (obs.: os comandos de devem ser executados como root) | |
163 | - | |
164 | -## Comandos adicionais para configuração do módulo: | |
165 | - | |
166 | -# Criação do usuário | |
167 | -adduser ocr | |
168 | - | |
169 | -# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional | |
170 | -cp ./usr/local/bin/ocr /usr/local/bin | |
171 | - | |
172 | -# Auto start (RedHat 6.7 e CentOs 6.9) | |
173 | -cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr | |
174 | -mv /etc | |
175 | -chkconfig --add ocr | |
176 | -chkconfig --level 2345 ocr on | |
177 | - | |
178 | -# Auto start (Ubuntu 14.04) | |
179 | -cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr | |
180 | -update-rd.d ocr defaults | |
181 | - | |
182 | -# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações | |
183 | -cd /home/ocr | |
184 | -tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr | |
185 | -su | |
186 | - | |
187 | -# Copie o pacote para os outros servidores e extraia com: | |
188 | -cd / | |
189 | -tar xovzf pkg-ocr.tgz | |
190 | - | |
191 | -# Instalando pré-requisitos RUNTIME em servidores adicionais | |
192 | - | |
193 | -# Redhat 6.7 e CentOS 6.9 | |
194 | -yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp | |
195 | -yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext | |
196 | - | |
197 | -# Ubuntu 14.04 | |
198 | -apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 | |
199 | -apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 | |
200 | - | |
201 | -# Inicie o serviço com | |
202 | -service ocr start |
... | ... | @@ -0,0 +1,260 @@ |
1 | +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes | |
2 | + | |
3 | +This script monitors a set of input directories for PDF files. Once a new file is detected, it is processed through Tesseract OCR in order to generate a new file with a hidden searchable text layer. | |
4 | + | |
5 | +It may be distributed under the conditions of the LGPL v2.1 license. | |
6 | + | |
7 | +Author: Guilherme Chehab | |
8 | + | |
9 | +## Version History: | |
10 | + - 0.1 | |
11 | + - Initial single server version | |
12 | + - 0.2 | |
13 | + - Check if page already has the html hidden layer, if so, ignore it | |
14 | + - 0.3 | |
15 | + - Solved issues with various image encoding types | |
16 | + - 0.4 | |
17 | + - Added a postnormalization step to ensure all output pdf pages have the same size and orientations as the original files | |
18 | + - 0.5 | |
19 | + - Used input file renaming as a way to sync multiple parallel instances; that way, the risk of the same file being OCRed multiple times is minimized. | |
20 | + - 0.6 | |
21 | + - Added a default handler for unknown image encoding using jpeg encoding | |
22 | + - 0.7 | |
23 | + - Solved an issue with files with more than 1000 pages | |
24 | + - 1.0 | |
25 | + - First release version | |
26 | + - 1.0.1 Solving error when file has no images | |
27 | + - 1.0.2 Fix bug when counting cores for AMD processors | |
28 | + - 1.0.3 Added better image type detection | |
29 | + - 1.0.4 Fix: added ubuntu init script | |
30 | + - 1.0.4b Add Centos 6.9 install instructions | |
31 | + - 2.0 | |
32 | + - PDF/A output, and better compression with ghostscript | |
33 | + - Rewritten image extraction, processing and transformation process | |
34 | + - Check if input file is signed; in this case, do not change the file contents | |
35 | + - Added '-oem 0' option to tesseract (force legacy mode on tesseract 4) | |
36 | + - Use operating system packages by default | |
37 | + - Changed paths from external programs, instead of using full paths, uses first match from $PATH | |
38 | + - Check existence of external programs on path before running | |
39 | + - Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings | |
40 | + - Fix: create subpaths on error folder | |
41 | + - Fix: trying to reduce overhead on temporary folder | |
42 | + | |
43 | +## TODO: | |
44 | + - Change get_imgs and OCR processing to support pages with more than one image -- this did not work in previous versions, which assumed #pages = #imgs. Version 1.0.1 counts them differently but does not handle them adequately -- this will require better handling of the PDF's internal structure | |
45 | + - Review poppler and cpdf install instructions | |
46 | + - Add better handling of vectorized and non scanned pdf files | |
47 | + - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current scaling, cropping and rotation handlers | |
48 | + - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- added function to analyse image color histogram -> just need to add option to convert it to B&W. | |
49 | + - Move all parameters to config file | |
50 | + - Add some job control web interface | |
51 | + - Add end user interface to submit files through web | |
52 | + - Add check external programs version requirements before running | |
53 | + | |
54 | +## BUGS: | |
55 | + - When an image is of type stencil or has encoding image, cropping information is lost and the page is rendered differently from the original; this is due to using pdftoppm instead of pdfimages | |
56 | + | |
57 | +## Requirements: | |
58 | + - Perl 5.10.1, com seguintes módulos: | |
59 | + - File::Find::Rule | |
60 | + - File::Basename | |
61 | + - File::Copy | |
62 | + - File::Path | |
63 | + - File::Touch | |
64 | + - Sys::Syslog | |
65 | + - Sys::Hostname | |
66 | + - IPC::Open3 | |
67 | + - IO::Select | |
68 | + - POSIX | |
69 | + - Tesseract-ocr 3.05, com dicionários inglês e português | |
70 | + - Pdftk 2.02 | |
71 | + - Poppler-utils 0.42.0 | |
72 | + - Cpdf 2.1 | |
73 | + - ImageMagick 6.7.2-7 | |
74 | + - Ghostscript 9.18 | |
75 | + | |
76 | +Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes pode comprometer o correto funcionamento do sistema. | |
77 | + | |
78 | +Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento. | |
79 | + | |
80 | +Esse arquivo contém informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes. | |
81 | + | |
82 | +ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado. | |
83 | + | |
84 | +### Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr': | |
85 | + | |
86 | +- @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script | |
87 | +- @SUB_DIRS: Subdiretórios de entrada, saída, backup dos arquivos originais, temporário e de arquivos com erro | |
88 | +- $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2) | |
89 | +- $MAX_PGS: Número máximo de páginas que podem ser processadas simultaneamente por arquivo de entrada (default: no. de CPUs) | |
90 | + | |
91 | +Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS. | |
92 | + | |
93 | +Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos. | |
94 | + | |
95 | +A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página. | |
96 | + | |
97 | +Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS. | |
98 | + | |
99 | + | |
100 | +# Container Docker | |
101 | + | |
102 | + O OCR-Server também está disponível como um container Docker, permitindo o rápido provisionamento da solução em ambiente de produção. Todos os procedimentos para construção da imagem do container podem ser encontrados no arquivo Dockerfile. | |
103 | + | |
104 | + Para execução do serviço, basta ter o Docker instalado no servidor e executar o seguinte comando: | |
105 | + | |
106 | + docker run --name <NOME_CONTAINER> -d -v <DIRETORIO_BASE>:/var/ocr-server guilhermeadc/ocr-server | |
107 | + | |
108 | + Onde: | |
109 | + --name : Nome atribuído à instância do container. Ex: ocr-server | |
110 | + -d : Indica que o container deve ser executado em background | |
111 | + -v : Diretório de compartilhamento entre o servidor host e o container. | |
112 | + O parâmetro <DIRETORIO_BASE> deve ser substituído pelo diretório base para busca de arquivos. | |
113 | + | |
114 | + Para visualizar os logs de processamento do serviço, basta executar o seguinte comando: | |
115 | + docker logs <NOME_CONTAINER> | |
116 | + | |
117 | + | |
118 | +# COMPILAÇÃO dos pré requisitos (obs.: os comandos devem ser executados como root) | |
119 | + | |
120 | +Em servidor Ubuntu 16.04, os pacotes padrão (com exceção do CPDF, que não tem no repositório oficial) | |
121 | +são suficientes para executar o aplicativo, não havendo necessidade de compilar todos, assim é a arquitetura recomendada | |
122 | + | |
123 | +Quanto ao CPDF, é possível baixar a versão binária em: https://github.com/coherentgraphics/cpdf-binaries | |
124 | + | |
125 | +## Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS | |
126 | + | |
127 | + # RedHat 6.7 e Centos 6.9: | |
128 | + yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip | |
129 | + yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel libcurl-devel nss-devel | |
130 | + cd /tmp | |
131 | + wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm | |
132 | + rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm | |
133 | + rm -f msttcore-fonts-2.0-3.noarch.rpm | |
134 | + | |
135 | + # Centos 6.9 | |
136 | + # \_ autoconf-archive | |
137 | + wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm | |
138 | + rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm | |
139 | + rm autoconf-archive-2012.04.07-7.3.noarch.rpm | |
140 | + # \_ GCC 4.8 | |
141 | + wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo | |
142 | + yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj | |
143 | + | |
144 | + # Ubuntu 14.04 Server: | |
145 | + apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 | |
146 | + apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libcurl-dev libnss3-dev | |
147 | + apt-get install ttf-mscorefonts-installer | |
148 | + | |
149 | + # Ambas plataformas: | |
150 | + cd /usr/local/src | |
151 | + | |
152 | + for i in \ | |
153 | + https://github.com/tesseract-ocr/langdata.git \ | |
154 | + https://github.com/DanBloomberg/leptonica.git \ | |
155 | + https://github.com/libav/libav.git \ | |
156 | + https://github.com/tesseract-ocr/tessdata.git \ | |
157 | + https://github.com/tesseract-ocr/tesseract.git \ | |
158 | + git://git.freedesktop.org/git/poppler/poppler.git \ | |
159 | + git://git.freedesktop.org/git/poppler/test.git \ | |
160 | + https://github.com/Flameeyes/unpaper.git \ | |
161 | + https://github.com/ocaml/ocaml.git \ | |
162 | + https://gitlab.camlcity.org/gerd/lib-findlib.git \ | |
163 | + https://github.com/johnwhitington/camlpdf.git \ | |
164 | + https://github.com/johnwhitington/cpdf-source.git \ | |
165 | + http://git.ghostscript.com/ghostpdl.git \ | |
166 | + ; do git clone $i; done | |
167 | + | |
168 | + wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip | |
169 | + unzip pdftk-2.02-src.zip | |
170 | + rm -f pdftk-2.02-src.zip | |
171 | + | |
172 | + # pdftk, versão 2.02 ou superior | |
173 | + cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../.. | |
174 | + | |
175 | + # Ghostscript 9.18 ou superior | |
176 | + #wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.21.tar.gz | |
177 | + #tar xvozf ghostscript-9.21.tar.gz | |
178 | + #rm -f ghostscript-9.21.tar.gz | |
179 | + #cd ghostscript-9.21 | |
180 | + cd ghostpdl | |
181 | + ./autogen.sh; ./configure | |
182 | + make all install | |
183 | + cd .. | |
184 | + | |
185 | + # Centos 6.9 | |
186 | + # \_ Cria um novo shell usando o GCC 4.8 por default | |
187 | + scl enable devtoolset-2 bash | |
188 | + | |
189 | + # Tesseract, versão 3.05-dev ou superior | |
190 | + # Bibliotecas para o Tesseract: Leptonica e Libav | |
191 | + cd leptonica && ./autobuild && ./configure && make all install && cd .. | |
192 | + | |
193 | + # Para compilação do Tesseract após a compilação do leptonica | |
194 | + export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ | |
195 | + | |
196 | + cd libav && ./configure --enable-sram && make all install && cd .. | |
197 | + | |
198 | + # Tesseract | |
199 | + cd tesseract && ./autogen.sh && ./configure && make all install && cd .. | |
200 | + cp -avR tessdata/* /usr/local/share/tessdata/ | |
201 | + | |
202 | + # cpdf, versão 2.1 ou superior | |
203 | + cd ocaml && ./configure && make world.opt && make install && cd .. | |
204 | + mkdir -p /usr/local/man/man5 | |
205 | + # lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente | |
206 | + cd lib-findlib && ./configure && make all && make install && cd .. | |
207 | + cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd .. | |
208 | + cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd .. | |
209 | + | |
210 | + # poppler-utils, versão 0.42.0 ou superior | |
211 | + cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && cd .. | |
212 | + | |
213 | + # Centos 6.9 | |
214 | + # \_ Termina o shell usando o GCC 4.8 por default | |
215 | + exit | |
216 | + | |
217 | + | |
218 | +## Comandos adicionais para configuração do módulo: | |
219 | + | |
220 | + # Criação do usuário | |
221 | + adduser ocr | |
222 | + | |
223 | + # Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional | |
224 | + cp ./usr/local/bin/ocr /usr/local/bin | |
225 | + | |
226 | + # Auto start (RedHat 6.7 e CentOs 6.9) | |
227 | + cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr | |
228 | + mv /etc | |
229 | + chkconfig --add ocr | |
230 | + chkconfig --level 2345 ocr on | |
231 | + | |
232 | + # Auto start (Ubuntu 14.04) | |
233 | + cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr | |
234 | + update-rc.d ocr defaults | |
235 | + | |
236 | + # Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações | |
237 | + cd /home/ocr | |
238 | + tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr | |
239 | + su | |
240 | + | |
241 | +# INSTALAÇÃO (obs.: os comandos devem ser executados como root) | |
242 | + # Criação do usuário | |
243 | + adduser ocr | |
244 | + | |
245 | + # Copie o pacote para os outros servidores e extraia com: | |
246 | + cd / | |
247 | + tar xovzf pkg-ocr.tgz | |
248 | + | |
249 | + # Instalando pré-requisitos RUNTIME em servidores adicionais | |
250 | + | |
251 | + # Redhat 6.7 e CentOS 6.9 | |
252 | + yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp ghostscript | |
253 | + yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext | |
254 | + | |
255 | + # Ubuntu 14.04 | |
256 | + apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 | |
257 | + apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 ghostscript | |
258 | + | |
259 | +# Inicie o serviço com | |
260 | + service ocr start | ... | ... |
... | ... | @@ -0,0 +1,17 @@ |
1 | +#!/usr/bin/env bash | |
2 | + | |
3 | +# Inicializa serviço de log | |
4 | +/etc/init.d/rsyslog start | |
5 | + | |
6 | +# Cria estrutura de pastas para monitoramento de arquivos | |
7 | +mkdir -p /var/ocr-server/ | |
8 | +mkdir -p /var/ocr-server/Entrada | |
9 | +mkdir -p /var/ocr-server/Saida | |
10 | +mkdir -p /var/ocr-server/Originais_Processados | |
11 | +mkdir -p /var/ocr-server/Erro | |
12 | +chmod -R 777 /var/ocr-server | |
13 | + | |
14 | +# Iniciar serviço do OCR-Server | |
15 | +service ocr start | |
16 | + | |
17 | +tail -f /var/log/syslog | |
0 | 18 | \ No newline at end of file | ... | ... |
usr/local/bin/ocr
1 | -#! /usr/bin/perl -w | |
1 | +#!/usr/bin/perl -w | |
2 | 2 | # |
3 | -# OCR Server 1.0.4 - (c) Agencia Nacional de Telecomunicacoes | |
3 | +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes | |
4 | 4 | # |
5 | 5 | # This script monitors a set of input directories for PDF files |
6 | 6 | # once a new file is detected, it is processed through tesseract OCR
... | ... | @@ -24,15 +24,38 @@ |
24 | 24 | # 1.0.1 Solving error when file has no images |
25 | 25 | # 1.0.2 Fix bug when counting cores for AMD processors |
26 | 26 | # 1.0.3 Added better image type detection |
27 | -# 1.0.4 Fix: added ubuntu init script | |
27 | +# 1.0.4 Fix: added ubuntu init script | |
28 | +# 1.0.4b Add Centos 6.9 install instructions | |
29 | +# 2.0 PDF/A output, and better compression with ghostscript --> for this to work, Tesseract 4.0 is | |
30 | +# strongly recommended | |
31 | +# Rewritten image extraction, processing and transformation process | |
32 | +# Check if input file is signed, in this case, does not change the file contents | |
33 | +# Added '--oem 0' option to tesseract (force legacy mode on tesseract 4) | |
34 | +# Use operating system packages by default | |
35 | +# Changed paths from external programs, instead of using full paths, uses first match from $PATH | |
36 | +# Check existence of external programs on path before running | |
37 | +# Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings | |
38 | +# Fix: create subpaths on error folder | |
39 | +# Fix: trying to reduce overhead on temporary folder | |
28 | 40 | # |
29 | 41 | # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it |
30 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | |
31 | -# diferently but does not treat it adequately | |
42 | +# would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them | |
43 | +# differently but does not treat it adequately -- shall require better handling of the pdf's internal structure | |
32 | 44 | # - Review poppler and cpdf install instructions |
33 | 45 | # - Add better handling of vectorized and non scanned pdf files |
34 | -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | |
35 | -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible | |
46 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current | |
47 | +# scaling, cropping and rotation handlers | |
48 | +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- | |
49 | +# added function to analyse image color histogram -> just need to add option to convert it to B&W. | |
50 | +# - Move all parameters to config file | |
51 | +# - Add some job control web interface | |
52 | +# - Add end user interface to submit files through web | |
53 | +# - Add check external programs version requirements before running | |
54 | +# | |
55 | +# BUGS: - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than | |
56 | +# original, this is due to using pdftoppm instead of pdfimages | |
57 | +# - Although not properly a BUG, in the new version, the addition of a step to convert to PDF/A and other evolutions | |
58 | +# increased significantly the time to OCR a page, from a mean time of 1 secs/page to 3 secs/page on a 16 core server | |
36 | 59 | # |
37 | 60 | # Check software requirements in the comments below
38 | 61 | # |
... | ... | @@ -55,7 +78,7 @@ use IPC::Open3; |
55 | 78 | use IO::Select; |
56 | 79 | |
57 | 80 | my $DEBUG = 0; |
58 | -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); | |
81 | +my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); | |
59 | 82 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; |
60 | 83 | |
61 | 84 | my $USER = 'ocr'; |
... | ... | @@ -63,39 +86,48 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca |
63 | 86 | |
64 | 87 | # Command dependencies |
65 | 88 | |
66 | -# depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher | |
67 | -my $TESSERACT = '/usr/local/bin/tesseract -l por+eng'; | |
89 | +# depends on tesseract-ocr and tesseract-ocr-por 3.05-dev or higher -- for pdf/a Tesseract 4.0 is recommended | |
90 | +my $TESSERACT = 'tesseract --oem 0'; # if Tesseract => 4.0 | |
91 | +#my $TESSERACT = 'tesseract'; # if Tesseract < 4.0 | |
68 | 92 | |
69 | 93 | # Depends on pdftk 2.02 or higher |
70 | -my $PDFTK = '/usr/local/bin/pdftk'; | |
94 | +my $PDFTK = 'pdftk'; | |
71 | 95 | |
72 | 96 | # Depends on poppler-utils 0.42.0 or higher |
73 | -#my $PDINFO = '/usr/local/bin/pdfinfo'; | |
74 | -my $PDFFONTS = '/usr/local/bin/pdffonts'; | |
75 | -my $PDFIMAGES = '/usr/local/bin/pdfimages'; | |
76 | -my $PDFTOPPM = '/usr/local/bin/pdftoppm'; | |
97 | +my $PDFFONTS = 'pdffonts'; | |
98 | +my $PDFIMAGES = 'pdfimages'; | |
99 | +my $PDFTOPPM = 'pdftoppm'; | |
100 | +my $PDFUNITE = 'pdfunite'; | |
101 | +my $PDFSIG = 'pdfsig'; | |
77 | 102 | |
78 | 103 | # Depends on cpdf 2.1 or higher |
79 | -my $CPDF = '/usr/local/bin/cpdf'; | |
104 | +my $CPDF = 'cpdf'; | |
105 | + | |
106 | +# Depends on Ghostscript 9.18 | |
107 | +my $GS = 'gs'; | |
80 | 108 | |
81 | 109 | ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner |
82 | -my $CONVERT = '/usr/bin/convert'; | |
110 | +my $CONVERT = 'convert'; | |
83 | 111 | |
84 | 112 | # If it is needed further filtering |
85 | 113 | #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; |
86 | 114 | |
87 | -my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', | |
88 | - '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); | |
115 | +#my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', | |
116 | +# '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); | |
117 | + | |
118 | +my @BASE_DIRS = ('/var/ocr-server/'); | |
89 | 119 | |
90 | 120 | my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' ); |
91 | 121 | |
92 | 122 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); |
93 | -%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); | |
123 | +%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG==2); | |
94 | 124 | |
95 | 125 | # Safeguard in case cpuinfo has not correctly identified the number of CPUs
96 | 126 | $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; |
97 | 127 | |
98 | -$ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; | |
128 | +$ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin'; | |
129 | +$ENV{'IFS'} = '\t\n'; | |
130 | + | |
99 | 131 | my ($host) = split/\./,hostname; |
100 | 132 | |
101 | 133 | use vars qw/*name *dir *prune/; |
... | ... | @@ -107,14 +139,15 @@ sub main; |
107 | 139 | sub get_pages; |
108 | 140 | sub get_rotation; |
109 | 141 | sub get_res; |
110 | -sub is_ocred; | |
111 | 142 | sub is_locked_ex; |
112 | 143 | |
113 | 144 | |
114 | 145 | my $expr = 'use POSIX qw(setsid)'; |
115 | 146 | |
116 | 147 | my ($dumb1, $dumb2, $uid) = getpwnam ($USER); |
117 | -setuid ($uid) or warn "Cant set uid $uid"; | |
148 | +if (defined $uid) { | |
149 | + setuid ($uid) or warn "Cant set uid $uid"; | |
150 | +} | |
118 | 151 | |
119 | 152 | $SIG{__DIE__} = 'DEFAULT'; |
120 | 153 | $SIG{__WARN__} = \&die_when_called; |
... | ... | @@ -126,6 +159,11 @@ if ($@) { |
126 | 159 | chdir('/') or die "$0: cannot chdir '/': $!\n"; |
127 | 160 | open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n"; |
128 | 161 | |
162 | +foreach my $exec ( $TESSERACT, $PDFTK, $PDFFONTS, $PDFIMAGES, $PDFSIG, $CPDF, $GS, $CONVERT) { | |
163 | + die "Error: $exec not found on path: $ENV{PATH}, check dependencies\n" if ( `which $exec | wc -l ` == 0); | |
164 | +} | |
165 | + | |
166 | + | |
129 | 167 | foreach my $DIR (@BASE_DIRS) { |
130 | 168 | |
131 | 169 | defined(my $pid = fork) or die "$0: cannot fork: $!\n"; |
... | ... | @@ -135,7 +173,7 @@ foreach my $DIR (@BASE_DIRS) { |
135 | 173 | main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR}); |
136 | 174 | exit 0; |
137 | 175 | last; |
138 | - } | |
176 | + } | |
139 | 177 | } |
140 | 178 | |
141 | 179 | exit 0; |
... | ... | @@ -157,7 +195,7 @@ sub main { |
157 | 195 | # remove .tmp file |
158 | 196 | unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) ); |
159 | 197 | |
160 | - # Rename files that were in 'processig' back | |
198 | + # Rename files that were in 'processing' state back | |
161 | 199 | foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) { |
162 | 200 | my $old_name = $file; |
163 | 201 | $old_name =~ s/\.${host}\.processing$//g; |
... | ... | @@ -177,12 +215,14 @@ sub main { |
177 | 215 | # Main loop |
178 | 216 | while ( 1 ) { |
179 | 217 | select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced |
218 | + | |
180 | 219 | $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} )); |
181 | 220 | print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in; |
182 | 221 | $count = scalar keys %files_in; |
183 | - foreach my $file (keys %files_in) { | |
184 | 222 | |
185 | - next if ( glob ("$file.*.tmp")); | |
223 | + foreach my $file (sort { ((-f $a) ? (stat $a)[9] : 0) <=> ((-f $b) ? (stat $b)[9] : 0)} keys %files_in ) { | |
224 | + | |
225 | + next if ( glob ("\"$file.*.tmp\"")); | |
186 | 226 | |
187 | 227 | select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds |
188 | 228 | next if (!defined $files_in{$file}); # continue only if it is still valid |
... | ... | @@ -255,7 +295,7 @@ sub ocr { |
255 | 295 | remove_tree ($tmpdir,{ error=> \my $dumb }); |
256 | 296 | unlink ("$in_file.$host.tmp"); |
257 | 297 | move ( "$in_file.$host.processing", $in_file); |
258 | - exit 0; | |
298 | + exit 1; | |
259 | 299 | }; |
260 | 300 | |
261 | 301 | my $out_path = $in_path; |
... | ... | @@ -271,7 +311,7 @@ sub ocr { |
271 | 311 | my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); |
272 | 312 | |
273 | 313 | print "\twritting to $out_file\n" if $DEBUG; |
274 | - | |
314 | + | |
275 | 315 | my $stime = time; |
276 | 316 | my %pids; |
277 | 317 | |
... | ... | @@ -291,8 +331,26 @@ sub ocr { |
291 | 331 | remove_tree ($tmpdir,{ error=> \my $dumb }); |
292 | 332 | unlink ("$in_file.$host.tmp"); |
293 | 333 | move ( "$in_file.$host.processing", $in_file); |
334 | + print "Error: cannot copy $in_file to temp dir \n" if $DEBUG; | |
335 | + syslog ("error","cannot copy $in_file to temp dir") if !$DEBUG; | |
336 | + exit 1; | |
294 | 337 | }; |
295 | 338 | |
339 | + # Check if file was signed | |
340 | + if (get_sign($tmp_file)) { | |
341 | + if (!copy ("$in_file.$host.processing", $proc_file)) { | |
342 | + remove_tree ($tmpdir,{ error=> \my $dumb }); | |
343 | + unlink ("$in_file.$host.tmp"); | |
344 | + move ( "$in_file.$host.processing", $in_file); | |
345 | + }; | |
346 | + move ("$in_file.$host.processing", $out_file); | |
347 | + unlink ("$in_file.$host.tmp"); | |
348 | + print "OCR processed: $in_file not OCRed due to having a signature within" if $DEBUG; | |
349 | + syslog ("info","OCR processed: $in_file not OCRed due to having a signature within") if !$DEBUG; | |
350 | + | |
351 | + exit 0; | |
352 | + } | |
353 | + | |
296 | 354 | # Extract pages |
297 | 355 | ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf"); |
298 | 356 | if ($DEBUG) { |
... | ... | @@ -301,12 +359,13 @@ sub ocr { |
301 | 359 | print "\t\t\t$_" for @err ; |
302 | 360 | }; |
303 | 361 | |
362 | + my ($pages, @pg_w, @pg_h, @pg_r, @pg_crop_x1, @pg_crop_y1, @pg_crop_x2, @pg_crop_y2); | |
363 | + $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r, \@pg_crop_x1, \@pg_crop_y1, \@pg_crop_x2, \@pg_crop_y2); | |
304 | 364 | |
305 | - my ($pages, @pg_w, @pg_h, @pg_r); | |
306 | - $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r); | |
365 | + my ($imgs,@page_img, @img_w, @img_h, @img_t, @img_xppi, @img_yppi); | |
366 | + $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t, \@img_xppi, \@img_yppi); | |
307 | 367 | |
308 | - my ($imgs,@page_img, @img_w, @img_h, @img_t); | |
309 | - $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t); | |
368 | + unlink ($tmp_file) if (!$DEBUG); | |
310 | 369 | |
311 | 370 | for ( my $i=0; $i< $pages; $i++ ) { |
312 | 371 | my $pg = sprintf ("pg_%06d", $i+1); |
... | ... | @@ -333,25 +392,29 @@ sub ocr { |
333 | 392 | if (! defined $img_t[$i] ) { |
334 | 393 | move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); |
335 | 394 | print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG; |
336 | - exit 0; | |
395 | + exit -1; | |
337 | 396 | } |
338 | 397 | |
339 | - print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i]\n" if $DEBUG; | |
398 | + print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i] " if $DEBUG; | |
399 | + print "(cropbox: $pg_crop_x1[$i] x $pg_crop_y1[$i] - $pg_crop_x2[$i] x $pg_crop_y2[$i])\n" if (defined $pg_crop_x1[$i] && $DEBUG); | |
400 | + print "\n" if ($DEBUG); | |
340 | 401 | |
402 | + # Extract images from page, since 2.0 uses png lossless format regardless of original format or depth | |
341 | 403 | undef $cmd; |
342 | 404 | |
343 | - if ($img_t[$i] eq "gray") { | |
344 | - $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
405 | + # Use PDFIMAGES and JPEG by default | |
406 | + $cmd = "${PDFIMAGES} -j \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
407 | + | |
408 | + if ($img_t[$i] eq "stencil") { | |
409 | + $cmd = "${PDFTOPPM} -tiff -tiffcompression deflate -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
345 | 410 | } |
346 | 411 | |
347 | - if ($img_t[$i] eq "rgb") { | |
348 | - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
349 | - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM | |
412 | + if ($img_t[$i] eq "gray") { | |
413 | + $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
350 | 414 | } |
351 | 415 | |
352 | - if (!defined $cmd) { | |
353 | - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
354 | - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM | |
416 | + if ($img_t[$i] !~ /gray|rgb|stencil/) { | |
417 | + $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
355 | 418 | } |
356 | 419 | |
357 | 420 | ($exit,$cmd,@out,@err) = exec_cmd($cmd); |
... | ... | @@ -362,7 +425,13 @@ sub ocr { |
362 | 425 | }; |
363 | 426 | |
364 | 427 | # Process each resulting image for page pdf |
365 | - my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif)/i , in => ${tmpdir} )) ; | |
428 | + my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif|tiff|jpeg|jp2|jb2|png)/i , in => ${tmpdir} )) ; | |
429 | + | |
430 | + if (scalar @images == 0) { | |
431 | + move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); | |
432 | + print "\t\t${in_file}: ".(${i}+1)." / $pages: Page was not exported as a tesseract supported format -- not OCRing\n" if $DEBUG; | |
433 | + exit 0; | |
434 | + } | |
366 | 435 | |
367 | 436 | foreach my $image (@images) { |
368 | 437 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; |
... | ... | @@ -378,43 +447,65 @@ sub ocr { |
378 | 447 | print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; |
379 | 448 | } |
380 | 449 | } |
381 | - | |
382 | - # Check if page was rotated | |
383 | - if ($pg_r[$i]) { | |
384 | - print "\t\t\t${image} unrotate: $pg_r[$i] graus ".(${i}+1)." / $pages\n" if $DEBUG; | |
385 | - ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate $pg_r[$i] \"$image\""); | |
450 | + | |
451 | + # Check if page was rotated and extracted with pdftoppm | |
452 | + if ($cmd =~ /\Q$PDFTOPPM/ && $pg_r[$i]) { | |
453 | + print "\t\t\t${image} unrotate: -$pg_r[$i] degs ".(${i}+1)." / $pages\n" if $DEBUG; | |
454 | + ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate ". (360 - $pg_r[$i])." \"$image\""); | |
386 | 455 | if ($DEBUG) { |
387 | 456 | print "\t\t\t${image} -> $cmd: $exit\n"; |
388 | 457 | print "\t\t\t\t$_" for @out ; |
389 | 458 | print "\t\t\t\t$_" for @err ; |
390 | 459 | }; |
391 | 460 | } |
392 | - | |
461 | + | |
393 | 462 | # Filter ppm images, if needed |
394 | 463 | |
395 | 464 | # OCR ppm images to pdf pages |
396 | - ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} \"${image}\" \"${image}\" pdf"); | |
465 | + ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} -l por+eng \"${image}\" \"${image}\" pdf"); | |
397 | 466 | if ($DEBUG) { |
398 | 467 | print "\t\t\t${image} -> $cmd: $exit\n"; |
399 | 468 | print "\t\t\t\t$_" for @out ; |
400 | 469 | print "\t\t\t\t$_" for @err ; |
401 | 470 | }; |
471 | + unlink ("$image") if (!$DEBUG); | |
402 | 472 | |
403 | - # Scale to fit pdf | |
404 | - ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); | |
473 | + # Scale, crop and rotate to fit pdf | |
474 | + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); | |
405 | 475 | if ($DEBUG) { |
406 | 476 | print "\t\t\t${image} -> $cmd: $exit\n"; |
407 | 477 | print "\t\t\t\t$_" for @out ; |
408 | 478 | print "\t\t\t\t$_" for @err ; |
409 | 479 | }; |
480 | + unlink ("$image.pdf") if (!$DEBUG); | |
410 | 481 | |
482 | + if (defined $pg_crop_x1[$i]) { | |
483 | + # adjust cropbox | |
484 | + ($pg_crop_x1[$i], $pg_crop_y1[$i],$pg_crop_x2[$i],$pg_crop_y2[$i]) = ( | |
485 | + ($pg_crop_x1[$i]<$pg_crop_x2[$i]?$pg_crop_x1[$i]:$pg_crop_x2[$i]), | |
486 | + ($pg_crop_y1[$i]<$pg_crop_y2[$i]?$pg_crop_y1[$i]:$pg_crop_y2[$i]), | |
487 | + abs($pg_crop_x2[$i]-$pg_crop_x1[$i]),abs($pg_crop_y2[$i]- $pg_crop_y1[$i]) | |
488 | + ); | |
489 | + | |
490 | + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -crop \"$pg_crop_x1[$i] $pg_crop_y1[$i] $pg_crop_x2[$i] $pg_crop_y2[$i]\" \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf"); | |
491 | + if ($DEBUG) { | |
492 | + print "\t\t\t${image} -> $cmd: $exit\n"; | |
493 | + print "\t\t\t\t$_" for @out ; | |
494 | + print "\t\t\t\t$_" for @err ; | |
495 | + }; | |
496 | + } | |
497 | + | |
498 | + if ($pg_r[$i]) { | |
499 | + ($exit,$cmd, @out,@err) = exec_cmd( "${CPDF} -rotate $pg_r[$i] \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf"); | |
500 | + if ($DEBUG) { | |
501 | + print "\t\t\t${image} -> $cmd: $exit\n"; | |
502 | + print "\t\t\t\t$_" for @out ; | |
503 | + print "\t\t\t\t$_" for @err ; | |
504 | + }; | |
505 | + } | |
411 | 506 | |
412 | - unlink ("${tmpdir}/${pg}.pdf") if (!$DEBUG); | |
413 | - unlink ("$image.pdf") if (!$DEBUG); | |
414 | - move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}.pdf.old") if ($DEBUG); | |
415 | - unlink ("$image") if (!$DEBUG); | |
416 | 507 | } |
417 | - exit 0; | |
508 | + exit 1; | |
418 | 509 | } |
419 | 510 | } |
420 | 511 | |
... | ... | @@ -427,28 +518,51 @@ sub ocr { |
427 | 518 | |
428 | 519 | if (scalar @new_pages != $pages) { |
429 | 520 | print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); |
430 | - syslog ("info","OCR: $in_file, number of output pages differ") if (!$DEBUG); | |
521 | + syslog ("err","OCR: $in_file, number of output pages differ") if (!$DEBUG); | |
431 | 522 | unlink "$in_file.$host.tmp"; |
523 | + make_path ($error_path) if ( ! -d $error_path); | |
432 | 524 | move ("$in_file.$host.processing", $error_file); |
433 | - exit (0); | |
525 | + exit (1); | |
434 | 526 | } |
435 | 527 | |
436 | - # Merge resulting pdf pages to a single pdf | |
528 | + # Merge resulting pdf pages to a single pdf, convert to PDF/A and copy to output | |
437 | 529 | make_path ($out_path) if ( ! -d $out_path); |
438 | 530 | unlink $out_file if ( -f $out_file ); |
439 | - ($exit, $cmd, @out,@err) = exec_cmd("${PDFTK} \"${tmpdir}\"/pg_*-cpdf.pdf cat output \"${out_file}.tmp\" compress"); | |
531 | + | |
532 | + chdir (${tmpdir}); | |
533 | + ($exit, $cmd, @out,@err) = exec_cmd("${GS} -dQUIET -dBATCH -dNOPAUSE -dNOINTERPOLATE -dCompatibilityLevel=1.7 -dNumRenderingThreads=${MAX_PGS} -sDEVICE=pdfwrite -dAutoRotatePages=/None -sColorConversionStrategy=/RGB -sProcessColorModel=DeviceRGB -dAutoFilterColorImages=true -dAutoFilterGrayImages=true -dJPEGQ=95 -dPDFA=2 -dPDFACompatibilityPolicy=1 -sOutputFile=\"${tmp_file}\" pg_*-cpdf.pdf "); | |
440 | 534 | if ($DEBUG) { |
441 | 535 | print "\t\t${out_file} -> $cmd: $exit\n"; |
442 | 536 | print "\t\t\t$_" for @out ; |
443 | 537 | print "\t\t\t$_" for @err ; |
444 | 538 | }; |
539 | + if ($exit) { | |
540 | + unlink "$in_file.$host.tmp"; | |
541 | + unlink $out_file; | |
542 | + make_path ($error_path) if ( ! -d $error_path); | |
543 | + move ("$in_file.$host.processing", $error_file); | |
544 | + print "\t\t${out_file} -> Error concatenating pages and converting to PDF/A (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); | |
545 | + syslog ("err","OCR: $in_file, error concatenating pages and converting to PDF/A") if (!$DEBUG); | |
546 | + exit (1); | |
547 | + } | |
548 | + chdir ("/"); | |
549 | + | |
550 | + if (!copy (${tmp_file}, $out_file)) { | |
551 | + remove_tree ($tmpdir,{ error=> \my $dumb }); | |
552 | + unlink ("$in_file.$host.tmp"); | |
553 | + unlink $out_file; | |
554 | + make_path ($error_path) if ( ! -d $error_path); | |
555 | + move ("$in_file.$host.processing", $error_file); | |
556 | + print "Error: cannot copy temp file to $out_file \n" if $DEBUG; | |
557 | + syslog ("error","cannot copy temp file to $out_file") if !$DEBUG; | |
558 | + exit 1; | |
559 | + }; | |
445 | 560 | |
446 | 561 | make_path ($proc_path) if ( ! -d $proc_path); |
447 | 562 | unlink $proc_file if ( -f $proc_file ); |
448 | 563 | move ("$in_file.$host.processing", $proc_file); |
449 | 564 | move ("${out_file}.tmp", ${out_file}); |
450 | 565 | |
451 | - | |
452 | 566 | # Remove temp dir |
453 | 567 | remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG); |
454 | 568 | unlink $tmp_file if (!$DEBUG); |
... | ... | @@ -471,7 +585,7 @@ sub is_ocred { |
471 | 585 | } |
472 | 586 | |
473 | 587 | sub get_pages { |
474 | - my ($in_file, $w, $h, $r) = @_; | |
588 | + my ($in_file, $w, $h, $r, $x1, $y1, $x2, $y2) = @_; | |
475 | 589 | |
476 | 590 | my $pages=0; |
477 | 591 | my $i=0; |
... | ... | @@ -485,29 +599,35 @@ sub get_pages { |
485 | 599 | ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ ); |
486 | 600 | ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ ); |
487 | 601 | ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ ); |
488 | - ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); | |
602 | + ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); | |
603 | + ($dumb, @$x1[$i-1], @$y1[$i-1], @$x2[$i-1], @$y2[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaCropRect:/ ); | |
489 | 604 | } |
490 | 605 | |
491 | 606 | return $pages; |
492 | 607 | } |
493 | 608 | |
494 | 609 | sub get_imgs { |
495 | - my ($in_file, $page_img, $w, $h, $t) = @_; | |
496 | - my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc); | |
610 | + my ($in_file, $page_img, $w, $h, $t, $x_ppi, $y_ppi) = @_; | |
611 | + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi ); | |
497 | 612 | |
498 | 613 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); |
614 | + $i = 0; | |
499 | 615 | |
500 | 616 | foreach my $line (@lines) { |
501 | 617 | chomp $line; |
502 | 618 | $line =~ s/^ {1,}//; |
503 | - if ( $line =~ /image|mask/ ) { | |
504 | - ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line; | |
619 | + if ( $line !~ /^page|^----/ ) { | |
620 | + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi) = split / {1,}/,$line; | |
505 | 621 | @$page_img[$page-1]=$i; |
506 | 622 | @$w[$page-1] = $width; |
507 | 623 | @$h[$page-1] = $height; |
508 | 624 | @$t[$page-1] = "rgb"; # Default is color |
509 | - @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | |
510 | 625 | @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); |
626 | + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | |
627 | + @$t[$page-1] = ( $type eq "stencil" ? $type : @$t[$page-1]); | |
628 | + @$t[$page-1] = ( $enc eq "image" ? $enc : @$t[$page-1]); | |
629 | + @$x_ppi[$page-1] = $xppi; | |
630 | + @$y_ppi[$page-1] = $yppi; | |
511 | 631 | } |
512 | 632 | } |
513 | 633 | return $i+1; |
... | ... | @@ -542,6 +662,19 @@ sub get_res { |
542 | 662 | return ($res_x,$res_y); |
543 | 663 | } |
544 | 664 | |
665 | +sub get_sign { | |
666 | + my ($in_file) = @_; | |
667 | + my @lines = `${PDFSIG} \"${in_file}\" 2>/dev/null`; | |
668 | + | |
669 | + foreach (@lines) { | |
670 | + chomp; | |
671 | + if ( $_ =~ /^Signature/ ) { | |
672 | + return 1; | |
673 | + } | |
674 | + } | |
675 | + return 0; | |
676 | +} | |
677 | + | |
545 | 678 | sub is_locked_ex { |
546 | 679 | my ($path) = @_; |
547 | 680 | ... | ... |
workflow.pdf
No preview for this file type
workflow.vsd
No preview for this file type