Commit f02dd8e43c99cc7ed8312d8ace23cc29f208aebb
Exists in
master
Merge branch 'Pre_versao_2.0' into 'master'
Pre versao 2.0 final para gerar Tag 2.0 Final See merge request !4
Showing
7 changed files
with
583 additions
and
272 deletions
Show diff stats
@@ -0,0 +1,103 @@ | @@ -0,0 +1,103 @@ | ||
1 | + | ||
2 | +FROM ubuntu:14.04 | ||
3 | + | ||
4 | +# Cópia de arquivos do projeto OCR-SERVER | ||
5 | +COPY usr/local/bin/ocr /usr/local/bin/ocr | ||
6 | +COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr | ||
7 | +COPY entrypoint.sh /entrypoint.sh | ||
8 | + | ||
9 | +WORKDIR /tmp | ||
10 | + | ||
11 | +# Instalação dos pacotes pré-requisitos do ocr-server 2 | ||
12 | +RUN apt-get -y update && \ | ||
13 | + apt-get -y install build-essential cmake libtool yasm pkg-config subversion git libgcj14 apt-utils \ | ||
14 | + curl libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev \ | ||
15 | + zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libnss3-dev \ | ||
16 | + wget cabextract xfonts-utils perl automake autoconf-archive libcurl4-gnutls-dev unzip libgcj14 \ | ||
17 | + libfile-find-rule-perl libfile-find-rule-perl-perl imagemagick gettext unpaper libtiff5 libpng12-0 \ | ||
18 | + libjpeg-turbo8 libpango1.0-0 libcairo2 fontconfig libwebp5 libfontconfig1 libgettextpo0 pkg-config gcc gcj-jdk \ | ||
19 | + rsyslog libsys-syslog-perl && \ | ||
20 | + apt-get -y clean all | ||
21 | + | ||
22 | +RUN wget -O mscorefonts.deb http://ftp.us.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.4+nmu1_all.deb && \ | ||
23 | + dpkg -i mscorefonts.deb && \ | ||
24 | + rm mscorefonts.deb | ||
25 | + | ||
26 | +# Instalação do Perl 5.1 e demais módulos | ||
27 | +RUN perl -MCPAN -e 'install File::Touch' | ||
28 | +RUN perl -MCPAN -e 'install File::Find::Rule;' | ||
29 | +RUN perl -MCPAN -e 'install File::Touch;' | ||
30 | +RUN perl -MCPAN -e 'install Sys::Syslog;' | ||
31 | +RUN perl -MCPAN -e 'install IPC::Open3;' | ||
32 | +RUN perl -MCPAN -e 'install IO::Select;' | ||
33 | + | ||
34 | +# Tesseract-ocr 3.05, com dicionários inglês e português | ||
35 | +# Bibliotecas para o Tesseract: Leptonica | ||
36 | +RUN git clone https://github.com/DanBloomberg/leptonica.git && \ | ||
37 | + cd leptonica && ./autobuild && ./configure && make all install && \ | ||
38 | + rm -rf ../leptonica | ||
39 | + | ||
40 | +# Bibliotecas para o Tesseract: Libav | ||
41 | +RUN git clone https://github.com/libav/libav.git && \ | ||
42 | + export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ && \ | ||
43 | + cd libav && ./configure --enable-sram && make all install && \ | ||
44 | + rm -rf ../libav | ||
45 | + | ||
46 | +# Tesseract 3.05.01 | ||
47 | +RUN git clone https://github.com/tesseract-ocr/tesseract.git && \ | ||
48 | + cd tesseract && ./autogen.sh && ./configure && make all install && \ | ||
49 | + rm -rf ../tesseract | ||
50 | + | ||
51 | +RUN wget https://github.com/tesseract-ocr/tessdata/blob/master/eng.traineddata?raw=true -O /usr/local/share/tessdata/eng.traineddata && \ | ||
52 | + wget https://github.com/tesseract-ocr/tessdata/blob/master/por.traineddata?raw=true -O /usr/local/share/tessdata/por.traineddata && \ | ||
53 | + wget https://github.com/tesseract-ocr/tessdata/blob/master/osd.traineddata?raw=true -O /usr/local/share/tessdata/osd.traineddata | ||
54 | + | ||
55 | +# Poppler 0.56 | ||
56 | +RUN git clone -b poppler-0.56 https://anongit.freedesktop.org/git/poppler/poppler.git && \ | ||
57 | + cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && \ | ||
58 | + rm -rf ../poppler | ||
59 | + | ||
60 | +# pdftk, versão 2.02 ou superior | ||
61 | +RUN wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip && \ | ||
62 | + unzip pdftk-2.02-src.zip && rm -f pdftk-2.02-src.zip && \ | ||
63 | + cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && \ | ||
64 | + rm -rf ../pdftk-2.02-dist | ||
65 | + | ||
66 | +# Ghostscript 9.18 ou superior | ||
67 | +RUN wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz && \ | ||
68 | + tar xvozf ghostscript-9.18.tar.gz && rm -f ghostscript-9.18.tar.gz && \ | ||
69 | + cd ghostscript-9.18 && ls && ./autogen.sh; ./configure && make all install && \ | ||
70 | + rm -rf ../ghostscript-9.18 | ||
71 | + | ||
72 | +# CPDF Linux Intel 64 bits v 2.2 (binário pré-compilado) | ||
73 | +RUN git clone https://github.com/coherentgraphics/cpdf-binaries.git && \ | ||
74 | + cp cpdf-binaries/Linux-Intel-64bit/cpdf /usr/bin | ||
75 | + | ||
76 | +# Atualização das configurações do ld | ||
77 | +RUN ldconfig | ||
78 | + | ||
79 | +RUN useradd -m ocr | ||
80 | + | ||
81 | +RUN chmod +x /usr/local/bin/ocr && \ | ||
82 | + chmod +x /etc/init.d/ocr && \ | ||
83 | + update-rc.d ocr defaults | ||
84 | + | ||
85 | +RUN mkdir /var/ocr-server/ && \ | ||
86 | + mkdir -p /var/ocr-server/Entrada && \ | ||
87 | + mkdir -p /var/ocr-server/Saida && \ | ||
88 | + mkdir -p /var/ocr-server/Originais_Processados && \ | ||
89 | + mkdir -p /var/ocr-server/Erro && \ | ||
90 | + chmod +x /entrypoint.sh | ||
91 | + | ||
92 | +RUN mkdir -p /tmp/ocr_dev/ && \ | ||
93 | + mkdir -p /tmp/ocr_dev/Entrada && \ | ||
94 | + mkdir -p /tmp/ocr_dev/Saida && \ | ||
95 | + mkdir -p /tmp/ocr_dev/Originais_Processados && \ | ||
96 | + mkdir -p /tmp/ocr_dev/Erro && \ | ||
97 | + chmod -R 777 /tmp/ocr_dev | ||
98 | + | ||
99 | +WORKDIR / | ||
100 | + | ||
101 | +VOLUME /var/ocr-server/ | ||
102 | + | ||
103 | +CMD ["bash", "/entrypoint.sh"] | ||
0 | \ No newline at end of file | 104 | \ No newline at end of file |
INSTALL.txt
@@ -1,202 +0,0 @@ | @@ -1,202 +0,0 @@ | ||
1 | -# OCR Server 1.0.4b - (c) Agencia Nacional de Telecomunicacoees | ||
2 | -# | ||
3 | -# This script monitors a set of input directories for PDF files | ||
4 | -# once a new file is detected, it is processes through tesseract OCR | ||
5 | -# in order to generate a new file with a hidden searchable text layer | ||
6 | -# | ||
7 | -# It may be distributed under the conditions of the LGPL v2.1 license. | ||
8 | -# | ||
9 | -# Author: Guilherme Chehab | ||
10 | -# | ||
11 | -# Version History: | ||
12 | -# 0.1 Initial single server version | ||
13 | -# 0.2 Check if page already has the html hidden layer, if so, ignore it | ||
14 | -# 0.3 Solved issues about various image enconding types | ||
15 | -# 0.4 Added a postnormalization step to ensure all output pdf pages have | ||
16 | -# the same size and orientations as the original files | ||
17 | -# 0.5 Used input file renaming as a way to sync multiple parallel instances, | ||
18 | -# that way, it is minimized the risk of same file being OCRed multiple times. | ||
19 | -# 0.6 Added a default handler for unknown image encoding using jpeg encoding | ||
20 | -# 0.7 Solved an issue with files with more than 1000 pages | ||
21 | -# 1.0 First release version | ||
22 | -# 1.0.1 Solving error when file has no images | ||
23 | -# 1.0.2 Fix bug when counting cores for AMD processors | ||
24 | -# 1.0.3 Added better image type detection | ||
25 | -# 1.0.4 Fix: added ubuntu init script | ||
26 | -# 1.0.4b Centos 6.9 | ||
27 | -# | ||
28 | -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it | ||
29 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | ||
30 | -# diferently but does not treat it adequately | ||
31 | -# - Review poppler and cpdf install instructions | ||
32 | -# - Add better handling of vectorized and non scanned pdf files | ||
33 | -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | ||
34 | -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible | ||
35 | -# | ||
36 | -# Check software requirements on the comments bellow | ||
37 | -# | ||
38 | -# To configure input dirs change @BASE_DIRS and @SUB_DIRS variables | ||
39 | -# | ||
40 | -# | ||
41 | -# O servidor OCR depende dos seguintes componentes: | ||
42 | -# - Perl 5.10.1, com seguintes módulos: | ||
43 | -# - File::Find::Rule | ||
44 | -# - File::Basename | ||
45 | -# - File::Copy | ||
46 | -# - File::Path | ||
47 | -# - File::Touch | ||
48 | -# - Sys::Syslog | ||
49 | -# - Sys::Hostname | ||
50 | -# - IPC::Open3 | ||
51 | -# - IO::Select | ||
52 | -# - POSIX | ||
53 | -# - Tesseract-ocr 3.05, com dicionários inglês e português | ||
54 | -# - Pdftk 2.02 | ||
55 | -# - Poppler-utils 0.42.0 | ||
56 | -# - Cpdf 2.1 | ||
57 | -# - ImageMagick 6.7.2-7 | ||
58 | -# | ||
59 | -# Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema | ||
60 | -# Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento. | ||
61 | -# Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes. | ||
62 | -# | ||
63 | -## ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado. | ||
64 | -# | ||
65 | -# Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr': | ||
66 | -# | ||
67 | -# @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script | ||
68 | -# @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro | ||
69 | -# $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2) | ||
70 | -# $MAX_PGS: Número máximo de páginas que podem ser processadas simultanemante por arquivo de entrada (default: no. de CPUs) | ||
71 | -# Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS. | ||
72 | -# Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos. | ||
73 | -# A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página. | ||
74 | -# | ||
75 | -# Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS. | ||
76 | -# | ||
77 | -# | ||
78 | -# ----------------------- COMPILAÇÃO dos pré requisitos (obs.: os comandos de devem ser executados como root) | ||
79 | -# | ||
80 | -# | ||
81 | -# Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS | ||
82 | -# | ||
83 | -# RedHat 6.7 e Centos 6.9: | ||
84 | -yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip | ||
85 | -yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel | ||
86 | -cd /tmp | ||
87 | -wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm | ||
88 | -rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm | ||
89 | -rm -f msttcore-fonts-2.0-3.noarch.rpm | ||
90 | - | ||
91 | -# Centos 6.9 | ||
92 | -# \_ autoconf-archive | ||
93 | -wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
94 | -rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
95 | -rm autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
96 | -# \_ GCC 4.8 | ||
97 | -wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo | ||
98 | -yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj | ||
99 | - | ||
100 | -# Ubuntu 14.04 Server: | ||
101 | -apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 | ||
102 | -apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev | ||
103 | -apt-get install ttf-mscorefonts-installer | ||
104 | - | ||
105 | -# Ambas plataformas: | ||
106 | -cd /usr/local/src | ||
107 | - | ||
108 | -for i in \ | ||
109 | - https://github.com/tesseract-ocr/langdata.git \ | ||
110 | - https://github.com/DanBloomberg/leptonica.git \ | ||
111 | - https://github.com/libav/libav.git \ | ||
112 | - https://github.com/tesseract-ocr/tessdata.git \ | ||
113 | - https://github.com/tesseract-ocr/tesseract.git \ | ||
114 | - git://git.freedesktop.org/git/poppler/poppler.git \ | ||
115 | - git://git.freedesktop.org/git/poppler/test.git \ | ||
116 | - https://github.com/Flameeyes/unpaper.git \ | ||
117 | - https://github.com/ocaml/ocaml.git \ | ||
118 | - https://gitlab.camlcity.org/gerd/lib-findlib.git \ | ||
119 | - https://github.com/johnwhitington/camlpdf.git \ | ||
120 | - https://github.com/johnwhitington/cpdf-source.git \ | ||
121 | -; do git clone $i; done | ||
122 | - | ||
123 | -wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip | ||
124 | -unzip pdftk-2.02-src.zip | ||
125 | -rm -f pdftk-2.02-src.zip | ||
126 | - | ||
127 | -# pdftk, versão 2.02 ou superior | ||
128 | -cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../.. | ||
129 | - | ||
130 | -# Centos 6.9 | ||
131 | -# \_ Cria um novo shell usando o GCC 4.8 por default | ||
132 | -scl enable devtoolset-2 bash | ||
133 | - | ||
134 | -# Tesseract, versão 3.05-dev ou superior | ||
135 | -# Bibliotecas para o Tesseract: Leptonica e Libav | ||
136 | -cd leptonica && ./autobuild && ./configure && make all install && cd .. | ||
137 | - | ||
138 | -# Para compilação do Tesseract após a compilação do leptonica | ||
139 | -export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ | ||
140 | - | ||
141 | -cd libav && ./configure --enable-sram && make all install && cd .. | ||
142 | - | ||
143 | -# Tesseract | ||
144 | -cd tesseract && ./autogen.sh && ./configure && make all install && cd .. | ||
145 | -cp -avR tessdata/* /usr/local/share/tessdata/ | ||
146 | - | ||
147 | -# cpdf, versão 2.1 ou superior | ||
148 | -cd ocaml && ./configure && make world.opt && make install && cd .. | ||
149 | -mkdir -p /usr/local/man/man5 | ||
150 | -# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente | ||
151 | -cd lib-findlib && ./configure && make all && make install && cd .. | ||
152 | -cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd .. | ||
153 | -cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd .. | ||
154 | - | ||
155 | -# poppler-utils, versão 0.42.0 ou superior | ||
156 | -cd poppler && ./autogen.sh && ./configure && make all install && cd .. | ||
157 | - | ||
158 | -# Centos 6.9 | ||
159 | -# \_ Termina o shell usando o GCC 4.8 por default | ||
160 | -exit | ||
161 | - | ||
162 | -# ----------------------- INSTALAÇÃO (obs.: os comandos de devem ser executados como root) | ||
163 | - | ||
164 | -## Comandos adicionais para configuração do módulo: | ||
165 | - | ||
166 | -# Criação do usuário | ||
167 | -adduser ocr | ||
168 | - | ||
169 | -# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional | ||
170 | -cp ./usr/local/bin/ocr /usr/local/bin | ||
171 | - | ||
172 | -# Auto start (RedHat 6.7 e CentOs 6.9) | ||
173 | -cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr | ||
174 | -mv /etc | ||
175 | -chkconfig --add ocr | ||
176 | -chkconfig --level 2345 ocr on | ||
177 | - | ||
178 | -# Auto start (Ubuntu 14.04) | ||
179 | -cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr | ||
180 | -update-rd.d ocr defaults | ||
181 | - | ||
182 | -# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações | ||
183 | -cd /home/ocr | ||
184 | -tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr | ||
185 | -su | ||
186 | - | ||
187 | -# Copie o pacote para os outros servidores e extraia com: | ||
188 | -cd / | ||
189 | -tar xovzf pkg-ocr.tgz | ||
190 | - | ||
191 | -# Instalando pré-requisitos RUNTIME em servidores adicionais | ||
192 | - | ||
193 | -# Redhat 6.7 e CentOS 6.9 | ||
194 | -yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp | ||
195 | -yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext | ||
196 | - | ||
197 | -# Ubuntu 14.04 | ||
198 | -apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 | ||
199 | -apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 | ||
200 | - | ||
201 | -# Inicie o serviço com | ||
202 | -service ocr start |
@@ -0,0 +1,260 @@ | @@ -0,0 +1,260 @@ | ||
1 | +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes | ||
2 | + | ||
3 | +This script monitors a set of input directories for PDF files. Once a new file is detected, it is processed through Tesseract OCR in order to generate a new file with a hidden searchable text layer. | ||
4 | + | ||
5 | +It may be distributed under the conditions of the LGPL v2.1 license. | ||
6 | + | ||
7 | +Author: Guilherme Chehab | ||
8 | + | ||
9 | +## Version History: | ||
10 | + - 0.1 | ||
11 | + - Initial single server version | ||
12 | + - 0.2 | ||
13 | + - Check if page already has the html hidden layer, if so, ignore it | ||
14 | + - 0.3 | ||
15 | + - Solved issues with various image encoding types | ||
16 | + - 0.4 | ||
17 | + - Added a postnormalization step to ensure all output pdf pages have the same size and orientations as the original files | ||
18 | + - 0.5 | ||
19 | + - Used input file renaming as a way to sync multiple parallel instances, minimizing the risk of the same file being OCRed multiple times. | ||
20 | + - 0.6 | ||
21 | + - Added a default handler for unknown image encoding using jpeg encoding | ||
22 | + - 0.7 | ||
23 | + - Solved an issue with files with more than 1000 pages | ||
24 | + - 1.0 | ||
25 | + - First release version | ||
26 | + - 1.0.1 Solving error when file has no images | ||
27 | + - 1.0.2 Fix bug when counting cores for AMD processors | ||
28 | + - 1.0.3 Added better image type detection | ||
29 | + - 1.0.4 Fix: added ubuntu init script | ||
30 | + - 1.0.4b Add Centos 6.9 install instructions | ||
31 | + - 2.0 | ||
32 | + - PDF/A output, and better compression with ghostscript | ||
33 | + - Rewritten image extraction, processing and transformation process | ||
34 | + - Check if input file is signed, in this case, does not change the file contents | ||
35 | + - Added '-oem 0' option to tesseract (force legacy mode on tesseract 4) | ||
36 | + - Use operating system packages by default | ||
37 | + - Changed paths from external programs, instead of using full paths, uses first match from $PATH | ||
38 | + - Check existence of external programs on path before running | ||
39 | + - Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings | ||
40 | + - Fix: create subpaths on error folder | ||
41 | + - Fix: trying to reduce overhead on temporary folder | ||
42 | + | ||
43 | +## TODO: | ||
44 | + - Change get_imgs and OCR processing to enable pages with more than one image -- it would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them differently but does not treat it adequately -- shall require better handling of the PDF's internal structure | ||
45 | + - Review poppler and cpdf install instructions | ||
46 | + - Add better handling of vectorized and non scanned pdf files | ||
47 | + - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current scaling, cropping and rotation handlers | ||
48 | + - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- added function to analyse image color histogram -> just need to add option to convert it to B&W. | ||
49 | + - Move all parameters to config file | ||
50 | + - Add some job control web interface | ||
51 | + - Add end user interface to submit files through web | ||
52 | + - Add check external programs version requirements before running | ||
53 | + | ||
54 | +## BUGS: | ||
55 | + - When an image is of type stencil or encoding image, cropping information is lost and the page is shown differently from the original; this is due to using pdftoppm instead of pdfimages | ||
56 | + | ||
57 | +## Requirements: | ||
58 | + - Perl 5.10.1, com os seguintes módulos: | ||
59 | + - File::Find::Rule | ||
60 | + - File::Basename | ||
61 | + - File::Copy | ||
62 | + - File::Path | ||
63 | + - File::Touch | ||
64 | + - Sys::Syslog | ||
65 | + - Sys::Hostname | ||
66 | + - IPC::Open3 | ||
67 | + - IO::Select | ||
68 | + - POSIX | ||
69 | + - Tesseract-ocr 3.05, com dicionários inglês e português | ||
70 | + - Pdftk 2.02 | ||
71 | + - Poppler-utils 0.42.0 | ||
72 | + - Cpdf 2.1 | ||
73 | + - ImageMagick 6.7.2-7 | ||
74 | + - Ghostscript 9.18 | ||
75 | + | ||
76 | +Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes pode comprometer o correto funcionamento do sistema. | ||
77 | + | ||
78 | +Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento. | ||
79 | + | ||
80 | +Esse arquivo contém informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes. | ||
81 | + | ||
82 | +ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado. | ||
83 | + | ||
84 | +### Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr': | ||
85 | + | ||
86 | +- @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script | ||
87 | +- @SUB_DIRS: Subdiretórios de entrada, saída, backup dos arquivos originais, temporário e de arquivos com erro | ||
88 | +- $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2) | ||
89 | +- $MAX_PGS: Número máximo de páginas que podem ser processadas simultaneamente por arquivo de entrada (default: no. de CPUs) | ||
90 | + | ||
91 | +Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS. | ||
92 | + | ||
93 | +Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos. | ||
94 | + | ||
95 | +A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página. | ||
96 | + | ||
97 | +Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS. | ||
98 | + | ||
99 | + | ||
100 | +# Container Docker | ||
101 | + | ||
102 | + O OCR-Server também está disponível como um container Docker, permitindo o rápido provisionamento da solução em ambiente de produção. Todos os procedimentos para construção da imagem do container podem ser encontrados no arquivo Dockerfile. | ||
103 | + | ||
104 | + Para execução do serviço, basta que o Docker esteja instalado no servidor e executar o seguinte comando: | ||
105 | + | ||
106 | + docker run --name <NOME_CONTAINER> -d -v <DIRETORIO_BASE>:/var/ocr-server guilhermeadc/ocr-server | ||
107 | + | ||
108 | + Onde: | ||
109 | + --name : Nome atribuído à instância do container. Ex: ocr-server | ||
110 | + -d : Indicação para executar o container em background | ||
111 | + -v : Diretório de compartilhamento entre o servidor host e o container. | ||
112 | + O parâmetro <DIRETORIO_BASE> deve ser substituído pelo diretório base para busca de arquivos. | ||
113 | + | ||
114 | + Para visualizar os logs de processamento do serviço, basta executar o seguinte comando: | ||
115 | + docker logs <NOME_CONTAINER> | ||
116 | + | ||
117 | + | ||
118 | +# COMPILAÇÃO dos pré requisitos (obs.: os comandos devem ser executados como root) | ||
119 | + | ||
120 | +Em servidor Ubuntu 16.04, os pacotes padrão (com exceção do CPDF, que não tem no repositório oficial) | ||
121 | +são suficientes para executar o aplicativo, não havendo necessidade de compilar todos, assim é a arquitetura recomendada | ||
122 | + | ||
123 | +Quanto ao CPDF, é possível baixar a versão binária em: https://github.com/coherentgraphics/cpdf-binaries | ||
124 | + | ||
125 | +## Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS | ||
126 | + | ||
127 | + # RedHat 6.7 e Centos 6.9: | ||
128 | + yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip | ||
129 | + yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel libcurl-devel nss-devel | ||
130 | + cd /tmp | ||
131 | + wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm | ||
132 | + rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm | ||
133 | + rm -f msttcore-fonts-2.0-3.noarch.rpm | ||
134 | + | ||
135 | + # Centos 6.9 | ||
136 | + # \_ autoconf-archive | ||
137 | + wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
138 | + rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
139 | + rm autoconf-archive-2012.04.07-7.3.noarch.rpm | ||
140 | + # \_ GCC 4.8 | ||
141 | + wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo | ||
142 | + yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj | ||
143 | + | ||
144 | + # Ubuntu 14.04 Server: | ||
145 | + apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 | ||
146 | + apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libcurl-dev libnss3-dev | ||
147 | + apt-get install ttf-mscorefonts-installer | ||
148 | + | ||
149 | + # Ambas plataformas: | ||
150 | + cd /usr/local/src | ||
151 | + | ||
152 | + for i in \ | ||
153 | + https://github.com/tesseract-ocr/langdata.git \ | ||
154 | + https://github.com/DanBloomberg/leptonica.git \ | ||
155 | + https://github.com/libav/libav.git \ | ||
156 | + https://github.com/tesseract-ocr/tessdata.git \ | ||
157 | + https://github.com/tesseract-ocr/tesseract.git \ | ||
158 | + git://git.freedesktop.org/git/poppler/poppler.git \ | ||
159 | + git://git.freedesktop.org/git/poppler/test.git \ | ||
160 | + https://github.com/Flameeyes/unpaper.git \ | ||
161 | + https://github.com/ocaml/ocaml.git \ | ||
162 | + https://gitlab.camlcity.org/gerd/lib-findlib.git \ | ||
163 | + https://github.com/johnwhitington/camlpdf.git \ | ||
164 | + https://github.com/johnwhitington/cpdf-source.git \ | ||
165 | + http://git.ghostscript.com/ghostpdl.git \ | ||
166 | + ; do git clone $i; done | ||
167 | + | ||
168 | + wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip | ||
169 | + unzip pdftk-2.02-src.zip | ||
170 | + rm -f pdftk-2.02-src.zip | ||
171 | + | ||
172 | + # pdftk, versão 2.02 ou superior | ||
173 | + cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../.. | ||
174 | + | ||
175 | + # Ghostscript 9.18 ou superior | ||
176 | + #wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.21.tar.gz | ||
177 | + #tar xvozf ghostscript-9.21.tar.gz | ||
178 | + #rm -f ghostscript-9.21.tar.gz | ||
179 | + #cd ghostscript-9.21 | ||
180 | + cd ghostpdl | ||
181 | + ./autogen.sh; ./configure | ||
182 | + make all install | ||
183 | + cd .. | ||
184 | + | ||
185 | + # Centos 6.9 | ||
186 | + # \_ Cria um novo shell usando o GCC 4.8 por default | ||
187 | + scl enable devtoolset-2 bash | ||
188 | + | ||
189 | + # Tesseract, versão 3.05-dev ou superior | ||
190 | + # Bibliotecas para o Tesseract: Leptonica e Libav | ||
191 | + cd leptonica && ./autobuild && ./configure && make all install && cd .. | ||
192 | + | ||
193 | + # Para compilação do Tesseract após a compilação do leptonica | ||
194 | + export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ | ||
195 | + | ||
196 | + cd libav && ./configure --enable-sram && make all install && cd .. | ||
197 | + | ||
198 | + # Tesseract | ||
199 | + cd tesseract && ./autogen.sh && ./configure && make all install && cd .. | ||
200 | + cp -avR tessdata/* /usr/local/share/tessdata/ | ||
201 | + | ||
202 | + # cpdf, versão 2.1 ou superior | ||
203 | + cd ocaml && ./configure && make world.opt && make install && cd .. | ||
204 | + mkdir -p /usr/local/man/man5 | ||
205 | + # lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente | ||
206 | + cd lib-findlib && ./configure && make all && make install && cd .. | ||
207 | + cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd .. | ||
208 | + cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd .. | ||
209 | + | ||
210 | + # poppler-utils, versão 0.42.0 ou superior | ||
211 | + cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && cd .. | ||
212 | + | ||
213 | + # Centos 6.9 | ||
214 | + # \_ Termina o shell usando o GCC 4.8 por default | ||
215 | + exit | ||
216 | + | ||
217 | + | ||
218 | +## Comandos adicionais para configuração do módulo: | ||
219 | + | ||
220 | + # Criação do usuário | ||
221 | + adduser ocr | ||
222 | + | ||
223 | + # Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional | ||
224 | + cp ./usr/local/bin/ocr /usr/local/bin | ||
225 | + | ||
226 | + # Auto start (RedHat 6.7 e CentOs 6.9) | ||
227 | + cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr | ||
228 | + mv /etc | ||
229 | + chkconfig --add ocr | ||
230 | + chkconfig --level 2345 ocr on | ||
231 | + | ||
232 | + # Auto start (Ubuntu 14.04) | ||
233 | + cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr | ||
234 | + update-rc.d ocr defaults | ||
235 | + | ||
236 | + # Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações | ||
237 | + cd /home/ocr | ||
238 | + tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr | ||
239 | + su | ||
240 | + | ||
241 | +# INSTALAÇÃO (obs.: os comandos devem ser executados como root) | ||
242 | + # Criação do usuário | ||
243 | + adduser ocr | ||
244 | + | ||
245 | + # Copie o pacote para os outros servidores e extraia com: | ||
246 | + cd / | ||
247 | + tar xovzf pkg-ocr.tgz | ||
248 | + | ||
249 | + # Instalando pré-requisitos RUNTIME em servidores adicionais | ||
250 | + | ||
251 | + # Redhat 6.7 e CentOS 6.9 | ||
252 | + yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp ghostscript | ||
253 | + yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext | ||
254 | + | ||
255 | + # Ubuntu 14.04 | ||
256 | + apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 | ||
257 | + apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 ghostscript | ||
258 | + | ||
259 | +# Inicie o serviço com | ||
260 | + service ocr start |
@@ -0,0 +1,17 @@ | @@ -0,0 +1,17 @@ | ||
1 | +#!/usr/bin/env bash | ||
2 | + | ||
3 | +# Inicializa serviço de log | ||
4 | +/etc/init.d/rsyslog start | ||
5 | + | ||
6 | +# Cria estrutura de pastas para monitoramento de arquivos | ||
7 | +mkdir -p /var/ocr-server/ | ||
8 | +mkdir -p /var/ocr-server/Entrada | ||
9 | +mkdir -p /var/ocr-server/Saida | ||
10 | +mkdir -p /var/ocr-server/Originais_Processados | ||
11 | +mkdir -p /var/ocr-server/Erro | ||
12 | +chmod -R 777 /var/ocr-server | ||
13 | + | ||
14 | +# Iniciar serviço do OCR-Server | ||
15 | +service ocr start | ||
16 | + | ||
17 | +tail -f /var/log/syslog | ||
0 | \ No newline at end of file | 18 | \ No newline at end of file |
usr/local/bin/ocr
1 | -#! /usr/bin/perl -w | 1 | +#!/usr/bin/perl -w |
2 | # | 2 | # |
3 | -# OCR Server 1.0.4 - (c) Agencia Nacional de Telecomunicacoes | 3 | +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes |
4 | # | 4 | # |
5 | # This script monitors a set of input directories for PDF files | 5 | # This script monitors a set of input directories for PDF files |
6 | # once a new file is detected, it is processed through tesseract OCR | 6 | # once a new file is detected, it is processed through tesseract OCR |
@@ -24,15 +24,38 @@ | @@ -24,15 +24,38 @@ | ||
24 | # 1.0.1 Solving error when file has no images | 24 | # 1.0.1 Solving error when file has no images |
25 | # 1.0.2 Fix bug when counting cores for AMD processors | 25 | # 1.0.2 Fix bug when counting cores for AMD processors |
26 | # 1.0.3 Added better image type detection | 26 | # 1.0.3 Added better image type detection |
27 | -# 1.0.4 Fix: added ubuntu init script | 27 | +# 1.0.4 Fix: added ubuntu init script |
28 | +# 1.0.4b Add Centos 6.9 install instructions | ||
29 | +# 2.0 PDF/A output, and better compression with ghostscript --> for this to work, Tesseract 4.0 is | ||
30 | +# strongly recommended | ||
31 | +# Rewritten image extraction, processing and transformation process | ||
32 | +# Check if input file is signed, in this case, does not change the file contents | ||
33 | +# Added '--oem 0' option to tesseract (force legacy mode on tesseract 4) | ||
34 | +# Use operating system packages by default | ||
35 | +# Changed paths from external programs, instead of using full paths, uses first match from $PATH | ||
36 | +# Check existence of external programs on path before running | ||
37 | +# Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings | ||
38 | +# Fix: create subpaths on error folder | ||
39 | +# Fix: trying to reduce overhead on temporary folder | ||
28 | # | 40 | # |
29 | # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it | 41 | # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it |
30 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | ||
31 | -# diferently but does not treat it adequately | 42 | +# would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them |
43 | +# differently but does not treat it adequately -- shall require better handling of the PDF's internal structure | ||
32 | # - Review poppler and cpdf install instructions | 44 | # - Review poppler and cpdf install instructions |
33 | # - Add better handling of vectorized and non scanned pdf files | 45 | # - Add better handling of vectorized and non scanned pdf files |
34 | -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | ||
35 | -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible | 46 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current |
47 | +# scaling, cropping and rotation handlers | ||
48 | +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- | ||
49 | +# added function to analyse image color histogram -> just need to add option to convert it to B&W. | ||
50 | +# - Move all parameters to config file | ||
51 | +# - Add some job control web interface | ||
52 | +# - Add end user interface to submit files through web | ||
53 | +# - Add check external programs version requirements before running | ||
54 | +# | ||
55 | +# BUGS: - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than | ||
56 | +# original, this is due to using pdftoppm instead of pdfimages | ||
57 | +# - Although not properly a BUG, in the new version, the addition of a step to convert to PDF/A and other evolutions | ||
58 | +# increased significantly the time to OCR a page, from a mean time of 1 secs/page to 3 secs/page on a 16 core server | ||
36 | # | 59 | # |
37 | # Check software requirements in the comments below | 60 | # Check software requirements in the comments below |
38 | # | 61 | # |
@@ -55,7 +78,7 @@ use IPC::Open3; | @@ -55,7 +78,7 @@ use IPC::Open3; | ||
55 | use IO::Select; | 78 | use IO::Select; |
56 | 79 | ||
57 | my $DEBUG = 0; | 80 | my $DEBUG = 0; |
58 | -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); | 81 | +my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); |
59 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; | 82 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; |
60 | 83 | ||
61 | my $USER = 'ocr'; | 84 | my $USER = 'ocr'; |
@@ -63,39 +86,48 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca | @@ -63,39 +86,48 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca | ||
63 | 86 | ||
64 | # Command dependencies | 87 | # Command dependencies |
65 | 88 | ||
66 | -# depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher | ||
67 | -my $TESSERACT = '/usr/local/bin/tesseract -l por+eng'; | 89 | +# depends on tesseract-ocr and tesseract-ocr-por 3.05-dev or higher -- for pdf/a Tesseract 4.0 is recommended |
90 | +my $TESSERACT = 'tesseract --oem 0'; # if Tesseract >= 4.0 | ||
91 | +#my $TESSERACT = 'tesseract'; # if Tesseract < 4.0 | ||
68 | 92 | ||
69 | # Depends on pdftk 2.02 or higher | 93 | # Depends on pdftk 2.02 or higher |
70 | -my $PDFTK = '/usr/local/bin/pdftk'; | 94 | +my $PDFTK = 'pdftk'; |
71 | 95 | ||
72 | # Depends on poppler-utils 0.42.0 or higher | 96 | # Depends on poppler-utils 0.42.0 or higher |
73 | -#my $PDINFO = '/usr/local/bin/pdfinfo'; | ||
74 | -my $PDFFONTS = '/usr/local/bin/pdffonts'; | ||
75 | -my $PDFIMAGES = '/usr/local/bin/pdfimages'; | ||
76 | -my $PDFTOPPM = '/usr/local/bin/pdftoppm'; | 97 | +my $PDFFONTS = 'pdffonts'; |
98 | +my $PDFIMAGES = 'pdfimages'; | ||
99 | +my $PDFTOPPM = 'pdftoppm'; | ||
100 | +my $PDFUNITE = 'pdfunite'; | ||
101 | +my $PDFSIG = 'pdfsig'; | ||
77 | 102 | ||
78 | # Depends on cpdf 2.1 or higher | 103 | # Depends on cpdf 2.1 or higher |
79 | -my $CPDF = '/usr/local/bin/cpdf'; | 104 | +my $CPDF = 'cpdf'; |
105 | + | ||
106 | +# Depends on Ghostscript 9.18 | ||
107 | +my $GS = 'gs'; | ||
80 | 108 | ||
81 | ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner | 109 | ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner |
82 | -my $CONVERT = '/usr/bin/convert'; | 110 | +my $CONVERT = 'convert'; |
83 | 111 | ||
84 | # If it is needed further filtering | 112 | # If it is needed further filtering |
85 | #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; | 113 | #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; |
86 | 114 | ||
87 | -my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', | ||
88 | - '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); | 115 | +#my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', |
116 | +# '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); | ||
117 | + | ||
118 | +my @BASE_DIRS = ('/var/ocr-server/'); | ||
89 | 119 | ||
90 | my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' ); | 120 | my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' ); |
91 | 121 | ||
92 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); | 122 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); |
93 | -%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); | 123 | +%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG==2); |
94 | 124 | ||
95 | # Safeguard in case cpuinfo has not correctly identified the number of CPUs | 125 | # Safeguard in case cpuinfo has not correctly identified the number of CPUs |
96 | $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; | 126 | $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; |
97 | 127 | ||
98 | -$ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; | 128 | +$ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin'; |
129 | +$ENV{'IFS'} = '\t\n'; | ||
130 | + | ||
99 | my ($host) = split/\./,hostname; | 131 | my ($host) = split/\./,hostname; |
100 | 132 | ||
101 | use vars qw/*name *dir *prune/; | 133 | use vars qw/*name *dir *prune/; |
@@ -107,14 +139,15 @@ sub main; | @@ -107,14 +139,15 @@ sub main; | ||
107 | sub get_pages; | 139 | sub get_pages; |
108 | sub get_rotation; | 140 | sub get_rotation; |
109 | sub get_res; | 141 | sub get_res; |
110 | -sub is_ocred; | ||
111 | sub is_locked_ex; | 142 | sub is_locked_ex; |
112 | 143 | ||
113 | 144 | ||
114 | my $expr = 'use POSIX qw(setsid)'; | 145 | my $expr = 'use POSIX qw(setsid)'; |
115 | 146 | ||
116 | my ($dumb1, $dumb2, $uid) = getpwnam ($USER); | 147 | my ($dumb1, $dumb2, $uid) = getpwnam ($USER); |
117 | -setuid ($uid) or warn "Cant set uid $uid"; | 148 | +if (defined $uid) { |
149 | + setuid ($uid) or warn "Cant set uid $uid"; | ||
150 | +} | ||
118 | 151 | ||
119 | $SIG{__DIE__} = 'DEFAULT'; | 152 | $SIG{__DIE__} = 'DEFAULT'; |
120 | $SIG{__WARN__} = \&die_when_called; | 153 | $SIG{__WARN__} = \&die_when_called; |
@@ -126,6 +159,11 @@ if ($@) { | @@ -126,6 +159,11 @@ if ($@) { | ||
126 | chdir('/') or die "$0: cannot chdir '/': $!\n"; | 159 | chdir('/') or die "$0: cannot chdir '/': $!\n"; |
127 | open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n"; | 160 | open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n"; |
128 | 161 | ||
162 | +foreach my $exec ( $TESSERACT, $PDFTK, $PDFFONTS, $PDFIMAGES, $PDFSIG, $CPDF, $GS, $CONVERT) { | ||
163 | + die "Error: $exec not found on path: $ENV{PATH}, check dependencies\n" if ( `which $exec | wc -l ` == 0); | ||
164 | +} | ||
165 | + | ||
166 | + | ||
129 | foreach my $DIR (@BASE_DIRS) { | 167 | foreach my $DIR (@BASE_DIRS) { |
130 | 168 | ||
131 | defined(my $pid = fork) or die "$0: cannot fork: $!\n"; | 169 | defined(my $pid = fork) or die "$0: cannot fork: $!\n"; |
@@ -135,7 +173,7 @@ foreach my $DIR (@BASE_DIRS) { | @@ -135,7 +173,7 @@ foreach my $DIR (@BASE_DIRS) { | ||
135 | main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR}); | 173 | main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR}); |
136 | exit 0; | 174 | exit 0; |
137 | last; | 175 | last; |
138 | - } | 176 | + } |
139 | } | 177 | } |
140 | 178 | ||
141 | exit 0; | 179 | exit 0; |
@@ -157,7 +195,7 @@ sub main { | @@ -157,7 +195,7 @@ sub main { | ||
157 | # remove .tmp file | 195 | # remove .tmp file |
158 | unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) ); | 196 | unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) ); |
159 | 197 | ||
160 | - # Rename files that were in 'processig' back | 198 | + # Rename files that were in 'processing' state back |
161 | foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) { | 199 | foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) { |
162 | my $old_name = $file; | 200 | my $old_name = $file; |
163 | $old_name =~ s/\.${host}\.processing$//g; | 201 | $old_name =~ s/\.${host}\.processing$//g; |
@@ -177,12 +215,14 @@ sub main { | @@ -177,12 +215,14 @@ sub main { | ||
177 | # Main loop | 215 | # Main loop |
178 | while ( 1 ) { | 216 | while ( 1 ) { |
179 | select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced | 217 | select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced |
218 | + | ||
180 | $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} )); | 219 | $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} )); |
181 | print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in; | 220 | print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in; |
182 | $count = scalar keys %files_in; | 221 | $count = scalar keys %files_in; |
183 | - foreach my $file (keys %files_in) { | ||
184 | 222 | ||
185 | - next if ( glob ("$file.*.tmp")); | 223 | + foreach my $file (sort { ((-f $a) ? (stat $a)[9] : 0) <=> ((-f $b) ? (stat $b)[9] : 0)} keys %files_in ) { |
224 | + | ||
225 | + next if ( glob ("\"$file.*.tmp\"")); | ||
186 | 226 | ||
187 | select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds | 227 | select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds |
188 | next if (!defined $files_in{$file}); # continue only if it is still valid | 228 | next if (!defined $files_in{$file}); # continue only if it is still valid |
@@ -255,7 +295,7 @@ sub ocr { | @@ -255,7 +295,7 @@ sub ocr { | ||
255 | remove_tree ($tmpdir,{ error=> \my $dumb }); | 295 | remove_tree ($tmpdir,{ error=> \my $dumb }); |
256 | unlink ("$in_file.$host.tmp"); | 296 | unlink ("$in_file.$host.tmp"); |
257 | move ( "$in_file.$host.processing", $in_file); | 297 | move ( "$in_file.$host.processing", $in_file); |
258 | - exit 0; | 298 | + exit 1; |
259 | }; | 299 | }; |
260 | 300 | ||
261 | my $out_path = $in_path; | 301 | my $out_path = $in_path; |
@@ -271,7 +311,7 @@ sub ocr { | @@ -271,7 +311,7 @@ sub ocr { | ||
271 | my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); | 311 | my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); |
272 | 312 | ||
273 | print "\twritting to $out_file\n" if $DEBUG; | 313 | print "\twritting to $out_file\n" if $DEBUG; |
274 | - | 314 | + |
275 | my $stime = time; | 315 | my $stime = time; |
276 | my %pids; | 316 | my %pids; |
277 | 317 | ||
@@ -291,8 +331,26 @@ sub ocr { | @@ -291,8 +331,26 @@ sub ocr { | ||
291 | remove_tree ($tmpdir,{ error=> \my $dumb }); | 331 | remove_tree ($tmpdir,{ error=> \my $dumb }); |
292 | unlink ("$in_file.$host.tmp"); | 332 | unlink ("$in_file.$host.tmp"); |
293 | move ( "$in_file.$host.processing", $in_file); | 333 | move ( "$in_file.$host.processing", $in_file); |
334 | + print "Error: cannot copy $in_file to temp dir \n" if $DEBUG; | ||
335 | + syslog ("error","cannot copy $in_file to temp dir") if !$DEBUG; | ||
336 | + exit 1; | ||
294 | }; | 337 | }; |
295 | 338 | ||
339 | + # Check if file was signed | ||
340 | + if (get_sign($tmp_file)) { | ||
341 | + if (!copy ("$in_file.$host.processing", $proc_file)) { | ||
342 | + remove_tree ($tmpdir,{ error=> \my $dumb }); | ||
343 | + unlink ("$in_file.$host.tmp"); | ||
344 | + move ( "$in_file.$host.processing", $in_file); | ||
345 | + }; | ||
346 | + move ("$in_file.$host.processing", $out_file); | ||
347 | + unlink ("$in_file.$host.tmp"); | ||
348 | + print "OCR processed: $in_file not OCRed due to having a signature within" if $DEBUG; | ||
349 | + syslog ("info","OCR processed: $in_file not OCRed due to having a signature within") if !$DEBUG; | ||
350 | + | ||
351 | + exit 0; | ||
352 | + } | ||
353 | + | ||
296 | # Extract pages | 354 | # Extract pages |
297 | ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf"); | 355 | ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf"); |
298 | if ($DEBUG) { | 356 | if ($DEBUG) { |
@@ -301,12 +359,13 @@ sub ocr { | @@ -301,12 +359,13 @@ sub ocr { | ||
301 | print "\t\t\t$_" for @err ; | 359 | print "\t\t\t$_" for @err ; |
302 | }; | 360 | }; |
303 | 361 | ||
362 | + my ($pages, @pg_w, @pg_h, @pg_r, @pg_crop_x1, @pg_crop_y1, @pg_crop_x2, @pg_crop_y2); | ||
363 | + $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r, \@pg_crop_x1, \@pg_crop_y1, \@pg_crop_x2, \@pg_crop_y2); | ||
304 | 364 | ||
305 | - my ($pages, @pg_w, @pg_h, @pg_r); | ||
306 | - $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r); | 365 | + my ($imgs,@page_img, @img_w, @img_h, @img_t, @img_xppi, @img_yppi); |
366 | + $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t, \@img_xppi, \@img_yppi); | ||
307 | 367 | ||
308 | - my ($imgs,@page_img, @img_w, @img_h, @img_t); | ||
309 | - $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t); | 368 | + unlink ($tmp_file) if (!$DEBUG); |
310 | 369 | ||
311 | for ( my $i=0; $i< $pages; $i++ ) { | 370 | for ( my $i=0; $i< $pages; $i++ ) { |
312 | my $pg = sprintf ("pg_%06d", $i+1); | 371 | my $pg = sprintf ("pg_%06d", $i+1); |
@@ -333,25 +392,29 @@ sub ocr { | @@ -333,25 +392,29 @@ sub ocr { | ||
333 | if (! defined $img_t[$i] ) { | 392 | if (! defined $img_t[$i] ) { |
334 | move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); | 393 | move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); |
335 | print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG; | 394 | print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG; |
336 | - exit 0; | 395 | + exit -1; |
337 | } | 396 | } |
338 | 397 | ||
339 | - print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i]\n" if $DEBUG; | 398 | + print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i] " if $DEBUG; |
399 | + print "(cropbox: $pg_crop_x1[$i] x $pg_crop_y1[$i] - $pg_crop_x2[$i] x $pg_crop_y2[$i])\n" if (defined $pg_crop_x1[$i] && $DEBUG); | ||
400 | + print "\n" if ($DEBUG); | ||
340 | 401 | ||
402 | + # Extract images from page, since 2.0 uses png lossless format regardless of original format or depth | ||
341 | undef $cmd; | 403 | undef $cmd; |
342 | 404 | ||
343 | - if ($img_t[$i] eq "gray") { | ||
344 | - $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | 405 | + # Use PDFIMAGES and JPEG by default |
406 | + $cmd = "${PDFIMAGES} -j \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
407 | + | ||
408 | + if ($img_t[$i] eq "stencil") { | ||
409 | + $cmd = "${PDFTOPPM} -tiff -tiffcompression deflate -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
345 | } | 410 | } |
346 | 411 | ||
347 | - if ($img_t[$i] eq "rgb") { | ||
348 | - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
349 | - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM | 412 | + if ($img_t[$i] eq "gray") { |
413 | + $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
350 | } | 414 | } |
351 | 415 | ||
352 | - if (!defined $cmd) { | ||
353 | - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
354 | - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM | 416 | + if ($img_t[$i] !~ /gray|rgb|stencil/) { |
417 | + $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | ||
355 | } | 418 | } |
356 | 419 | ||
357 | ($exit,$cmd,@out,@err) = exec_cmd($cmd); | 420 | ($exit,$cmd,@out,@err) = exec_cmd($cmd); |
@@ -362,7 +425,13 @@ sub ocr { | @@ -362,7 +425,13 @@ sub ocr { | ||
362 | }; | 425 | }; |
363 | 426 | ||
364 | # Process each resulting image for page pdf | 427 | # Process each resulting image for page pdf |
365 | - my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif)/i , in => ${tmpdir} )) ; | 428 | + my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif|tiff|jpeg|jp2|jb2|png)/i , in => ${tmpdir} )) ; |
429 | + | ||
430 | + if (scalar @images == 0) { | ||
431 | + move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); | ||
432 | + print "\t\t${in_file}: ".(${i}+1)." / $pages: Page was not exported as a tesseract supported format -- not OCRing\n" if $DEBUG; | ||
433 | + exit 0; | ||
434 | + } | ||
366 | 435 | ||
367 | foreach my $image (@images) { | 436 | foreach my $image (@images) { |
368 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; | 437 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; |
@@ -378,43 +447,65 @@ sub ocr { | @@ -378,43 +447,65 @@ sub ocr { | ||
378 | print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; | 447 | print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; |
379 | } | 448 | } |
380 | } | 449 | } |
381 | - | ||
382 | - # Check if page was rotated | ||
383 | - if ($pg_r[$i]) { | ||
384 | - print "\t\t\t${image} unrotate: $pg_r[$i] graus ".(${i}+1)." / $pages\n" if $DEBUG; | ||
385 | - ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate $pg_r[$i] \"$image\""); | 450 | + |
451 | + # Check if page was rotated and extracted with pdftoppm | ||
452 | + if ($cmd =~ /\Q$PDFTOPPM/ && $pg_r[$i]) { | ||
453 | + print "\t\t\t${image} unrotate: -$pg_r[$i] degs ".(${i}+1)." / $pages\n" if $DEBUG; | ||
454 | + ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate ". (360 - $pg_r[$i])." \"$image\""); | ||
386 | if ($DEBUG) { | 455 | if ($DEBUG) { |
387 | print "\t\t\t${image} -> $cmd: $exit\n"; | 456 | print "\t\t\t${image} -> $cmd: $exit\n"; |
388 | print "\t\t\t\t$_" for @out ; | 457 | print "\t\t\t\t$_" for @out ; |
389 | print "\t\t\t\t$_" for @err ; | 458 | print "\t\t\t\t$_" for @err ; |
390 | }; | 459 | }; |
391 | } | 460 | } |
392 | - | 461 | + |
393 | # Filter ppm images, if needed | 462 | # Filter ppm images, if needed |
394 | 463 | ||
395 | # OCR ppm images to pdf pages | 464 | # OCR ppm images to pdf pages |
396 | - ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} \"${image}\" \"${image}\" pdf"); | 465 | + ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} -l por+eng \"${image}\" \"${image}\" pdf"); |
397 | if ($DEBUG) { | 466 | if ($DEBUG) { |
398 | print "\t\t\t${image} -> $cmd: $exit\n"; | 467 | print "\t\t\t${image} -> $cmd: $exit\n"; |
399 | print "\t\t\t\t$_" for @out ; | 468 | print "\t\t\t\t$_" for @out ; |
400 | print "\t\t\t\t$_" for @err ; | 469 | print "\t\t\t\t$_" for @err ; |
401 | }; | 470 | }; |
471 | + unlink ("$image") if (!$DEBUG); | ||
402 | 472 | ||
403 | - # Scale to fit pdf | ||
404 | - ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); | 473 | + # Scale, crop and rotate to fit pdf |
474 | + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); | ||
405 | if ($DEBUG) { | 475 | if ($DEBUG) { |
406 | print "\t\t\t${image} -> $cmd: $exit\n"; | 476 | print "\t\t\t${image} -> $cmd: $exit\n"; |
407 | print "\t\t\t\t$_" for @out ; | 477 | print "\t\t\t\t$_" for @out ; |
408 | print "\t\t\t\t$_" for @err ; | 478 | print "\t\t\t\t$_" for @err ; |
409 | }; | 479 | }; |
480 | + unlink ("$image.pdf") if (!$DEBUG); | ||
410 | 481 | ||
482 | + if (defined $pg_crop_x1[$i]) { | ||
483 | + # adjust cropbox | ||
484 | + ($pg_crop_x1[$i], $pg_crop_y1[$i],$pg_crop_x2[$i],$pg_crop_y2[$i]) = ( | ||
485 | + ($pg_crop_x1[$i]<$pg_crop_x2[$i]?$pg_crop_x1[$i]:$pg_crop_x2[$i]), | ||
486 | + ($pg_crop_y1[$i]<$pg_crop_y2[$i]?$pg_crop_y1[$i]:$pg_crop_y2[$i]), | ||
487 | + abs($pg_crop_x2[$i]-$pg_crop_x1[$i]),abs($pg_crop_y2[$i]- $pg_crop_y1[$i]) | ||
488 | + ); | ||
489 | + | ||
490 | + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -crop \"$pg_crop_x1[$i] $pg_crop_y1[$i] $pg_crop_x2[$i] $pg_crop_y2[$i]\" \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf"); | ||
491 | + if ($DEBUG) { | ||
492 | + print "\t\t\t${image} -> $cmd: $exit\n"; | ||
493 | + print "\t\t\t\t$_" for @out ; | ||
494 | + print "\t\t\t\t$_" for @err ; | ||
495 | + }; | ||
496 | + } | ||
497 | + | ||
498 | + if ($pg_r[$i]) { | ||
499 | + ($exit,$cmd, @out,@err) = exec_cmd( "${CPDF} -rotate $pg_r[$i] \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf"); | ||
500 | + if ($DEBUG) { | ||
501 | + print "\t\t\t${image} -> $cmd: $exit\n"; | ||
502 | + print "\t\t\t\t$_" for @out ; | ||
503 | + print "\t\t\t\t$_" for @err ; | ||
504 | + }; | ||
505 | + } | ||
411 | 506 | ||
412 | - unlink ("${tmpdir}/${pg}.pdf") if (!$DEBUG); | ||
413 | - unlink ("$image.pdf") if (!$DEBUG); | ||
414 | - move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}.pdf.old") if ($DEBUG); | ||
415 | - unlink ("$image") if (!$DEBUG); | ||
416 | } | 507 | } |
417 | - exit 0; | 508 | + exit 1; |
418 | } | 509 | } |
419 | } | 510 | } |
420 | 511 | ||
@@ -427,28 +518,51 @@ sub ocr { | @@ -427,28 +518,51 @@ sub ocr { | ||
427 | 518 | ||
428 | if (scalar @new_pages != $pages) { | 519 | if (scalar @new_pages != $pages) { |
429 | print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); | 520 | print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); |
430 | - syslog ("info","OCR: $in_file, number of output pages differ") if (!$DEBUG); | 521 | + syslog ("err","OCR: $in_file, number of output pages differ") if (!$DEBUG); |
431 | unlink "$in_file.$host.tmp"; | 522 | unlink "$in_file.$host.tmp"; |
523 | + make_path ($error_path) if ( ! -d $error_path); | ||
432 | move ("$in_file.$host.processing", $error_file); | 524 | move ("$in_file.$host.processing", $error_file); |
433 | - exit (0); | 525 | + exit (1); |
434 | } | 526 | } |
435 | 527 | ||
436 | - # Merge resulting pdf pages to a single pdf | 528 | + # Merge resulting pdf pages to a single pdf, convert to PDF/A and copy to output |
437 | make_path ($out_path) if ( ! -d $out_path); | 529 | make_path ($out_path) if ( ! -d $out_path); |
438 | unlink $out_file if ( -f $out_file ); | 530 | unlink $out_file if ( -f $out_file ); |
439 | - ($exit, $cmd, @out,@err) = exec_cmd("${PDFTK} \"${tmpdir}\"/pg_*-cpdf.pdf cat output \"${out_file}.tmp\" compress"); | 531 | + |
532 | + chdir (${tmpdir}); | ||
533 | + ($exit, $cmd, @out,@err) = exec_cmd("${GS} -dQUIET -dBATCH -dNOPAUSE -dNOINTERPOLATE -dCompatibilityLevel=1.7 -dNumRenderingThreads=${MAX_PGS} -sDEVICE=pdfwrite -dAutoRotatePages=/None -sColorConversionStrategy=/RGB -sProcessColorModel=DeviceRGB -dAutoFilterColorImages=true -dAutoFilterGrayImages=true -dJPEGQ=95 -dPDFA=2 -dPDFACompatibilityPolicy=1 -sOutputFile=\"${tmp_file}\" pg_*-cpdf.pdf "); | ||
440 | if ($DEBUG) { | 534 | if ($DEBUG) { |
441 | print "\t\t${out_file} -> $cmd: $exit\n"; | 535 | print "\t\t${out_file} -> $cmd: $exit\n"; |
442 | print "\t\t\t$_" for @out ; | 536 | print "\t\t\t$_" for @out ; |
443 | print "\t\t\t$_" for @err ; | 537 | print "\t\t\t$_" for @err ; |
444 | }; | 538 | }; |
539 | + if ($exit) { | ||
540 | + unlink "$in_file.$host.tmp"; | ||
541 | + unlink $out_file; | ||
542 | + make_path ($error_path) if ( ! -d $error_path); | ||
543 | + move ("$in_file.$host.processing", $error_file); | ||
544 | + print "\t\t${out_file} -> Error concatenating pages and converting to PDF/A (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); | ||
545 | + syslog ("err","OCR: $in_file, error concatenating pages and converting to PDF/A") if (!$DEBUG); | ||
546 | + exit (1); | ||
547 | + } | ||
548 | + chdir ("/"); | ||
549 | + | ||
550 | + if (!copy (${tmp_file}, $out_file)) { | ||
551 | + remove_tree ($tmpdir,{ error=> \my $dumb }); | ||
552 | + unlink ("$in_file.$host.tmp"); | ||
553 | + unlink $out_file; | ||
554 | + make_path ($error_path) if ( ! -d $error_path); | ||
555 | + move ("$in_file.$host.processing", $error_file); | ||
556 | + print "Error: cannot copy temp file to $out_file \n" if $DEBUG; | ||
557 | + syslog ("error","cannot copy temp file to $out_file") if !$DEBUG; | ||
558 | + exit 1; | ||
559 | + }; | ||
445 | 560 | ||
446 | make_path ($proc_path) if ( ! -d $proc_path); | 561 | make_path ($proc_path) if ( ! -d $proc_path); |
447 | unlink $proc_file if ( -f $proc_file ); | 562 | unlink $proc_file if ( -f $proc_file ); |
448 | move ("$in_file.$host.processing", $proc_file); | 563 | move ("$in_file.$host.processing", $proc_file); |
449 | move ("${out_file}.tmp", ${out_file}); | 564 | move ("${out_file}.tmp", ${out_file}); |
450 | 565 | ||
451 | - | ||
452 | # Remove temp dir | 566 | # Remove temp dir |
453 | remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG); | 567 | remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG); |
454 | unlink $tmp_file if (!$DEBUG); | 568 | unlink $tmp_file if (!$DEBUG); |
@@ -471,7 +585,7 @@ sub is_ocred { | @@ -471,7 +585,7 @@ sub is_ocred { | ||
471 | } | 585 | } |
472 | 586 | ||
473 | sub get_pages { | 587 | sub get_pages { |
474 | - my ($in_file, $w, $h, $r) = @_; | 588 | + my ($in_file, $w, $h, $r, $x1, $y1, $x2, $y2) = @_; |
475 | 589 | ||
476 | my $pages=0; | 590 | my $pages=0; |
477 | my $i=0; | 591 | my $i=0; |
@@ -485,29 +599,35 @@ sub get_pages { | @@ -485,29 +599,35 @@ sub get_pages { | ||
485 | ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ ); | 599 | ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ ); |
486 | ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ ); | 600 | ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ ); |
487 | ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ ); | 601 | ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ ); |
488 | - ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); | 602 | + ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); |
603 | + ($dumb, @$x1[$i-1], @$y1[$i-1], @$x2[$i-1], @$y2[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaCropRect:/ ); | ||
489 | } | 604 | } |
490 | 605 | ||
491 | return $pages; | 606 | return $pages; |
492 | } | 607 | } |
493 | 608 | ||
494 | sub get_imgs { | 609 | sub get_imgs { |
495 | - my ($in_file, $page_img, $w, $h, $t) = @_; | ||
496 | - my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc); | 610 | + my ($in_file, $page_img, $w, $h, $t, $x_ppi, $y_ppi) = @_; |
611 | + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi ); | ||
497 | 612 | ||
498 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); | 613 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); |
614 | + $i = 0; | ||
499 | 615 | ||
500 | foreach my $line (@lines) { | 616 | foreach my $line (@lines) { |
501 | chomp $line; | 617 | chomp $line; |
502 | $line =~ s/^ {1,}//; | 618 | $line =~ s/^ {1,}//; |
503 | - if ( $line =~ /image|mask/ ) { | ||
504 | - ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line; | 619 | + if ( $line !~ /^page|^----/ ) { |
620 | + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi) = split / {1,}/,$line; | ||
505 | @$page_img[$page-1]=$i; | 621 | @$page_img[$page-1]=$i; |
506 | @$w[$page-1] = $width; | 622 | @$w[$page-1] = $width; |
507 | @$h[$page-1] = $height; | 623 | @$h[$page-1] = $height; |
508 | @$t[$page-1] = "rgb"; # Default is color | 624 | @$t[$page-1] = "rgb"; # Default is color |
509 | - @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | ||
510 | @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); | 625 | @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); |
626 | + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | ||
627 | + @$t[$page-1] = ( $type eq "stencil" ? $type : @$t[$page-1]); | ||
628 | + @$t[$page-1] = ( $enc eq "image" ? $enc : @$t[$page-1]); | ||
629 | + @$x_ppi[$page-1] = $xppi; | ||
630 | + @$y_ppi[$page-1] = $yppi; | ||
511 | } | 631 | } |
512 | } | 632 | } |
513 | return $i+1; | 633 | return $i+1; |
@@ -542,6 +662,19 @@ sub get_res { | @@ -542,6 +662,19 @@ sub get_res { | ||
542 | return ($res_x,$res_y); | 662 | return ($res_x,$res_y); |
543 | } | 663 | } |
544 | 664 | ||
665 | +sub get_sign { | ||
666 | + my ($in_file) = @_; | ||
667 | + my @lines = `${PDFSIG} \"${in_file}\" 2>/dev/null`; | ||
668 | + | ||
669 | + foreach (@lines) { | ||
670 | + chomp; | ||
671 | + if ( $_ =~ /^Signature/ ) { | ||
672 | + return 1; | ||
673 | + } | ||
674 | + } | ||
675 | + return 0; | ||
676 | +} | ||
677 | + | ||
545 | sub is_locked_ex { | 678 | sub is_locked_ex { |
546 | my ($path) = @_; | 679 | my ($path) = @_; |
547 | 680 |
workflow.pdf
No preview for this file type
workflow.vsd
No preview for this file type