Commit d2b74559c9a3595de14ad8ce6ba843d92f505213

Authored by Nei Jobson da Costa Carneiro
1 parent 78ec197b

Pré versão 2.0 a ser liberada - PARA O CANTONI INCLUIR O Container Docker

INSTALL.txt
... ... @@ -1,202 +0,0 @@
1   -# OCR Server 1.0.4b - (c) Agencia Nacional de Telecomunicacoees
2   -#
3   -# This script monitors a set of input directories for PDF files
4   -# once a new file is detected, it is processes through tesseract OCR
5   -# in order to generate a new file with a hidden searchable text layer
6   -#
7   -# It may be distributed under the conditions of the LGPL v2.1 license.
8   -#
9   -# Author: Guilherme Chehab
10   -#
11   -# Version History:
12   -# 0.1 Initial single server version
13   -# 0.2 Check if page already has the html hidden layer, if so, ignore it
14   -# 0.3 Solved issues about various image enconding types
15   -# 0.4 Added a postnormalization step to ensure all output pdf pages have
16   -# the same size and orientations as the original files
17   -# 0.5 Used input file renaming as a way to sync multiple parallel instances,
18   -# that way, it is minimized the risk of same file being OCRed multiple times.
19   -# 0.6 Added a default handler for unknown image encoding using jpeg encoding
20   -# 0.7 Solved an issue with files with more than 1000 pages
21   -# 1.0 First release version
22   -# 1.0.1 Solving error when file has no images
23   -# 1.0.2 Fix bug when counting cores for AMD processors
24   -# 1.0.3 Added better image type detection
25   -# 1.0.4 Fix: added ubuntu init script
26   -# 1.0.4b Centos 6.9
27   -#
28   -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it
29   -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
30   -# diferently but does not treat it adequately
31   -# - Review poppler and cpdf install instructions
32   -# - Add better handling of vectorized and non scanned pdf files
33   -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
34   -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible
35   -#
36   -# Check software requirements on the comments bellow
37   -#
38   -# To configure input dirs change @BASE_DIRS and @SUB_DIRS variables
39   -#
40   -#
41   -# O servidor OCR depende dos seguintes componentes:
42   -# - Perl 5.10.1, com seguintes módulos:
43   -# - File::Find::Rule
44   -# - File::Basename
45   -# - File::Copy
46   -# - File::Path
47   -# - File::Touch
48   -# - Sys::Syslog
49   -# - Sys::Hostname
50   -# - IPC::Open3
51   -# - IO::Select
52   -# - POSIX
53   -# - Tesseract-ocr 3.05, com dicionários inglês e português
54   -# - Pdftk 2.02
55   -# - Poppler-utils 0.42.0
56   -# - Cpdf 2.1
57   -# - ImageMagick 6.7.2-7
58   -#
59   -# Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema
60   -# Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.
61   -# Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.
62   -#
63   -## ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.
64   -#
65   -# Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':
66   -#
67   -# @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script
68   -# @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro
69   -# $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)
70   -# $MAX_PGS: Número máximo de páginas que podem ser processadas simultanemante por arquivo de entrada (default: no. de CPUs)
71   -# Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.
72   -# Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.
73   -# A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.
74   -#
75   -# Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.
76   -#
77   -#
78   -# ----------------------- COMPILAÇÃO dos pré requisitos (obs.: os comandos de devem ser executados como root)
79   -#
80   -#
81   -# Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS
82   -#
83   -# RedHat 6.7 e Centos 6.9:
84   -yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip
85   -yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel
86   -cd /tmp
87   -wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm
88   -rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm
89   -rm -f msttcore-fonts-2.0-3.noarch.rpm
90   -
91   -# Centos 6.9
92   -# \_ autoconf-archive
93   -wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm
94   -rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm
95   -rm autoconf-archive-2012.04.07-7.3.noarch.rpm
96   -# \_ GCC 4.8
97   -wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
98   -yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj
99   -
100   -# Ubuntu 14.04 Server:
101   -apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14
102   -apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev
103   -apt-get install ttf-mscorefonts-installer
104   -
105   -# Ambas plataformas:
106   -cd /usr/local/src
107   -
108   -for i in \
109   - https://github.com/tesseract-ocr/langdata.git \
110   - https://github.com/DanBloomberg/leptonica.git \
111   - https://github.com/libav/libav.git \
112   - https://github.com/tesseract-ocr/tessdata.git \
113   - https://github.com/tesseract-ocr/tesseract.git \
114   - git://git.freedesktop.org/git/poppler/poppler.git \
115   - git://git.freedesktop.org/git/poppler/test.git \
116   - https://github.com/Flameeyes/unpaper.git \
117   - https://github.com/ocaml/ocaml.git \
118   - https://gitlab.camlcity.org/gerd/lib-findlib.git \
119   - https://github.com/johnwhitington/camlpdf.git \
120   - https://github.com/johnwhitington/cpdf-source.git \
121   -; do git clone $i; done
122   -
123   -wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
124   -unzip pdftk-2.02-src.zip
125   -rm -f pdftk-2.02-src.zip
126   -
127   -# pdftk, versão 2.02 ou superior
128   -cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..
129   -
130   -# Centos 6.9
131   -# \_ Cria um novo shell usando o GCC 4.8 por default
132   -scl enable devtoolset-2 bash
133   -
134   -# Tesseract, versão 3.05-dev ou superior
135   -# Bibliotecas para o Tesseract: Leptonica e Libav
136   -cd leptonica && ./autobuild && ./configure && make all install && cd ..
137   -
138   -# Para compilação do Tesseract após a compilação do leptonica
139   -export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/
140   -
141   -cd libav && ./configure --enable-sram && make all install && cd ..
142   -
143   -# Tesseract
144   -cd tesseract && ./autogen.sh && ./configure && make all install && cd ..
145   -cp -avR tessdata/* /usr/local/share/tessdata/
146   -
147   -# cpdf, versão 2.1 ou superior
148   -cd ocaml && ./configure && make world.opt && make install && cd ..
149   -mkdir -p /usr/local/man/man5
150   -# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente
151   -cd lib-findlib && ./configure && make all && make install && cd ..
152   -cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..
153   -cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..
154   -
155   -# poppler-utils, versão 0.42.0 ou superior
156   -cd poppler && ./autogen.sh && ./configure && make all install && cd ..
157   -
158   -# Centos 6.9
159   -# \_ Termina o shell usando o GCC 4.8 por default
160   -exit
161   -
162   -# ----------------------- INSTALAÇÃO (obs.: os comandos de devem ser executados como root)
163   -
164   -## Comandos adicionais para configuração do módulo:
165   -
166   -# Criação do usuário
167   -adduser ocr
168   -
169   -# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional
170   -cp ./usr/local/bin/ocr /usr/local/bin
171   -
172   -# Auto start (RedHat 6.7 e CentOs 6.9)
173   -cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr
174   -mv /etc
175   -chkconfig --add ocr
176   -chkconfig --level 2345 ocr on
177   -
178   -# Auto start (Ubuntu 14.04)
179   -cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr
180   -update-rd.d ocr defaults
181   -
182   -# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações
183   -cd /home/ocr
184   -tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr
185   -su
186   -
187   -# Copie o pacote para os outros servidores e extraia com:
188   -cd /
189   -tar xovzf pkg-ocr.tgz
190   -
191   -# Instalando pré-requisitos RUNTIME em servidores adicionais
192   -
193   -# Redhat 6.7 e CentOS 6.9
194   -yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp
195   -yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext
196   -
197   -# Ubuntu 14.04
198   -apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14
199   -apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0
200   -
201   -# Inicie o serviço com
202   -service ocr start
README.md 0 → 100644
... ... @@ -0,0 +1,240 @@
  1 +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes
  2 +
  3 +This script monitors a set of input directories for PDF files. Once a new file is detected, it is processed through Tesseract OCR in order to generate a new file with a hidden searchable text layer.
  4 +
  5 +It may be distributed under the conditions of the LGPL v2.1 license.
  6 +
  7 +Author: Guilherme Chehab
  8 +
  9 +## Version History:
  10 + - 0.1
  11 + - Initial single server version
  12 + - 0.2
  13 + - Check if page already has the html hidden layer, if so, ignore it
  14 + - 0.3
  15 + - Solved issues about various image encoding types
  16 + - 0.4
  17 + - Added a postnormalization step to ensure all output pdf pages have the same size and orientations as the original files
  18 + - 0.5
  19 + - Used input file renaming as a way to sync multiple parallel instances, which minimizes the risk of the same file being OCRed multiple times.
  20 + - 0.6
  21 + - Added a default handler for unknown image encoding using jpeg encoding
  22 + - 0.7
  23 + - Solved an issue with files with more than 1000 pages
  24 + - 1.0
  25 + - First release version
  26 + - 1.0.1 Solving error when file has no images
  27 + - 1.0.2 Fix bug when counting cores for AMD processors
  28 + - 1.0.3 Added better image type detection
  29 + - 1.0.4 Fix: added ubuntu init script
  30 + - 1.0.4b Add Centos 6.9 install instructions
  31 + - 2.0
  32 + - PDF/A output, and better compression with ghostscript
  33 + - Rewritten image extraction, processing and transformation process
  34 + - Check if input file is signed; in this case, the file contents are not changed
  35 + - Added '--oem 0' option to tesseract (force legacy mode on tesseract 4)
  36 + - Use operating system packages by default
  37 + - Changed paths from external programs, instead of using full paths, uses first match from $PATH
  38 + - Check existence of external programs on path before running
  39 + - Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
  40 + - Fix: create subpaths on error folder
  41 + - Fix: trying to reduce overhead on temporary folder
  42 + - TODO:
  43 + - Change get_imgs and OCR processing to enable pages with more than one image -- it would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them differently but does not treat it adequately -- shall require better handling of the PDF's internal structure
  44 + - Review poppler and cpdf install instructions
  45 + - Add better handling of vectorized and non scanned pdf files
  46 + - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current scaling, cropping and rotation handlers
  47 + - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- added function to analyse image color histogram -> just need to add option to convert it to B&W.
  48 + - Move all parameters to config file
  49 + - Add some job control web interface
  50 + - Add end user interface to submit files through web
  51 + - Add check external programs version requirements before running
  52 + - BUGS:
  53 + - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than original, this is due to using pdftoppm instead of pdfimages
  54 +
  55 +## Check software requirements on the comments below
  56 +
  57 +To configure input dirs change @BASE_DIRS and @SUB_DIRS variables
  58 +
  59 +### O servidor OCR depende dos seguintes componentes:
  60 + - Perl 5.10.1, com seguintes módulos:
  61 + - File::Find::Rule
  62 + - File::Basename
  63 + - File::Copy
  64 + - File::Path
  65 + - File::Touch
  66 + - Sys::Syslog
  67 + - Sys::Hostname
  68 + - IPC::Open3
  69 + - IO::Select
  70 + - POSIX
  71 + - Tesseract-ocr 3.05, com dicionários inglês e português
  72 + - Pdftk 2.02
  73 + - Poppler-utils 0.42.0
  74 + - Cpdf 2.1
  75 + - ImageMagick 6.7.2-7
  76 + - Ghostscript 9.18
  77 +
  78 +Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes pode comprometer o correto funcionamento do sistema.
  79 +
  80 +Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.
  81 +
  82 +Esse arquivo contém informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.
  83 +
  84 +ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.
  85 +
  86 +### Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':
  87 +
  88 +- @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script
  89 +- @SUB_DIRS: Subdiretórios de entrada, saída, backup dos arquivos originais, temporário e de arquivos com erro
  90 +- $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)
  91 +- $MAX_PGS: Número máximo de páginas que podem ser processadas simultaneamente por arquivo de entrada (default: no. de CPUs)
  92 +
  93 +Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.
  94 +
  95 +Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.
  96 +
  97 +A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.
  98 +
  99 +Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.
  100 +
  101 +# COMPILAÇÃO dos pré requisitos (obs.: os comandos devem ser executados como root)
  102 +
  103 +Em servidor Ubuntu 16.04, os pacotes padrão (com exceção do CPDF, que não está disponível no repositório oficial)
  104 +são suficientes para executar o aplicativo, não havendo necessidade de compilar os componentes; por isso, essa é a arquitetura recomendada.
  105 +
  106 +Quanto ao CPDF, é possível baixar a versão binária em: https://github.com/coherentgraphics/cpdf-binaries
  107 +
  108 +## Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS
  109 +
  110 +### RedHat 6.7 e Centos 6.9:
  111 + yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip
  112 + yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel libcurl-devel nss-devel
  113 + cd /tmp
  114 + wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm
  115 + rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm
  116 + rm -f msttcore-fonts-2.0-3.noarch.rpm
  117 +
  118 +### Centos 6.9
  119 +# \_ autoconf-archive
  120 + wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm
  121 + rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm
  122 + rm autoconf-archive-2012.04.07-7.3.noarch.rpm
  123 +# \_ GCC 4.8
  124 + wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
  125 + yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj
  126 +
  127 +### Ubuntu 14.04 Server:
  128 + apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14
  129 + apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libcurl-dev libnss3-dev
  130 + apt-get install ttf-mscorefonts-installer
  131 +
  132 +### Ambas plataformas:
  133 + cd /usr/local/src
  134 +
  135 + for i in \
  136 + https://github.com/tesseract-ocr/langdata.git \
  137 + https://github.com/DanBloomberg/leptonica.git \
  138 + https://github.com/libav/libav.git \
  139 + https://github.com/tesseract-ocr/tessdata.git \
  140 + https://github.com/tesseract-ocr/tesseract.git \
  141 + git://git.freedesktop.org/git/poppler/poppler.git \
  142 + git://git.freedesktop.org/git/poppler/test.git \
  143 + https://github.com/Flameeyes/unpaper.git \
  144 + https://github.com/ocaml/ocaml.git \
  145 + https://gitlab.camlcity.org/gerd/lib-findlib.git \
  146 + https://github.com/johnwhitington/camlpdf.git \
  147 + https://github.com/johnwhitington/cpdf-source.git \
  148 + http://git.ghostscript.com/ghostpdl.git \
  149 + ; do git clone $i; done
  150 +
  151 + wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
  152 + unzip pdftk-2.02-src.zip
  153 + rm -f pdftk-2.02-src.zip
  154 +
  155 +# pdftk, versão 2.02 ou superior
  156 +cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..
  157 +
  158 +# Ghostscript 9.18 ou superior
  159 +#wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz
  160 +#tar xvozf ghostscript-9.18.tar.gz
  161 +#rm -f ghostscript-9.18.tar.gz
  162 +#cd ghostscript-9.18
  163 +cd ghostpdl
  164 +./autogen.sh; ./configure
  165 +make all install
  166 +cd ..
  167 +
  168 +# Centos 6.9
  169 +# \_ Cria um novo shell usando o GCC 4.8 por default
  170 +scl enable devtoolset-2 bash
  171 +
  172 +# Tesseract, versão 3.05-dev ou superior
  173 +# Bibliotecas para o Tesseract: Leptonica e Libav
  174 +cd leptonica && ./autobuild && ./configure && make all install && cd ..
  175 +
  176 +# Para compilação do Tesseract após a compilação do leptonica
  177 +export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/
  178 +
  179 +cd libav && ./configure --enable-sram && make all install && cd ..
  180 +
  181 +# Tesseract
  182 +cd tesseract && ./autogen.sh && ./configure && make all install && cd ..
  183 +cp -avR tessdata/* /usr/local/share/tessdata/
  184 +
  185 +# cpdf, versão 2.1 ou superior
  186 +cd ocaml && ./configure && make world.opt && make install && cd ..
  187 +mkdir -p /usr/local/man/man5
  188 +# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente
  189 +cd lib-findlib && ./configure && make all && make install && cd ..
  190 +cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..
  191 +cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..
  192 +
  193 +# poppler-utils, versão 0.42.0 ou superior
  194 +cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && cd ..
  195 +
  196 +# Centos 6.9
  197 +# \_ Termina o shell usando o GCC 4.8 por default
  198 +exit
  199 +
  200 +# ----------------------- INSTALAÇÃO (obs.: os comandos devem ser executados como root)
  201 +
  202 +## Comandos adicionais para configuração do módulo:
  203 +
  204 +# Criação do usuário
  205 +adduser ocr
  206 +
  207 +# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional
  208 +cp ./usr/local/bin/ocr /usr/local/bin
  209 +
  210 +# Auto start (RedHat 6.7 e CentOs 6.9)
  211 +cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr
  212 +mv /etc
  213 +chkconfig --add ocr
  214 +chkconfig --level 2345 ocr on
  215 +
  216 +# Auto start (Ubuntu 14.04)
  217 +cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr
  218 +update-rc.d ocr defaults
  219 +
  220 +# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações
  221 +cd /home/ocr
  222 +tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr
  223 +su
  224 +
  225 +# Copie o pacote para os outros servidores e extraia com:
  226 +cd /
  227 +tar xovzf pkg-ocr.tgz
  228 +
  229 +# Instalando pré-requisitos RUNTIME em servidores adicionais
  230 +
  231 +# Redhat 6.7 e CentOS 6.9
  232 +yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp ghostscript
  233 +yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext
  234 +
  235 +# Ubuntu 14.04
  236 +apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14
  237 +apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 ghostscript
  238 +
  239 +# Inicie o serviço com
  240 +service ocr start
... ...
usr/local/bin/ocr
1   -#! /usr/bin/perl -w
  1 +#!/usr/bin/perl -w
2 2 #
3   -# OCR Server 1.0.4 - (c) Agencia Nacional de Telecomunicacoes
  3 +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes
4 4 #
5 5 # This script monitors a set of input directories for PDF files
6 6 # once a new file is detected, it is processes through tesseract OCR
... ... @@ -24,15 +24,38 @@
24 24 # 1.0.1 Solving error when file has no images
25 25 # 1.0.2 Fix bug when counting cores for AMD processors
26 26 # 1.0.3 Added better image type detection
27   -# 1.0.4 Fix: added ubuntu init script
  27 +# 1.0.4 Fix: added ubuntu init script
  28 +# 1.0.4b Add Centos 6.9 install instructions
  29 +# 2.0 PDF/A output, and better compression with ghostscript --> for this to work, Tesseract 4.0 is
  30 +# strongly recommended
  31 +# Rewritten image extraction, processing and transformation process
  32 +# Check if input file is signed; in this case, the file contents are not changed
  33 +# Added '--oem 0' option to tesseract (force legacy mode on tesseract 4)
  34 +# Use operating system packages by default
  35 +# Changed paths from external programs, instead of using full paths, uses first match from $PATH
  36 +# Check existence of external programs on path before running
  37 +# Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
  38 +# Fix: create subpaths on error folder
  39 +# Fix: trying to reduce overhead on temporary folder
28 40 #
29 41 # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it
30   -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
31   -# diferently but does not treat it adequately
  42 +# would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them
  43 +# differently but does not treat it adequately -- shall require better handling of the PDF's internal structure
32 44 # - Review poppler and cpdf install instructions
33 45 # - Add better handling of vectorized and non scanned pdf files
34   -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
35   -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible
  46 +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current
  47 +# scaling, cropping and rotation handlers
  48 +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible --
  49 +# added function to analyse image color histogram -> just need to add option to convert it to B&W.
  50 +# - Move all parameters to config file
  51 +# - Add some job control web interface
  52 +# - Add end user interface to submit files through web
  53 +# - Add check external programs version requirements before running
  54 +#
  55 +# BUGS: - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than
  56 +# original, this is due to using pdftoppm instead of pdfimages
  57 +# - Although not properly a BUG, in the new version, the addition of a step to convert to PDF/A and other evolutions
  58 +# significantly increased the time to OCR a page, from a mean time of 1 sec/page to 3 secs/page on a 16 core server
36 59 #
37 60 # Check software requirements on the comments bellow
38 61 #
... ... @@ -54,8 +77,8 @@ use Sys::Hostname;
54 77 use IPC::Open3;
55 78 use IO::Select;
56 79  
57   -my $DEBUG = 0;
58   -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`);
  80 +my $DEBUG = 2;
  81 +my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo | grep -e '^processor' | wc -l`);
59 82 my $MAX_FILES = ( !$DEBUG ? 2 : 1) ;
60 83  
61 84 my $USER = 'ocr';
... ... @@ -63,23 +86,28 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca
63 86  
64 87 # Command dependencies
65 88  
66   -# depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher
67   -my $TESSERACT = '/usr/local/bin/tesseract -l por+eng';
  89 +# depends on tesseract-ocr and tesseract-ocr-por 3.05-dev or higher -- for PDF/A, Tesseract 4.0 is recommended
  90 +my $TESSERACT = 'tesseract --oem 0'; # if Tesseract => 4.0
  91 +#my $TESSERACT = 'tesseract'; # if Tesseract < 4.0
68 92  
69 93 # Depends on pdftk 2.02 or higher
70   -my $PDFTK = '/usr/local/bin/pdftk';
  94 +my $PDFTK = 'pdftk';
71 95  
72 96 # Depends on poppler-utils 0.42.0 or higher
73   -#my $PDINFO = '/usr/local/bin/pdfinfo';
74   -my $PDFFONTS = '/usr/local/bin/pdffonts';
75   -my $PDFIMAGES = '/usr/local/bin/pdfimages';
76   -my $PDFTOPPM = '/usr/local/bin/pdftoppm';
  97 +my $PDFFONTS = 'pdffonts';
  98 +my $PDFIMAGES = 'pdfimages';
  99 +my $PDFTOPPM = 'pdftoppm';
  100 +my $PDFUNITE = 'pdfunite';
  101 +my $PDFSIG = 'pdfsig';
77 102  
78 103 # Depends on cpdf 2.1 or higher
79   -my $CPDF = '/usr/local/bin/cpdf';
  104 +my $CPDF = 'cpdf';
  105 +
  106 +# Depends on Ghostscript 9.18
  107 +my $GS = 'gs';
80 108  
81 109 ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner
82   -my $CONVERT = '/usr/bin/convert';
  110 +my $CONVERT = 'convert';
83 111  
84 112 # If it is needed further filtering
85 113 #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 ';
... ... @@ -90,12 +118,14 @@ my @BASE_DIRS = ( &#39;/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/&#39;,
90 118 my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' );
91 119  
92 120 @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2);
93   -%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG);
  121 +%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG==2);
94 122  
95 123 # Safeguard im case of cpuinfo has not identified correctly the number of CPUs
96 124 $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS;
97 125  
98   -$ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin';
  126 +$ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin';
  127 +$ENV{'IFS'} = '\t\n';
  128 +
99 129 my ($host) = split/\./,hostname;
100 130  
101 131 use vars qw/*name *dir *prune/;
... ... @@ -107,14 +137,15 @@ sub main;
107 137 sub get_pages;
108 138 sub get_rotation;
109 139 sub get_res;
110   -sub is_ocred;
111 140 sub is_locked_ex;
112 141  
113 142  
114 143 my $expr = 'use POSIX qw(setsid)';
115 144  
116 145 my ($dumb1, $dumb2, $uid) = getpwnam ($USER);
117   -setuid ($uid) or warn "Cant set uid $uid";
  146 +if (defined $uid) {
  147 + setuid ($uid) or warn "Cant set uid $uid";
  148 +}
118 149  
119 150 $SIG{__DIE__} = 'DEFAULT';
120 151 $SIG{__WARN__} = \&die_when_called;
... ... @@ -126,6 +157,11 @@ if ($@) {
126 157 chdir('/') or die "$0: cannot chdir '/': $!\n";
127 158 open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n";
128 159  
  160 +foreach my $exec ( $TESSERACT, $PDFTK, $PDFFONTS, $PDFIMAGES, $PDFSIG, $CPDF, $GS, $CONVERT) {
  161 + die "Error: $exec not found on path: $ENV{PATH}, check dependencies\n" if ( `which $exec | wc -l ` == 0);
  162 +}
  163 +
  164 +
129 165 foreach my $DIR (@BASE_DIRS) {
130 166  
131 167 defined(my $pid = fork) or die "$0: cannot fork: $!\n";
... ... @@ -135,7 +171,7 @@ foreach my $DIR (@BASE_DIRS) {
135 171 main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR});
136 172 exit 0;
137 173 last;
138   - }
  174 + }
139 175 }
140 176  
141 177 exit 0;
... ... @@ -157,7 +193,7 @@ sub main {
157 193 # remove .tmp file
158 194 unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) );
159 195  
160   - # Rename files that were in 'processig' back
  196 + # Rename files that were in 'processing' state back
161 197 foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) {
162 198 my $old_name = $file;
163 199 $old_name =~ s/\.${host}\.processing$//g;
... ... @@ -177,12 +213,14 @@ sub main {
177 213 # Main loop
178 214 while ( 1 ) {
179 215 select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced
  216 +
180 217 $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} ));
181 218 print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in;
182 219 $count = scalar keys %files_in;
183   - foreach my $file (keys %files_in) {
184 220  
185   - next if ( glob ("$file.*.tmp"));
  221 + foreach my $file (sort { ((-f $a) ? (stat $a)[9] : 0) <=> ((-f $b) ? (stat $b)[9] : 0)} keys %files_in ) {
  222 +
  223 + next if ( glob ("\"$file.*.tmp\""));
186 224  
187 225 select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds
188 226 next if (!defined $files_in{$file}); # continue only if it is still valid
... ... @@ -255,7 +293,7 @@ sub ocr {
255 293 remove_tree ($tmpdir,{ error=> \my $dumb });
256 294 unlink ("$in_file.$host.tmp");
257 295 move ( "$in_file.$host.processing", $in_file);
258   - exit 0;
  296 + exit 1;
259 297 };
260 298  
261 299 my $out_path = $in_path;
... ... @@ -271,7 +309,7 @@ sub ocr {
271 309 my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: "");
272 310  
273 311 print "\twritting to $out_file\n" if $DEBUG;
274   -
  312 +
275 313 my $stime = time;
276 314 my %pids;
277 315  
... ... @@ -291,8 +329,26 @@ sub ocr {
291 329 remove_tree ($tmpdir,{ error=> \my $dumb });
292 330 unlink ("$in_file.$host.tmp");
293 331 move ( "$in_file.$host.processing", $in_file);
  332 + print "Error: cannot copy $in_file to temp dir \n" if $DEBUG;
  333 + syslog ("error","cannot copy $in_file to temp dir") if !$DEBUG;
  334 + exit 1;
294 335 };
295 336  
  337 + # Check if file was signed
  338 + if (get_sign($tmp_file)) {
  339 + if (!copy ("$in_file.$host.processing", $proc_file)) {
  340 + remove_tree ($tmpdir,{ error=> \my $dumb });
  341 + unlink ("$in_file.$host.tmp");
  342 + move ( "$in_file.$host.processing", $in_file);
  343 + };
  344 + move ("$in_file.$host.processing", $out_file);
  345 + unlink ("$in_file.$host.tmp");
  346 + print "OCR processed: $in_file not OCRed due to having a signature within" if $DEBUG;
  347 + syslog ("info","OCR processed: $in_file not OCRed due to having a signature within") if !$DEBUG;
  348 +
  349 + exit 0;
  350 + }
  351 +
296 352 # Extract pages
297 353 ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf");
298 354 if ($DEBUG) {
... ... @@ -301,12 +357,13 @@ sub ocr {
301 357 print "\t\t\t$_" for @err ;
302 358 };
303 359  
  360 + my ($pages, @pg_w, @pg_h, @pg_r, @pg_crop_x1, @pg_crop_y1, @pg_crop_x2, @pg_crop_y2);
  361 + $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r, \@pg_crop_x1, \@pg_crop_y1, \@pg_crop_x2, \@pg_crop_y2);
304 362  
305   - my ($pages, @pg_w, @pg_h, @pg_r);
306   - $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r);
  363 + my ($imgs,@page_img, @img_w, @img_h, @img_t, @img_xppi, @img_yppi);
  364 + $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t, \@img_xppi, \@img_yppi);
307 365  
308   - my ($imgs,@page_img, @img_w, @img_h, @img_t);
309   - $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t);
  366 + unlink ($tmp_file) if (!$DEBUG);
310 367  
311 368 for ( my $i=0; $i< $pages; $i++ ) {
312 369 my $pg = sprintf ("pg_%06d", $i+1);
... ... @@ -333,25 +390,29 @@ sub ocr {
333 390 if (! defined $img_t[$i] ) {
334 391 move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf");
335 392 print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG;
336   - exit 0;
  393 + exit -1;
337 394 }
338 395  
339   - print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i]\n" if $DEBUG;
  396 + print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i] " if $DEBUG;
  397 + print "(cropbox: $pg_crop_x1[$i] x $pg_crop_y1[$i] - $pg_crop_x2[$i] x $pg_crop_y2[$i])\n" if (defined $pg_crop_x1[$i] && $DEBUG);
  398 + print "\n" if ($DEBUG);
340 399  
  400 + # Extract images from page; the extraction tool and output format (JPEG or TIFF) depend on the detected image type
341 401 undef $cmd;
342 402  
343   - if ($img_t[$i] eq "gray") {
344   - $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
  403 + # Use PDFIMAGES and JPEG by default
  404 + $cmd = "${PDFIMAGES} -j \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
  405 +
  406 + if ($img_t[$i] eq "stencil") {
  407 + $cmd = "${PDFTOPPM} -tiff -tiffcompression deflate -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
345 408 }
346 409  
347   - if ($img_t[$i] eq "rgb") {
348   - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
349   - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM
  410 + if ($img_t[$i] eq "gray") {
  411 + $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
350 412 }
351 413  
352   - if (!defined $cmd) {
353   - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
354   - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM
  414 + if ($img_t[$i] !~ /gray|rgb|stencil/) {
  415 + $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
355 416 }
356 417  
357 418 ($exit,$cmd,@out,@err) = exec_cmd($cmd);
... ... @@ -362,7 +423,13 @@ sub ocr {
362 423 };
363 424  
364 425 # Process each resulting image for page pdf
365   - my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif)/i , in => ${tmpdir} )) ;
  426 + my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif|tiff|jpeg|jp2|jb2|png)/i , in => ${tmpdir} )) ;
  427 +
  428 + if (scalar @images == 0) {
  429 + move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf");
  430 + print "\t\t${in_file}: ".(${i}+1)." / $pages: Page was not exported as a tesseract supported format -- not OCRing\n" if $DEBUG;
  431 + exit 0;
  432 + }
366 433  
367 434 foreach my $image (@images) {
368 435 print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG;
... ... @@ -378,43 +445,65 @@ sub ocr {
378 445 print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n";
379 446 }
380 447 }
381   -
382   - # Check if page was rotated
383   - if ($pg_r[$i]) {
384   - print "\t\t\t${image} unrotate: $pg_r[$i] graus ".(${i}+1)." / $pages\n" if $DEBUG;
385   - ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate $pg_r[$i] \"$image\"");
  448 +
  449 + # Check if page was rotated and extracted with pdftoppm
  450 + if ($cmd =~ /\Q$PDFTOPPM/ && $pg_r[$i]) {
  451 + print "\t\t\t${image} unrotate: -$pg_r[$i] degs ".(${i}+1)." / $pages\n" if $DEBUG;
  452 + ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate ". (360 - $pg_r[$i])." \"$image\"");
386 453 if ($DEBUG) {
387 454 print "\t\t\t${image} -> $cmd: $exit\n";
388 455 print "\t\t\t\t$_" for @out ;
389 456 print "\t\t\t\t$_" for @err ;
390 457 };
391 458 }
392   -
  459 +
393 460 # Filter ppm images, if needed
394 461  
395 462 # OCR ppm images to pdf pages
396   - ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} \"${image}\" \"${image}\" pdf");
  463 + ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} -l por+eng \"${image}\" \"${image}\" pdf");
397 464 if ($DEBUG) {
398 465 print "\t\t\t${image} -> $cmd: $exit\n";
399 466 print "\t\t\t\t$_" for @out ;
400 467 print "\t\t\t\t$_" for @err ;
401 468 };
  469 + unlink ("$image") if (!$DEBUG);
402 470  
403   - # Scale to fit pdf
404   - ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf");
  471 + # Scale, crop and rotate to fit pdf
  472 + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf");
405 473 if ($DEBUG) {
406 474 print "\t\t\t${image} -> $cmd: $exit\n";
407 475 print "\t\t\t\t$_" for @out ;
408 476 print "\t\t\t\t$_" for @err ;
409 477 };
  478 + unlink ("$image.pdf") if (!$DEBUG);
410 479  
  480 + if (defined $pg_crop_x1[$i]) {
  481 + # adjust cropbox
  482 + ($pg_crop_x1[$i], $pg_crop_y1[$i],$pg_crop_x2[$i],$pg_crop_y2[$i]) = (
  483 + ($pg_crop_x1[$i]<$pg_crop_x2[$i]?$pg_crop_x1[$i]:$pg_crop_x2[$i]),
  484 + ($pg_crop_y1[$i]<$pg_crop_y2[$i]?$pg_crop_y1[$i]:$pg_crop_y2[$i]),
  485 + abs($pg_crop_x2[$i]-$pg_crop_x1[$i]),abs($pg_crop_y2[$i]- $pg_crop_y1[$i])
  486 + );
  487 +
  488 + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -crop \"$pg_crop_x1[$i] $pg_crop_y1[$i] $pg_crop_x2[$i] $pg_crop_y2[$i]\" \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf");
  489 + if ($DEBUG) {
  490 + print "\t\t\t${image} -> $cmd: $exit\n";
  491 + print "\t\t\t\t$_" for @out ;
  492 + print "\t\t\t\t$_" for @err ;
  493 + };
  494 + }
  495 +
  496 + if ($pg_r[$i]) {
  497 + ($exit,$cmd, @out,@err) = exec_cmd( "${CPDF} -rotate $pg_r[$i] \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf");
  498 + if ($DEBUG) {
  499 + print "\t\t\t${image} -> $cmd: $exit\n";
  500 + print "\t\t\t\t$_" for @out ;
  501 + print "\t\t\t\t$_" for @err ;
  502 + };
  503 + }
411 504  
412   - unlink ("${tmpdir}/${pg}.pdf") if (!$DEBUG);
413   - unlink ("$image.pdf") if (!$DEBUG);
414   - move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}.pdf.old") if ($DEBUG);
415   - unlink ("$image") if (!$DEBUG);
416 505 }
417   - exit 0;
  506 + exit 1;
418 507 }
419 508 }
420 509  
... ... @@ -427,28 +516,51 @@ sub ocr {
427 516  
428 517 if (scalar @new_pages != $pages) {
429 518 print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG);
430   - syslog ("info","OCR: $in_file, number of output pages differ") if (!$DEBUG);
  519 + syslog ("err","OCR: $in_file, number of output pages differ") if (!$DEBUG);
431 520 unlink "$in_file.$host.tmp";
  521 + make_path ($error_path) if ( ! -d $error_path);
432 522 move ("$in_file.$host.processing", $error_file);
433   - exit (0);
  523 + exit (1);
434 524 }
435 525  
436   - # Merge resulting pdf pages to a single pdf
  526 + # Merge resulting pdf pages to a single pdf, convert to PDF/A and copy to output
437 527 make_path ($out_path) if ( ! -d $out_path);
438 528 unlink $out_file if ( -f $out_file );
439   - ($exit, $cmd, @out,@err) = exec_cmd("${PDFTK} \"${tmpdir}\"/pg_*-cpdf.pdf cat output \"${out_file}.tmp\" compress");
  529 +
  530 + chdir (${tmpdir});
  531 + ($exit, $cmd, @out,@err) = exec_cmd("${GS} -dQUIET -dBATCH -dNOPAUSE -dNOINTERPOLATE -dCompatibilityLevel=1.7 -dNumRenderingThreads=${MAX_PGS} -sDEVICE=pdfwrite -dAutoRotatePages=/None -sColorConversionStrategy=/RGB -sProcessColorModel=DeviceRGB -dAutoFilterColorImages=true -dAutoFilterGrayImages=true -dJPEGQ=95 -dPDFA=2 -dPDFACompatibilityPolicy=1 -sOutputFile=\"${tmp_file}\" pg_*-cpdf.pdf ");
440 532 if ($DEBUG) {
441 533 print "\t\t${out_file} -> $cmd: $exit\n";
442 534 print "\t\t\t$_" for @out ;
443 535 print "\t\t\t$_" for @err ;
444 536 };
  537 + if ($exit) {
  538 + unlink "$in_file.$host.tmp";
  539 + unlink $out_file;
  540 + make_path ($error_path) if ( ! -d $error_path);
  541 + move ("$in_file.$host.processing", $error_file);
  542 + print "\t\t${out_file} -> Error concatenating pages and converting to PDF/A (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG);
  543 + syslog ("err","OCR: $in_file, error concatenating pages and converting to PDF/A") if (!$DEBUG);
  544 + exit (1);
  545 + }
  546 + chdir ("/");
  547 +
  548 + if (!copy (${tmp_file}, $out_file)) {
  549 + remove_tree ($tmpdir,{ error=> \my $dumb });
  550 + unlink ("$in_file.$host.tmp");
  551 + unlink $out_file;
  552 + make_path ($error_path) if ( ! -d $error_path);
  553 + move ("$in_file.$host.processing", $error_file);
  554 + print "Error: cannot copy temp file to $out_file \n" if $DEBUG;
  555 + syslog ("error","cannot copy temp file to $out_file") if !$DEBUG;
  556 + exit 1;
  557 + };
445 558  
446 559 make_path ($proc_path) if ( ! -d $proc_path);
447 560 unlink $proc_file if ( -f $proc_file );
448 561 move ("$in_file.$host.processing", $proc_file);
449 562 move ("${out_file}.tmp", ${out_file});
450 563  
451   -
452 564 # Remove temp dir
453 565 remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG);
454 566 unlink $tmp_file if (!$DEBUG);
... ... @@ -471,7 +583,7 @@ sub is_ocred {
471 583 }
472 584  
473 585 sub get_pages {
474   - my ($in_file, $w, $h, $r) = @_;
  586 + my ($in_file, $w, $h, $r, $x1, $y1, $x2, $y2) = @_;
475 587  
476 588 my $pages=0;
477 589 my $i=0;
... ... @@ -485,29 +597,35 @@ sub get_pages {
485 597 ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ );
486 598 ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ );
487 599 ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ );
488   - ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ );
  600 + ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ );
  601 + ($dumb, @$x1[$i-1], @$y1[$i-1], @$x2[$i-1], @$y2[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaCropRect:/ );
489 602 }
490 603  
491 604 return $pages;
492 605 }
493 606  
494 607 sub get_imgs {
495   - my ($in_file, $page_img, $w, $h, $t) = @_;
496   - my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc);
  608 + my ($in_file, $page_img, $w, $h, $t, $x_ppi, $y_ppi) = @_;
  609 + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi );
497 610  
498 611 my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\"");
  612 + $i = 0;
499 613  
500 614 foreach my $line (@lines) {
501 615 chomp $line;
502 616 $line =~ s/^ {1,}//;
503   - if ( $line =~ /image|mask/ ) {
504   - ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line;
  617 + if ( $line !~ /^page|^----/ ) {
  618 + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi) = split / {1,}/,$line;
505 619 @$page_img[$page-1]=$i;
506 620 @$w[$page-1] = $width;
507 621 @$h[$page-1] = $height;
508 622 @$t[$page-1] = "rgb"; # Default is color
509   - @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]);
510 623 @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]);
  624 + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]);
  625 + @$t[$page-1] = ( $type eq "stencil" ? $type : @$t[$page-1]);
  626 + @$t[$page-1] = ( $enc eq "image" ? $enc : @$t[$page-1]);
  627 + @$x_ppi[$page-1] = $xppi;
  628 + @$y_ppi[$page-1] = $yppi;
511 629 }
512 630 }
513 631 return $i+1;
... ... @@ -542,6 +660,19 @@ sub get_res {
542 660 return ($res_x,$res_y);
543 661 }
544 662  
  663 +sub get_sign {
  664 + my ($in_file) = @_;
  665 + my @lines = `${PDFSIG} \"${in_file}\" 2>/dev/null`;
  666 +
  667 + foreach (@lines) {
  668 + chomp;
  669 + if ( $_ =~ /^Signature/ ) {
  670 + return 1;
  671 + }
  672 + }
  673 + return 0;
  674 +}
  675 +
545 676 sub is_locked_ex {
546 677 my ($path) = @_;
547 678  
... ...
workflow.pdf
No preview for this file type
workflow.vsd
No preview for this file type