Commit f02dd8e43c99cc7ed8312d8ace23cc29f208aebb

Authored by Nei Jobson da Costa Carneiro
2 parents 78ec197b a35873e6
Exists in master

Merge branch 'Pre_versao_2.0' into 'master'

Pre versao 2.0 final para gerar Tag 2.0

Final

See merge request !4
Dockerfile 0 → 100644
... ... @@ -0,0 +1,103 @@
  1 +
  2 +FROM ubuntu:14.04
  3 +
  4 +# Cópia de arquivos do projeto OCR-SERVER
  5 +COPY usr/local/bin/ocr /usr/local/bin/ocr
  6 +COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr
  7 +COPY entrypoint.sh /entrypoint.sh
  8 +
  9 +WORKDIR /tmp
  10 +
  11 +# Instalação dos pacotes pré-requisitos do ocr-server 2
  12 +RUN apt-get -y update && \
  13 + apt-get -y install build-essential cmake libtool yasm pkg-config subversion git libgcj14 apt-utils \
  14 + curl libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev \
  15 + zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libnss3-dev \
  16 + wget cabextract xfonts-utils perl automake autoconf-archive libcurl4-gnutls-dev unzip libgcj14 \
  17 + libfile-find-rule-perl libfile-find-rule-perl-perl imagemagick gettext unpaper libtiff5 libpng12-0 \
  18 + libjpeg-turbo8 libpango1.0-0 libcairo2 fontconfig libwebp5 libfontconfig1 libgettextpo0 pkg-config gcc gcj-jdk \
  19 + rsyslog libsys-syslog-perl && \
  20 + apt-get -y clean all
  21 +
  22 +RUN wget -O mscorefonts.deb http://ftp.us.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.4+nmu1_all.deb && \
  23 + dpkg -i mscorefonts.deb && \
  24 + rm mscorefonts.deb
  25 +
  26 +# Instalação do Perl 5.1 e demais módulos
  27 +RUN perl -MCPAN -e 'install File::Touch'
  28 +RUN perl -MCPAN -e 'install File::Find::Rule;'
  29 +RUN perl -MCPAN -e 'install File::Touch;'
  30 +RUN perl -MCPAN -e 'install Sys::Syslog;'
  31 +RUN perl -MCPAN -e 'install IPC::Open3;'
  32 +RUN perl -MCPAN -e 'install IO::Select;'
  33 +
  34 +# Tesseract-ocr 3.05, com dicionários inglês e português
  35 +# Bibliotecas para o Tesseract: Leptonica
  36 +RUN git clone https://github.com/DanBloomberg/leptonica.git && \
  37 + cd leptonica && ./autobuild && ./configure && make all install && \
  38 + rm -rf ../leptonica
  39 +
  40 +# Bibliotecas para o Tesseract: Libav
  41 +RUN git clone https://github.com/libav/libav.git && \
  42 + export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ && \
  43 + cd libav && ./configure --enable-sram && make all install && \
  44 + rm -rf ../libav
  45 +
  46 +# Tesseract 3.05.01
  47 +RUN git clone https://github.com/tesseract-ocr/tesseract.git && \
  48 + cd tesseract && ./autogen.sh && ./configure && make all install && \
  49 + rm -rf ../tesseract
  50 +
  51 +RUN wget https://github.com/tesseract-ocr/tessdata/blob/master/eng.traineddata?raw=true -O /usr/local/share/tessdata/eng.traineddata && \
  52 + wget https://github.com/tesseract-ocr/tessdata/blob/master/por.traineddata?raw=true -O /usr/local/share/tessdata/por.traineddata && \
  53 + wget https://github.com/tesseract-ocr/tessdata/blob/master/osd.traineddata?raw=true -O /usr/local/share/tessdata/osd.traineddata
  54 +
  55 +# Poppler 0.56
  56 +RUN git clone -b poppler-0.56 https://anongit.freedesktop.org/git/poppler/poppler.git && \
  57 + cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && \
  58 + rm -rf ../poppler
  59 +
  60 +# pdftk, versão 2.02 ou superior
  61 +RUN wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip && \
  62 + unzip pdftk-2.02-src.zip && rm -f pdftk-2.02-src.zip && \
  63 + cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && \
  64 + rm -rf ../pdftk-2.02-dist
  65 +
  66 +# Ghostscript 9.18 ou superior
  67 +RUN wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz && \
  68 + tar xvozf ghostscript-9.18.tar.gz && rm -f ghostscript-9.18.tar.gz && \
  69 + cd ghostscript-9.18 && ls && ./autogen.sh; ./configure && make all install && \
  70 + rm -rf ../ghostscript-9.18
  71 +
  72 +# CPDF Intel OS X v 2.2
  73 +RUN git clone https://github.com/coherentgraphics/cpdf-binaries.git && \
  74 + cp cpdf-binaries/Linux-Intel-64bit/cpdf /usr/bin
  75 +
  76 +# Atualização das configurações do ld
  77 +RUN ldconfig
  78 +
  79 +RUN useradd -m ocr
  80 +
  81 +RUN chmod +x /usr/local/bin/ocr && \
  82 + chmod +x /etc/init.d/ocr && \
  83 + update-rc.d ocr defaults
  84 +
  85 +RUN mkdir /var/ocr-server/ && \
  86 + mkdir -p /var/ocr-server/Entrada && \
  87 + mkdir -p /var/ocr-server/Saida && \
  88 + mkdir -p /var/ocr-server/Originais_Processados && \
  89 + mkdir -p /var/ocr-server/Erro && \
  90 + chmod +x /entrypoint.sh
  91 +
  92 +RUN mkdir -p /tmp/ocr_dev/ && \
  93 + mkdir -p /tmp/ocr_dev/Entrada && \
  94 + mkdir -p /tmp/ocr_dev/Saida && \
  95 + mkdir -p /tmp/ocr_dev/Originais_Processados && \
  96 + mkdir -p /tmp/ocr_dev/Erro && \
  97 + chmod -R 777 /tmp/ocr_dev
  98 +
  99 +WORKDIR /
  100 +
  101 +VOLUME /var/ocr-server/
  102 +
  103 +CMD ["bash", "/entrypoint.sh"]
0 104 \ No newline at end of file
... ...
INSTALL.txt
... ... @@ -1,202 +0,0 @@
1   -# OCR Server 1.0.4b - (c) Agencia Nacional de Telecomunicacoees
2   -#
3   -# This script monitors a set of input directories for PDF files
4   -# once a new file is detected, it is processes through tesseract OCR
5   -# in order to generate a new file with a hidden searchable text layer
6   -#
7   -# It may be distributed under the conditions of the LGPL v2.1 license.
8   -#
9   -# Author: Guilherme Chehab
10   -#
11   -# Version History:
12   -# 0.1 Initial single server version
13   -# 0.2 Check if page already has the html hidden layer, if so, ignore it
14   -# 0.3 Solved issues about various image enconding types
15   -# 0.4 Added a postnormalization step to ensure all output pdf pages have
16   -# the same size and orientations as the original files
17   -# 0.5 Used input file renaming as a way to sync multiple parallel instances,
18   -# that way, it is minimized the risk of same file being OCRed multiple times.
19   -# 0.6 Added a default handler for unknown image encoding using jpeg encoding
20   -# 0.7 Solved an issue with files with more than 1000 pages
21   -# 1.0 First release version
22   -# 1.0.1 Solving error when file has no images
23   -# 1.0.2 Fix bug when counting cores for AMD processors
24   -# 1.0.3 Added better image type detection
25   -# 1.0.4 Fix: added ubuntu init script
26   -# 1.0.4b Centos 6.9
27   -#
28   -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it
29   -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
30   -# diferently but does not treat it adequately
31   -# - Review poppler and cpdf install instructions
32   -# - Add better handling of vectorized and non scanned pdf files
33   -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
34   -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible
35   -#
36   -# Check software requirements on the comments bellow
37   -#
38   -# To configure input dirs change @BASE_DIRS and @SUB_DIRS variables
39   -#
40   -#
41   -# O servidor OCR depende dos seguintes componentes:
42   -# - Perl 5.10.1, com seguintes módulos:
43   -# - File::Find::Rule
44   -# - File::Basename
45   -# - File::Copy
46   -# - File::Path
47   -# - File::Touch
48   -# - Sys::Syslog
49   -# - Sys::Hostname
50   -# - IPC::Open3
51   -# - IO::Select
52   -# - POSIX
53   -# - Tesseract-ocr 3.05, com dicionários inglês e português
54   -# - Pdftk 2.02
55   -# - Poppler-utils 0.42.0
56   -# - Cpdf 2.1
57   -# - ImageMagick 6.7.2-7
58   -#
59   -# Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema
60   -# Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.
61   -# Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.
62   -#
63   -## ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.
64   -#
65   -# Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':
66   -#
67   -# @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script
68   -# @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro
69   -# $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)
70   -# $MAX_PGS: Número máximo de páginas que podem ser processadas simultanemante por arquivo de entrada (default: no. de CPUs)
71   -# Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.
72   -# Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.
73   -# A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.
74   -#
75   -# Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.
76   -#
77   -#
78   -# ----------------------- COMPILAÇÃO dos pré requisitos (obs.: os comandos de devem ser executados como root)
79   -#
80   -#
81   -# Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS
82   -#
83   -# RedHat 6.7 e Centos 6.9:
84   -yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip
85   -yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel
86   -cd /tmp
87   -wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm
88   -rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm
89   -rm -f msttcore-fonts-2.0-3.noarch.rpm
90   -
91   -# Centos 6.9
92   -# \_ autoconf-archive
93   -wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm
94   -rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm
95   -rm autoconf-archive-2012.04.07-7.3.noarch.rpm
96   -# \_ GCC 4.8
97   -wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
98   -yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj
99   -
100   -# Ubuntu 14.04 Server:
101   -apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14
102   -apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev
103   -apt-get install ttf-mscorefonts-installer
104   -
105   -# Ambas plataformas:
106   -cd /usr/local/src
107   -
108   -for i in \
109   - https://github.com/tesseract-ocr/langdata.git \
110   - https://github.com/DanBloomberg/leptonica.git \
111   - https://github.com/libav/libav.git \
112   - https://github.com/tesseract-ocr/tessdata.git \
113   - https://github.com/tesseract-ocr/tesseract.git \
114   - git://git.freedesktop.org/git/poppler/poppler.git \
115   - git://git.freedesktop.org/git/poppler/test.git \
116   - https://github.com/Flameeyes/unpaper.git \
117   - https://github.com/ocaml/ocaml.git \
118   - https://gitlab.camlcity.org/gerd/lib-findlib.git \
119   - https://github.com/johnwhitington/camlpdf.git \
120   - https://github.com/johnwhitington/cpdf-source.git \
121   -; do git clone $i; done
122   -
123   -wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
124   -unzip pdftk-2.02-src.zip
125   -rm -f pdftk-2.02-src.zip
126   -
127   -# pdftk, versão 2.02 ou superior
128   -cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..
129   -
130   -# Centos 6.9
131   -# \_ Cria um novo shell usando o GCC 4.8 por default
132   -scl enable devtoolset-2 bash
133   -
134   -# Tesseract, versão 3.05-dev ou superior
135   -# Bibliotecas para o Tesseract: Leptonica e Libav
136   -cd leptonica && ./autobuild && ./configure && make all install && cd ..
137   -
138   -# Para compilação do Tesseract após a compilação do leptonica
139   -export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/
140   -
141   -cd libav && ./configure --enable-sram && make all install && cd ..
142   -
143   -# Tesseract
144   -cd tesseract && ./autogen.sh && ./configure && make all install && cd ..
145   -cp -avR tessdata/* /usr/local/share/tessdata/
146   -
147   -# cpdf, versão 2.1 ou superior
148   -cd ocaml && ./configure && make world.opt && make install && cd ..
149   -mkdir -p /usr/local/man/man5
150   -# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente
151   -cd lib-findlib && ./configure && make all && make install && cd ..
152   -cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..
153   -cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..
154   -
155   -# poppler-utils, versão 0.42.0 ou superior
156   -cd poppler && ./autogen.sh && ./configure && make all install && cd ..
157   -
158   -# Centos 6.9
159   -# \_ Termina o shell usando o GCC 4.8 por default
160   -exit
161   -
162   -# ----------------------- INSTALAÇÃO (obs.: os comandos de devem ser executados como root)
163   -
164   -## Comandos adicionais para configuração do módulo:
165   -
166   -# Criação do usuário
167   -adduser ocr
168   -
169   -# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional
170   -cp ./usr/local/bin/ocr /usr/local/bin
171   -
172   -# Auto start (RedHat 6.7 e CentOs 6.9)
173   -cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr
174   -mv /etc
175   -chkconfig --add ocr
176   -chkconfig --level 2345 ocr on
177   -
178   -# Auto start (Ubuntu 14.04)
179   -cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr
180   -update-rd.d ocr defaults
181   -
182   -# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações
183   -cd /home/ocr
184   -tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr
185   -su
186   -
187   -# Copie o pacote para os outros servidores e extraia com:
188   -cd /
189   -tar xovzf pkg-ocr.tgz
190   -
191   -# Instalando pré-requisitos RUNTIME em servidores adicionais
192   -
193   -# Redhat 6.7 e CentOS 6.9
194   -yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp
195   -yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext
196   -
197   -# Ubuntu 14.04
198   -apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14
199   -apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0
200   -
201   -# Inicie o serviço com
202   -service ocr start
README.md 0 → 100644
... ... @@ -0,0 +1,260 @@
  1 +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes
  2 +
  3 +This script monitors a set of input directories for PDF files. Once a new file is detected, it is processed through tesseract OCR in order to generate a new file with a hidden searchable text layer.
  4 +
  5 +It may be distributed under the conditions of the LGPL v2.1 license.
  6 +
  7 +Author: Guilherme Chehab
  8 +
  9 +## Version History:
  10 + - 0.1
  11 + - Initial single server version
  12 + - 0.2
  13 + - Check if page already has the html hidden layer, if so, ignore it
  14 + - 0.3
  15 + - Solved issues about various image encoding types
  16 + - 0.4
  17 + - Added a postnormalization step to ensure all output pdf pages have the same size and orientations as the original files
  18 + - 0.5
  19 + - Used input file renaming as a way to sync multiple parallel instances, that way, it is minimized the risk of same file being OCRed multiple times.
  20 + - 0.6
  21 + - Added a default handler for unknown image encoding using jpeg encoding
  22 + - 0.7
  23 + - Solved an issue with files with more than 1000 pages
  24 + - 1.0
  25 + - First release version
  26 + - 1.0.1 Solving error when file has no images
  27 + - 1.0.2 Fix bug when counting cores for AMD processors
  28 + - 1.0.3 Added better image type detection
  29 + - 1.0.4 Fix: added ubuntu init script
  30 + - 1.0.4b Add Centos 6.9 install instructions
  31 + - 2.0
  32 + - PDF/A output, and better compression with ghostscript
  33 + - Rewritten image extraction, processing and transformations process
  34 + - Check if input file is signed, in this case, does not change the file contents
  35 + - Added '-oem 0' option to tesseract (force legacy mode on tesseract 4)
  36 + - Use operating system packages by default
  37 + - Changed paths from external programs, instead of using full paths, uses first match from $PATH
  38 + - Check existence of external programs on path before running
  39 + - Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
  40 + - Fix: create subpaths on error folder
  41 + - Fix: trying to reduce overhead on temporary folder
  42 +
  43 +## TODO:
  44 + - Changes get_imgs and OCR processing to enable pages with more than one image -- it would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them differently but does not treat it adequately -- shall require better pdf´s internal structure handling
  45 + - Review poppler and cpdf install instructions
  46 + - Add better handling of vectorized and non scanned pdf files
  47 + - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current scaling, cropping and rotation handlers
  48 + - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- added function to analyse image color histogram -> just need to add option to convert it to B&W.
  49 + - Move all parameters to config file
  50 + - Add some job control web interface
  51 + - Add end user interface to submit files through web
  52 + - Add check external programs version requirements before running
  53 +
  54 +## BUGS:
  55 + - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than original, this is due to using pdftoppm instead of pdfimages
  56 +
  57 +## Requirements:
  58 + - Perl 5.10.1, com seguintes módulos:
  59 + - File::Find::Rule
  60 + - File::Basename
  61 + - File::Copy
  62 + - File::Path
  63 + - File::Touch
  64 + - Sys::Syslog
  65 + - Sys::Hostname
  66 + - IPC::Open3
  67 + - IO::Select
  68 + - POSIX
  69 + - Tesseract-ocr 3.05, com dicionários inglês e português
  70 + - Pdftk 2.02
  71 + - Poppler-utils 0.42.0
  72 + - Cpdf 2.1
  73 + - ImageMagick 6.7.2-7
  74 + - Ghostscript 9.18
  75 +
  76 +Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema
  77 +
  78 +Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento.
  79 +
  80 +Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes.
  81 +
  82 +ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado.
  83 +
  84 +### Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr':
  85 +
  86 +- @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script
  87 +- @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro
  88 +- $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2)
  89 +- $MAX_PGS: Número máximo de páginas que podem ser processadas simultaneamente por arquivo de entrada (default: no. de CPUs)
  90 +
  91 +Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS.
  92 +
  93 +Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos.
  94 +
  95 +A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página.
  96 +
  97 +Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS.
  98 +
  99 +
  100 +# Container Docker
  101 +
  102 + O OCR-Server também está disponível como um container Docker, permitindo o rápido provisionamento da solução em ambiente de produção. Todos os procedimentos para construção da imagem do container podem ser encontrados no arquivo Dockerfile.
  103 +
  104 + Para execução do serviço, basta ter o docker instalado no servidor e executar o seguinte comando:
  105 +
  106 + docker run --name <NOME_CONTAINER> -d -v <DIRETORIO_BASE>:/var/ocr-server guilhermeadc/ocr-server
  107 +
  108 + Onde:
  109 + --name : Nome atribuído à instância do container. Ex: ocr-server
  110 + -d : Indicação executar o container em background
  111 + -v : Diretório de compartilhamento entre o servidor host e o container.
  112 + O parâmetro <DIRETORIO_BASE> deve ser substituído pelo diretório base para busca de arquivos.
  113 +
  114 + Para visualizar os logs de processamento do serviço, basta executar o seguinte comando:
  115 + docker logs <NOME_CONTAINER>
  116 +
  117 +
  118 +# COMPILAÇÃO dos pré requisitos (obs.: os comandos devem ser executados como root)
  119 +
  120 +Em servidor Ubuntu 16.04, os pacotes padrão (com exceção do CPDF, que não tem no repositório oficial)
  121 +são suficientes para executar o aplicativo, não havendo necessidade de compilar todos, assim é a arquitetura recomendada
  122 +
  123 +Quanto ao CPDF, é possível baixar a versão binária em: https://github.com/coherentgraphics/cpdf-binaries
  124 +
  125 +## Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS
  126 +
  127 + # RedHat 6.7 e Centos 6.9:
  128 + yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip
  129 + yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel libcurl-devel nss-devel
  130 + cd /tmp
  131 + wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm
  132 + rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm
  133 + rm -f msttcore-fonts-2.0-3.noarch.rpm
  134 +
  135 + # Centos 6.9
  136 + # \_ autoconf-archive
  137 + wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm
  138 + rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm
  139 + rm autoconf-archive-2012.04.07-7.3.noarch.rpm
  140 + # \_ GCC 4.8
  141 + wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo
  142 + yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj
  143 +
  144 + # Ubuntu 14.04 Server:
  145 + apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14
  146 + apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libcurl4-gnutls-dev libnss3-dev
  147 + apt-get install ttf-mscorefonts-installer
  148 +
  149 + # Ambas plataformas:
  150 + cd /usr/local/src
  151 +
  152 + for i in \
  153 + https://github.com/tesseract-ocr/langdata.git \
  154 + https://github.com/DanBloomberg/leptonica.git \
  155 + https://github.com/libav/libav.git \
  156 + https://github.com/tesseract-ocr/tessdata.git \
  157 + https://github.com/tesseract-ocr/tesseract.git \
  158 + git://git.freedesktop.org/git/poppler/poppler.git \
  159 + git://git.freedesktop.org/git/poppler/test.git \
  160 + https://github.com/Flameeyes/unpaper.git \
  161 + https://github.com/ocaml/ocaml.git \
  162 + https://gitlab.camlcity.org/gerd/lib-findlib.git \
  163 + https://github.com/johnwhitington/camlpdf.git \
  164 + https://github.com/johnwhitington/cpdf-source.git \
  165 + http://git.ghostscript.com/ghostpdl.git \
  166 + ; do git clone $i; done
  167 +
  168 + wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip
  169 + unzip pdftk-2.02-src.zip
  170 + rm -f pdftk-2.02-src.zip
  171 +
  172 + # pdftk, versão 2.02 ou superior
  173 + cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../..
  174 +
  175 + # Ghostscript 9.18 ou superior
  176 + #wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.21.tar.gz
  177 + #tar xvozf ghostscript-9.21.tar.gz
  178 + #rm -f ghostscript-9.21.tar.gz
  179 + #cd ghostscript-9.21
  180 + cd ghostpdl
  181 + ./autogen.sh; ./configure
  182 + make all install
  183 + cd ..
  184 +
  185 + # Centos 6.9
  186 + # \_ Cria um novo shell usando o GCC 4.8 por default
  187 + scl enable devtoolset-2 bash
  188 +
  189 + # Tesseract, versão 3.05-dev ou superior
  190 + # Bibliotecas para o Tesseract: Leptonica e Libav
  191 + cd leptonica && ./autobuild && ./configure && make all install && cd ..
  192 +
  193 + # Para compilação do Tesseract após a compilação do leptonica
  194 + export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/
  195 +
  196 + cd libav && ./configure --enable-sram && make all install && cd ..
  197 +
  198 + # Tesseract
  199 + cd tesseract && ./autogen.sh && ./configure && make all install && cd ..
  200 + cp -avR tessdata/* /usr/local/share/tessdata/
  201 +
  202 + # cpdf, versão 2.1 ou superior
  203 + cd ocaml && ./configure && make world.opt && make install && cd ..
  204 + mkdir -p /usr/local/man/man5
  205 + # lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente
  206 + cd lib-findlib && ./configure && make all && make install && cd ..
  207 + cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd ..
  208 + cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd ..
  209 +
  210 + # poppler-utils, versão 0.42.0 ou superior
  211 + cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && cd ..
  212 +
  213 + # Centos 6.9
  214 + # \_ Termina o shell usando o GCC 4.8 por default
  215 + exit
  216 +
  217 +
  218 +## Comandos adicionais para configuração do módulo:
  219 +
  220 + # Criação do usuário
  221 + adduser ocr
  222 +
  223 + # Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional
  224 + cp ./usr/local/bin/ocr /usr/local/bin
  225 +
  226 + # Auto start (RedHat 6.7 e CentOs 6.9)
  227 + cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr
  228 + mv /etc
  229 + chkconfig --add ocr
  230 + chkconfig --level 2345 ocr on
  231 +
  232 + # Auto start (Ubuntu 14.04)
  233 + cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr
  234 + update-rc.d ocr defaults
  235 +
  236 + # Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações
  237 + cd /home/ocr
  238 + tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr
  239 + su
  240 +
  241 +# INSTALAÇÃO (obs.: os comandos devem ser executados como root)
  242 + # Criação do usuário
  243 + adduser ocr
  244 +
  245 + # Copie o pacote para os outros servidores e extraia com:
  246 + cd /
  247 + tar xovzf pkg-ocr.tgz
  248 +
  249 + # Instalando pré-requisitos RUNTIME em servidores adicionais
  250 +
  251 + # Redhat 6.7 e CentOS 6.9
  252 + yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp ghostscript
  253 + yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext
  254 +
  255 + # Ubuntu 14.04
  256 + apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14
  257 + apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 ghostscript
  258 +
  259 +# Inicie o serviço com
  260 + service ocr start
... ...
entrypoint.sh 0 → 100644
... ... @@ -0,0 +1,17 @@
  1 +#!/usr/bin/env bash
  2 +
  3 +# Inicializa serviço de log
  4 +/etc/init.d/rsyslog start
  5 +
  6 +# Cria estrutura de pastas para monitoramento de arquivos
  7 +mkdir -p /var/ocr-server/
  8 +mkdir -p /var/ocr-server/Entrada
  9 +mkdir -p /var/ocr-server/Saida
  10 +mkdir -p /var/ocr-server/Originais_Processados
  11 +mkdir -p /var/ocr-server/Erro
  12 +chmod -R 777 /var/ocr-server
  13 +
  14 +# Iniciar serviço do OCR-Server
  15 +service ocr start
  16 +
  17 +tail -f /var/log/syslog
0 18 \ No newline at end of file
... ...
usr/local/bin/ocr
1   -#! /usr/bin/perl -w
  1 +#!/usr/bin/perl -w
2 2 #
3   -# OCR Server 1.0.4 - (c) Agencia Nacional de Telecomunicacoes
  3 +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes
4 4 #
5 5 # This script monitors a set of input directories for PDF files
6 6 # once a new file is detected, it is processes through tesseract OCR
... ... @@ -24,15 +24,38 @@
24 24 # 1.0.1 Solving error when file has no images
25 25 # 1.0.2 Fix bug when counting cores for AMD processors
26 26 # 1.0.3 Added better image type detection
27   -# 1.0.4 Fix: added ubuntu init script
  27 +# 1.0.4 Fix: added ubuntu init script
  28 +# 1.0.4b Add Centos 6.9 install instructions
  29 +# 2.0 PDF/A output, and better compression with ghostscript --> for this to work, Tesseract 4.0 is
  30 +# strongly recommended
  31 +# Rewritten image extraction, processing and transformation process
  32 +# Check if input file is signed, in this case, does not change the file contents
  33 +# Added '--oem 0' option to tesseract (force legacy mode on tesseract 4)
  34 +# Use operating system packages by default
  35 +# Changed paths of external programs: instead of using full paths, use the first match from $PATH
  36 +# Check existence of external programs on path before running
  37 +# Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings
  38 +# Fix: create subpaths on error folder
  39 +# Fix: trying to reduce overhead on temporary folder
28 40 #
29 41 # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it
30   -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
31   -# diferently but does not treat it adequately
  42 +# would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them
  43 +# differently but does not treat it adequately -- shall require better handling of the PDF's internal structure
32 44 # - Review poppler and cpdf install instructions
33 45 # - Add better handling of vectorized and non scanned pdf files
34   -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
35   -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible
  46 +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current
  47 +# scaling, cropping and rotation handlers
  48 +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible --
  49 +# added function to analyse image color histogram -> just need to add option to convert it to B&W.
  50 +# - Move all parameters to config file
  51 +# - Add some job control web interface
  52 +# - Add end user interface to submit files through web
  53 +# - Add check external programs version requirements before running
  54 +#
  55 +# BUGS: - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than
  56 +# original, this is due to using pdftoppm instead of pdfimages
  57 +# - Although not properly a BUG, in the new version, the addition of a step to convert to PDF/A and other evolutions
  58 +# significantly increased the time to OCR a page, from a mean time of 1 sec/page to 3 secs/page on a 16 core server
36 59 #
37 60 # Check software requirements in the comments below
38 61 #
... ... @@ -55,7 +78,7 @@ use IPC::Open3;
55 78 use IO::Select;
56 79  
57 80 my $DEBUG = 0;
58   -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`);
  81 +my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo | grep -e '^processor' | wc -l`);
59 82 my $MAX_FILES = ( !$DEBUG ? 2 : 1) ;
60 83  
61 84 my $USER = 'ocr';
... ... @@ -63,39 +86,48 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca
63 86  
64 87 # Command dependencies
65 88  
66   -# depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher
67   -my $TESSERACT = '/usr/local/bin/tesseract -l por+eng';
  89 +# depends on tesseract-ocr and tesseract-ocr-por 3.05-dev or higher -- for PDF/A, Tesseract 4.0 is recommended
  90 +my $TESSERACT = 'tesseract --oem 0'; # if Tesseract => 4.0
  91 +#my $TESSERACT = 'tesseract'; # if Tesseract < 4.0
68 92  
69 93 # Depends on pdftk 2.02 or higher
70   -my $PDFTK = '/usr/local/bin/pdftk';
  94 +my $PDFTK = 'pdftk';
71 95  
72 96 # Depends on poppler-utils 0.42.0 or higher
73   -#my $PDINFO = '/usr/local/bin/pdfinfo';
74   -my $PDFFONTS = '/usr/local/bin/pdffonts';
75   -my $PDFIMAGES = '/usr/local/bin/pdfimages';
76   -my $PDFTOPPM = '/usr/local/bin/pdftoppm';
  97 +my $PDFFONTS = 'pdffonts';
  98 +my $PDFIMAGES = 'pdfimages';
  99 +my $PDFTOPPM = 'pdftoppm';
  100 +my $PDFUNITE = 'pdfunite';
  101 +my $PDFSIG = 'pdfsig';
77 102  
78 103 # Depends on cpdf 2.1 or higher
79   -my $CPDF = '/usr/local/bin/cpdf';
  104 +my $CPDF = 'cpdf';
  105 +
  106 +# Depends on Ghostscript 9.18
  107 +my $GS = 'gs';
80 108  
81 109 ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner
82   -my $CONVERT = '/usr/bin/convert';
  110 +my $CONVERT = 'convert';
83 111  
84 112 # If it is needed further filtering
85 113 #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 ';
86 114  
87   -my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/',
88   - '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' );
  115 +#my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/',
  116 +# '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' );
  117 +
  118 +my @BASE_DIRS = ('/var/ocr-server/');
89 119  
90 120 my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' );
91 121  
92 122 @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2);
93   -%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG);
  123 +%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG==2);
94 124  
95 125 # Safeguard in case cpuinfo has not correctly identified the number of CPUs
96 126 $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS;
97 127  
98   -$ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin';
  128 +$ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin';
  129 +$ENV{'IFS'} = '\t\n';
  130 +
99 131 my ($host) = split/\./,hostname;
100 132  
101 133 use vars qw/*name *dir *prune/;
... ... @@ -107,14 +139,15 @@ sub main;
107 139 sub get_pages;
108 140 sub get_rotation;
109 141 sub get_res;
110   -sub is_ocred;
111 142 sub is_locked_ex;
112 143  
113 144  
114 145 my $expr = 'use POSIX qw(setsid)';
115 146  
116 147 my ($dumb1, $dumb2, $uid) = getpwnam ($USER);
117   -setuid ($uid) or warn "Cant set uid $uid";
  148 +if (defined $uid) {
  149 + setuid ($uid) or warn "Cant set uid $uid";
  150 +}
118 151  
119 152 $SIG{__DIE__} = 'DEFAULT';
120 153 $SIG{__WARN__} = \&die_when_called;
... ... @@ -126,6 +159,11 @@ if ($@) {
126 159 chdir('/') or die "$0: cannot chdir '/': $!\n";
127 160 open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n";
128 161  
  162 +foreach my $exec ( $TESSERACT, $PDFTK, $PDFFONTS, $PDFIMAGES, $PDFSIG, $CPDF, $GS, $CONVERT) {
  163 + die "Error: $exec not found on path: $ENV{PATH}, check dependencies\n" if ( `which $exec | wc -l ` == 0);
  164 +}
  165 +
  166 +
129 167 foreach my $DIR (@BASE_DIRS) {
130 168  
131 169 defined(my $pid = fork) or die "$0: cannot fork: $!\n";
... ... @@ -135,7 +173,7 @@ foreach my $DIR (@BASE_DIRS) {
135 173 main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR});
136 174 exit 0;
137 175 last;
138   - }
  176 + }
139 177 }
140 178  
141 179 exit 0;
... ... @@ -157,7 +195,7 @@ sub main {
157 195 # remove .tmp file
158 196 unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) );
159 197  
160   - # Rename files that were in 'processig' back
  198 + # Rename files that were in 'processing' state back
161 199 foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) {
162 200 my $old_name = $file;
163 201 $old_name =~ s/\.${host}\.processing$//g;
... ... @@ -177,12 +215,14 @@ sub main {
177 215 # Main loop
178 216 while ( 1 ) {
179 217 select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced
  218 +
180 219 $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} ));
181 220 print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in;
182 221 $count = scalar keys %files_in;
183   - foreach my $file (keys %files_in) {
184 222  
185   - next if ( glob ("$file.*.tmp"));
  223 + foreach my $file (sort { ((-f $a) ? (stat $a)[9] : 0) <=> ((-f $b) ? (stat $b)[9] : 0)} keys %files_in ) {
  224 +
  225 + next if ( glob ("\"$file.*.tmp\""));
186 226  
187 227 select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds
188 228 next if (!defined $files_in{$file}); # continue only if it is still valid
... ... @@ -255,7 +295,7 @@ sub ocr {
255 295 remove_tree ($tmpdir,{ error=> \my $dumb });
256 296 unlink ("$in_file.$host.tmp");
257 297 move ( "$in_file.$host.processing", $in_file);
258   - exit 0;
  298 + exit 1;
259 299 };
260 300  
261 301 my $out_path = $in_path;
... ... @@ -271,7 +311,7 @@ sub ocr {
271 311 my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: "");
272 312  
273 313 print "\twritting to $out_file\n" if $DEBUG;
274   -
  314 +
275 315 my $stime = time;
276 316 my %pids;
277 317  
... ... @@ -291,8 +331,26 @@ sub ocr {
291 331 remove_tree ($tmpdir,{ error=> \my $dumb });
292 332 unlink ("$in_file.$host.tmp");
293 333 move ( "$in_file.$host.processing", $in_file);
  334 + print "Error: cannot copy $in_file to temp dir \n" if $DEBUG;
  335 + syslog ("error","cannot copy $in_file to temp dir") if !$DEBUG;
  336 + exit 1;
294 337 };
295 338  
  339 + # Check if file was signed
  340 + if (get_sign($tmp_file)) {
  341 + if (!copy ("$in_file.$host.processing", $proc_file)) {
  342 + remove_tree ($tmpdir,{ error=> \my $dumb });
  343 + unlink ("$in_file.$host.tmp");
  344 + move ( "$in_file.$host.processing", $in_file);
  345 + };
  346 + move ("$in_file.$host.processing", $out_file);
  347 + unlink ("$in_file.$host.tmp");
  348 + print "OCR processed: $in_file not OCRed due to having a signature within" if $DEBUG;
  349 + syslog ("info","OCR processed: $in_file not OCRed due to having a signature within") if !$DEBUG;
  350 +
  351 + exit 0;
  352 + }
  353 +
296 354 # Extract pages
297 355 ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf");
298 356 if ($DEBUG) {
... ... @@ -301,12 +359,13 @@ sub ocr {
301 359 print "\t\t\t$_" for @err ;
302 360 };
303 361  
  362 + my ($pages, @pg_w, @pg_h, @pg_r, @pg_crop_x1, @pg_crop_y1, @pg_crop_x2, @pg_crop_y2);
  363 + $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r, \@pg_crop_x1, \@pg_crop_y1, \@pg_crop_x2, \@pg_crop_y2);
304 364  
305   - my ($pages, @pg_w, @pg_h, @pg_r);
306   - $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r);
  365 + my ($imgs,@page_img, @img_w, @img_h, @img_t, @img_xppi, @img_yppi);
  366 + $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t, \@img_xppi, \@img_yppi);
307 367  
308   - my ($imgs,@page_img, @img_w, @img_h, @img_t);
309   - $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t);
  368 + unlink ($tmp_file) if (!$DEBUG);
310 369  
311 370 for ( my $i=0; $i< $pages; $i++ ) {
312 371 my $pg = sprintf ("pg_%06d", $i+1);
... ... @@ -333,25 +392,29 @@ sub ocr {
333 392 if (! defined $img_t[$i] ) {
334 393 move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf");
335 394 print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG;
336   - exit 0;
  395 + exit -1;
337 396 }
338 397  
339   - print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i]\n" if $DEBUG;
  398 + print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i] " if $DEBUG;
  399 + print "(cropbox: $pg_crop_x1[$i] x $pg_crop_y1[$i] - $pg_crop_x2[$i] x $pg_crop_y2[$i])\n" if (defined $pg_crop_x1[$i] && $DEBUG);
  400 + print "\n" if ($DEBUG);
340 401  
  402 + # Extract images from page: pdfimages -j by default, TIFF for stencil and gray pages, pdftoppm -jpeg for any other type
341 403 undef $cmd;
342 404  
343   - if ($img_t[$i] eq "gray") {
344   - $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
  405 + # Use PDFIMAGES and JPEG by default
  406 + $cmd = "${PDFIMAGES} -j \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
  407 +
  408 + if ($img_t[$i] eq "stencil") {
  409 + $cmd = "${PDFTOPPM} -tiff -tiffcompression deflate -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
345 410 }
346 411  
347   - if ($img_t[$i] eq "rgb") {
348   - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
349   - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM
  412 + if ($img_t[$i] eq "gray") {
  413 + $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
350 414 }
351 415  
352   - if (!defined $cmd) {
353   - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
354   - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM
  416 + if ($img_t[$i] !~ /gray|rgb|stencil/) {
  417 + $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}";
355 418 }
356 419  
357 420 ($exit,$cmd,@out,@err) = exec_cmd($cmd);
... ... @@ -362,7 +425,13 @@ sub ocr {
362 425 };
363 426  
364 427 # Process each resulting image for page pdf
365   - my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif)/i , in => ${tmpdir} )) ;
  428 + my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif|tiff|jpeg|jp2|jb2|png)/i , in => ${tmpdir} )) ;
  429 +
  430 + if (scalar @images == 0) {
  431 + move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf");
  432 + print "\t\t${in_file}: ".(${i}+1)." / $pages: Page was not exported as a tesseract supported format -- not OCRing\n" if $DEBUG;
  433 + exit 0;
  434 + }
366 435  
367 436 foreach my $image (@images) {
368 437 print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG;
... ... @@ -378,43 +447,65 @@ sub ocr {
378 447 print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n";
379 448 }
380 449 }
381   -
382   - # Check if page was rotated
383   - if ($pg_r[$i]) {
384   - print "\t\t\t${image} unrotate: $pg_r[$i] graus ".(${i}+1)." / $pages\n" if $DEBUG;
385   - ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate $pg_r[$i] \"$image\"");
  450 +
  451 + # Check if page was rotated and extracted with pdftoppm
  452 + if ($cmd =~ /\Q$PDFTOPPM/ && $pg_r[$i]) {
  453 + print "\t\t\t${image} unrotate: -$pg_r[$i] degs ".(${i}+1)." / $pages\n" if $DEBUG;
  454 + ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate ". (360 - $pg_r[$i])." \"$image\"");
386 455 if ($DEBUG) {
387 456 print "\t\t\t${image} -> $cmd: $exit\n";
388 457 print "\t\t\t\t$_" for @out ;
389 458 print "\t\t\t\t$_" for @err ;
390 459 };
391 460 }
392   -
  461 +
393 462 # Filter ppm images, if needed
394 463  
395 464 # OCR ppm images to pdf pages
396   - ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} \"${image}\" \"${image}\" pdf");
  465 + ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} -l por+eng \"${image}\" \"${image}\" pdf");
397 466 if ($DEBUG) {
398 467 print "\t\t\t${image} -> $cmd: $exit\n";
399 468 print "\t\t\t\t$_" for @out ;
400 469 print "\t\t\t\t$_" for @err ;
401 470 };
  471 + unlink ("$image") if (!$DEBUG);
402 472  
403   - # Scale to fit pdf
404   - ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf");
  473 + # Scale, crop and rotate to fit pdf
  474 + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf");
405 475 if ($DEBUG) {
406 476 print "\t\t\t${image} -> $cmd: $exit\n";
407 477 print "\t\t\t\t$_" for @out ;
408 478 print "\t\t\t\t$_" for @err ;
409 479 };
  480 + unlink ("$image.pdf") if (!$DEBUG);
410 481  
  482 + if (defined $pg_crop_x1[$i]) {
  483 + # adjust cropbox
  484 + ($pg_crop_x1[$i], $pg_crop_y1[$i],$pg_crop_x2[$i],$pg_crop_y2[$i]) = (
  485 + ($pg_crop_x1[$i]<$pg_crop_x2[$i]?$pg_crop_x1[$i]:$pg_crop_x2[$i]),
  486 + ($pg_crop_y1[$i]<$pg_crop_y2[$i]?$pg_crop_y1[$i]:$pg_crop_y2[$i]),
  487 + abs($pg_crop_x2[$i]-$pg_crop_x1[$i]),abs($pg_crop_y2[$i]- $pg_crop_y1[$i])
  488 + );
  489 +
  490 + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -crop \"$pg_crop_x1[$i] $pg_crop_y1[$i] $pg_crop_x2[$i] $pg_crop_y2[$i]\" \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf");
  491 + if ($DEBUG) {
  492 + print "\t\t\t${image} -> $cmd: $exit\n";
  493 + print "\t\t\t\t$_" for @out ;
  494 + print "\t\t\t\t$_" for @err ;
  495 + };
  496 + }
  497 +
  498 + if ($pg_r[$i]) {
  499 + ($exit,$cmd, @out,@err) = exec_cmd( "${CPDF} -rotate $pg_r[$i] \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf");
  500 + if ($DEBUG) {
  501 + print "\t\t\t${image} -> $cmd: $exit\n";
  502 + print "\t\t\t\t$_" for @out ;
  503 + print "\t\t\t\t$_" for @err ;
  504 + };
  505 + }
411 506  
412   - unlink ("${tmpdir}/${pg}.pdf") if (!$DEBUG);
413   - unlink ("$image.pdf") if (!$DEBUG);
414   - move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}.pdf.old") if ($DEBUG);
415   - unlink ("$image") if (!$DEBUG);
416 507 }
417   - exit 0;
  508 + exit 1;
418 509 }
419 510 }
420 511  
... ... @@ -427,28 +518,51 @@ sub ocr {
427 518  
428 519 if (scalar @new_pages != $pages) {
429 520 print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG);
430   - syslog ("info","OCR: $in_file, number of output pages differ") if (!$DEBUG);
  521 + syslog ("err","OCR: $in_file, number of output pages differ") if (!$DEBUG);
431 522 unlink "$in_file.$host.tmp";
  523 + make_path ($error_path) if ( ! -d $error_path);
432 524 move ("$in_file.$host.processing", $error_file);
433   - exit (0);
  525 + exit (1);
434 526 }
435 527  
436   - # Merge resulting pdf pages to a single pdf
  528 + # Merge resulting pdf pages to a single pdf, convert to PDF/A and copy to output
437 529 make_path ($out_path) if ( ! -d $out_path);
438 530 unlink $out_file if ( -f $out_file );
439   - ($exit, $cmd, @out,@err) = exec_cmd("${PDFTK} \"${tmpdir}\"/pg_*-cpdf.pdf cat output \"${out_file}.tmp\" compress");
  531 +
  532 + chdir (${tmpdir});
  533 + ($exit, $cmd, @out,@err) = exec_cmd("${GS} -dQUIET -dBATCH -dNOPAUSE -dNOINTERPOLATE -dCompatibilityLevel=1.7 -dNumRenderingThreads=${MAX_PGS} -sDEVICE=pdfwrite -dAutoRotatePages=/None -sColorConversionStrategy=/RGB -sProcessColorModel=DeviceRGB -dAutoFilterColorImages=true -dAutoFilterGrayImages=true -dJPEGQ=95 -dPDFA=2 -dPDFACompatibilityPolicy=1 -sOutputFile=\"${tmp_file}\" pg_*-cpdf.pdf ");
440 534 if ($DEBUG) {
441 535 print "\t\t${out_file} -> $cmd: $exit\n";
442 536 print "\t\t\t$_" for @out ;
443 537 print "\t\t\t$_" for @err ;
444 538 };
  539 + if ($exit) {
  540 + unlink "$in_file.$host.tmp";
  541 + unlink $out_file;
  542 + make_path ($error_path) if ( ! -d $error_path);
  543 + move ("$in_file.$host.processing", $error_file);
  544 + print "\t\t${out_file} -> Error concatenating pages and converting to PDF/A (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG);
  545 + syslog ("err","OCR: $in_file, error concatenating pages and converting to PDF/A") if (!$DEBUG);
  546 + exit (1);
  547 + }
  548 + chdir ("/");
  549 +
  550 + if (!copy (${tmp_file}, $out_file)) {
  551 + remove_tree ($tmpdir,{ error=> \my $dumb });
  552 + unlink ("$in_file.$host.tmp");
  553 + unlink $out_file;
  554 + make_path ($error_path) if ( ! -d $error_path);
  555 + move ("$in_file.$host.processing", $error_file);
  556 + print "Error: cannot copy temp file to $out_file \n" if $DEBUG;
  557 + syslog ("error","cannot copy temp file to $out_file") if !$DEBUG;
  558 + exit 1;
  559 + };
445 560  
446 561 make_path ($proc_path) if ( ! -d $proc_path);
447 562 unlink $proc_file if ( -f $proc_file );
448 563 move ("$in_file.$host.processing", $proc_file);
449 564 move ("${out_file}.tmp", ${out_file});
450 565  
451   -
452 566 # Remove temp dir
453 567 remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG);
454 568 unlink $tmp_file if (!$DEBUG);
... ... @@ -471,7 +585,7 @@ sub is_ocred {
471 585 }
472 586  
473 587 sub get_pages {
474   - my ($in_file, $w, $h, $r) = @_;
  588 + my ($in_file, $w, $h, $r, $x1, $y1, $x2, $y2) = @_;
475 589  
476 590 my $pages=0;
477 591 my $i=0;
... ... @@ -485,29 +599,35 @@ sub get_pages {
485 599 ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ );
486 600 ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ );
487 601 ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ );
488   - ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ );
  602 + ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ );
  603 + ($dumb, @$x1[$i-1], @$y1[$i-1], @$x2[$i-1], @$y2[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaCropRect:/ );
489 604 }
490 605  
491 606 return $pages;
492 607 }
493 608  
494 609 sub get_imgs {
495   - my ($in_file, $page_img, $w, $h, $t) = @_;
496   - my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc);
  610 + my ($in_file, $page_img, $w, $h, $t, $x_ppi, $y_ppi) = @_;
  611 + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi );
497 612  
498 613 my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\"");
  614 + $i = 0;
499 615  
500 616 foreach my $line (@lines) {
501 617 chomp $line;
502 618 $line =~ s/^ {1,}//;
503   - if ( $line =~ /image|mask/ ) {
504   - ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line;
  619 + if ( $line !~ /^page|^----/ ) {
  620 + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi) = split / {1,}/,$line;
505 621 @$page_img[$page-1]=$i;
506 622 @$w[$page-1] = $width;
507 623 @$h[$page-1] = $height;
508 624 @$t[$page-1] = "rgb"; # Default is color
509   - @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]);
510 625 @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]);
  626 + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]);
  627 + @$t[$page-1] = ( $type eq "stencil" ? $type : @$t[$page-1]);
  628 + @$t[$page-1] = ( $enc eq "image" ? $enc : @$t[$page-1]);
  629 + @$x_ppi[$page-1] = $xppi;
  630 + @$y_ppi[$page-1] = $yppi;
511 631 }
512 632 }
513 633 return $i+1;
... ... @@ -542,6 +662,19 @@ sub get_res {
542 662 return ($res_x,$res_y);
543 663 }
544 664  
  665 +sub get_sign {  # Returns 1 when the pdfsig output contains a line starting with "Signature", otherwise 0
  666 + my ($in_file) = @_;  # path of the PDF file to inspect
  667 + my @lines = `${PDFSIG} \"${in_file}\" 2>/dev/null`;  # run pdfsig through the shell; its stderr is discarded
  668 +
  669 + foreach (@lines) {
  670 + chomp;
  671 + if ( $_ =~ /^Signature/ ) {  # the first matching line is enough to decide
  672 + return 1;
  673 + }
  674 + }
  675 + return 0;  # no matching line (also the case when the command printed nothing)
  676 +}
  677 +
545 678 sub is_locked_ex {
546 679 my ($path) = @_;
547 680  
... ...
workflow.pdf
No preview for this file type
workflow.vsd
No preview for this file type