Commit f02dd8e43c99cc7ed8312d8ace23cc29f208aebb
Exists in
master
Merge branch 'Pre_versao_2.0' into 'master'
Pre versao 2.0 final para gerar Tag 2.0 Final See merge request !4
Showing
7 changed files
with
583 additions
and
272 deletions
Show diff stats
| ... | ... | @@ -0,0 +1,103 @@ |
| 1 | + | |
| 2 | +FROM ubuntu:14.04 | |
| 3 | + | |
| 4 | +# Copy the OCR-SERVER project files into the image | |
| 5 | +COPY usr/local/bin/ocr /usr/local/bin/ocr | |
| 6 | +COPY etc/init.d/ocr-ubuntu /etc/init.d/ocr | |
| 7 | +COPY entrypoint.sh /entrypoint.sh | |
| 8 | + | |
| 9 | +WORKDIR /tmp | |
| 10 | + | |
| 11 | +# Install the build and runtime packages that ocr-server 2 depends on | |
| 12 | +RUN apt-get -y update && \ | |
| 13 | + apt-get -y install build-essential cmake libtool yasm pkg-config subversion git libgcj14 apt-utils \ | |
| 14 | + curl libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev \ | |
| 15 | + zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libnss3-dev \ | |
| 16 | + wget cabextract xfonts-utils perl automake autoconf-archive libcurl4-gnutls-dev unzip \ | |
| 17 | + libfile-find-rule-perl libfile-find-rule-perl-perl imagemagick gettext unpaper libtiff5 libpng12-0 \ | |
| 18 | + libjpeg-turbo8 libpango1.0-0 libcairo2 fontconfig libwebp5 libfontconfig1 libgettextpo0 gcc gcj-jdk \ | |
| 19 | + rsyslog libsys-syslog-perl && \ | |
| 20 | + apt-get -y clean all | |
| 21 | + | |
| 22 | +RUN wget -O mscorefonts.deb http://ftp.us.debian.org/debian/pool/contrib/m/msttcorefonts/ttf-mscorefonts-installer_3.4+nmu1_all.deb && \ | |
| 23 | + dpkg -i mscorefonts.deb && \ | |
| 24 | + rm mscorefonts.deb | |
| 25 | + | |
| 26 | +# Install the Perl modules required by the ocr script, one RUN per module so | |
| 27 | +# that each CPAN install is cached as its own image layer | |
| 28 | +RUN perl -MCPAN -e 'install File::Find::Rule;' | |
| 29 | +RUN perl -MCPAN -e 'install File::Touch;' | |
| 30 | +RUN perl -MCPAN -e 'install Sys::Syslog;' | |
| 31 | +RUN perl -MCPAN -e 'install IPC::Open3;' | |
| 32 | +RUN perl -MCPAN -e 'install IO::Select;' | |
| 33 | + | |
| 34 | +# Tesseract-ocr 3.05, with English and Portuguese dictionaries | |
| 35 | +# Tesseract dependency: Leptonica | |
| 36 | +RUN git clone https://github.com/DanBloomberg/leptonica.git && \ | |
| 37 | + cd leptonica && ./autobuild && ./configure && make all install && \ | |
| 38 | + rm -rf ../leptonica | |
| 39 | + | |
| 40 | +# Tesseract dependency: Libav | |
| 41 | +RUN git clone https://github.com/libav/libav.git && \ | |
| 42 | + export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ && \ | |
| 43 | + cd libav && ./configure --enable-sram && make all install && \ | |
| 44 | + rm -rf ../libav | |
| 45 | + | |
| 46 | +# Tesseract (NOTE(review): this clones the default branch, not a pinned 3.05.01 tag -- confirm) | |
| 47 | +RUN git clone https://github.com/tesseract-ocr/tesseract.git && \ | |
| 48 | + cd tesseract && ./autogen.sh && ./configure && make all install && \ | |
| 49 | + rm -rf ../tesseract | |
| 50 | + | |
| 51 | +RUN wget "https://github.com/tesseract-ocr/tessdata/blob/master/eng.traineddata?raw=true" -O /usr/local/share/tessdata/eng.traineddata && \ | |
| 52 | + wget "https://github.com/tesseract-ocr/tessdata/blob/master/por.traineddata?raw=true" -O /usr/local/share/tessdata/por.traineddata && \ | |
| 53 | + wget "https://github.com/tesseract-ocr/tessdata/blob/master/osd.traineddata?raw=true" -O /usr/local/share/tessdata/osd.traineddata | |
| 54 | + | |
| 55 | +# Poppler 0.56 | |
| 56 | +RUN git clone -b poppler-0.56 https://anongit.freedesktop.org/git/poppler/poppler.git && \ | |
| 57 | + cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && \ | |
| 58 | + rm -rf ../poppler | |
| 59 | + | |
| 60 | +# pdftk, version 2.02 or later (the build runs two levels below /tmp, so clean up by absolute path) | |
| 61 | +RUN wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip && \ | |
| 62 | + unzip pdftk-2.02-src.zip && rm -f pdftk-2.02-src.zip && \ | |
| 63 | + cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && \ | |
| 64 | + rm -rf /tmp/pdftk-2.02-dist | |
| 65 | + | |
| 66 | +# Ghostscript 9.18 or later | |
| 67 | +RUN wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.18.tar.gz && \ | |
| 68 | + tar xvozf ghostscript-9.18.tar.gz && rm -f ghostscript-9.18.tar.gz && \ | |
| 69 | + cd ghostscript-9.18 && ls && ./autogen.sh; ./configure && make all install && \ | |
| 70 | + rm -rf ../ghostscript-9.18 | |
| 71 | + | |
| 72 | +# CPDF, prebuilt Linux Intel 64-bit binary; the cloned repository is removed after the copy | |
| 73 | +RUN git clone https://github.com/coherentgraphics/cpdf-binaries.git && \ | |
| 74 | + cp cpdf-binaries/Linux-Intel-64bit/cpdf /usr/bin && rm -rf cpdf-binaries | |
| 75 | + | |
| 76 | +# Refresh the dynamic linker (ld) configuration after the source installs above | |
| 77 | +RUN ldconfig | |
| 78 | + | |
| 79 | +RUN useradd -m ocr | |
| 80 | + | |
| 81 | +RUN chmod +x /usr/local/bin/ocr && \ | |
| 82 | + chmod +x /etc/init.d/ocr && \ | |
| 83 | + update-rc.d ocr defaults | |
| 84 | + | |
| 85 | +RUN mkdir /var/ocr-server/ && \ | |
| 86 | + mkdir -p /var/ocr-server/Entrada && \ | |
| 87 | + mkdir -p /var/ocr-server/Saida && \ | |
| 88 | + mkdir -p /var/ocr-server/Originais_Processados && \ | |
| 89 | + mkdir -p /var/ocr-server/Erro && \ | |
| 90 | + chmod +x /entrypoint.sh | |
| 91 | + | |
| 92 | +RUN mkdir -p /tmp/ocr_dev/ && \ | |
| 93 | + mkdir -p /tmp/ocr_dev/Entrada && \ | |
| 94 | + mkdir -p /tmp/ocr_dev/Saida && \ | |
| 95 | + mkdir -p /tmp/ocr_dev/Originais_Processados && \ | |
| 96 | + mkdir -p /tmp/ocr_dev/Erro && \ | |
| 97 | + chmod -R 777 /tmp/ocr_dev | |
| 98 | + | |
| 99 | +WORKDIR / | |
| 100 | + | |
| 101 | +VOLUME /var/ocr-server/ | |
| 102 | + | |
| 103 | +CMD ["bash", "/entrypoint.sh"] | |
| 0 | 104 | \ No newline at end of file | ... | ... |
INSTALL.txt
| ... | ... | @@ -1,202 +0,0 @@ |
| 1 | -# OCR Server 1.0.4b - (c) Agencia Nacional de Telecomunicacoees | |
| 2 | -# | |
| 3 | -# This script monitors a set of input directories for PDF files | |
| 4 | -# once a new file is detected, it is processes through tesseract OCR | |
| 5 | -# in order to generate a new file with a hidden searchable text layer | |
| 6 | -# | |
| 7 | -# It may be distributed under the conditions of the LGPL v2.1 license. | |
| 8 | -# | |
| 9 | -# Author: Guilherme Chehab | |
| 10 | -# | |
| 11 | -# Version History: | |
| 12 | -# 0.1 Initial single server version | |
| 13 | -# 0.2 Check if page already has the html hidden layer, if so, ignore it | |
| 14 | -# 0.3 Solved issues about various image enconding types | |
| 15 | -# 0.4 Added a postnormalization step to ensure all output pdf pages have | |
| 16 | -# the same size and orientations as the original files | |
| 17 | -# 0.5 Used input file renaming as a way to sync multiple parallel instances, | |
| 18 | -# that way, it is minimized the risk of same file being OCRed multiple times. | |
| 19 | -# 0.6 Added a default handler for unknown image encoding using jpeg encoding | |
| 20 | -# 0.7 Solved an issue with files with more than 1000 pages | |
| 21 | -# 1.0 First release version | |
| 22 | -# 1.0.1 Solving error when file has no images | |
| 23 | -# 1.0.2 Fix bug when counting cores for AMD processors | |
| 24 | -# 1.0.3 Added better image type detection | |
| 25 | -# 1.0.4 Fix: added ubuntu init script | |
| 26 | -# 1.0.4b Centos 6.9 | |
| 27 | -# | |
| 28 | -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it | |
| 29 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | |
| 30 | -# diferently but does not treat it adequately | |
| 31 | -# - Review poppler and cpdf install instructions | |
| 32 | -# - Add better handling of vectorized and non scanned pdf files | |
| 33 | -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | |
| 34 | -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible | |
| 35 | -# | |
| 36 | -# Check software requirements on the comments bellow | |
| 37 | -# | |
| 38 | -# To configure input dirs change @BASE_DIRS and @SUB_DIRS variables | |
| 39 | -# | |
| 40 | -# | |
| 41 | -# O servidor OCR depende dos seguintes componentes: | |
| 42 | -# - Perl 5.10.1, com seguintes módulos: | |
| 43 | -# - File::Find::Rule | |
| 44 | -# - File::Basename | |
| 45 | -# - File::Copy | |
| 46 | -# - File::Path | |
| 47 | -# - File::Touch | |
| 48 | -# - Sys::Syslog | |
| 49 | -# - Sys::Hostname | |
| 50 | -# - IPC::Open3 | |
| 51 | -# - IO::Select | |
| 52 | -# - POSIX | |
| 53 | -# - Tesseract-ocr 3.05, com dicionários inglês e português | |
| 54 | -# - Pdftk 2.02 | |
| 55 | -# - Poppler-utils 0.42.0 | |
| 56 | -# - Cpdf 2.1 | |
| 57 | -# - ImageMagick 6.7.2-7 | |
| 58 | -# | |
| 59 | -# Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes podem comprometer o correto funcionamento do sistema | |
| 60 | -# Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento. | |
| 61 | -# Esse arquivo contem informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes. | |
| 62 | -# | |
| 63 | -## ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado. | |
| 64 | -# | |
| 65 | -# Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr': | |
| 66 | -# | |
| 67 | -# @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script | |
| 68 | -# @SUB_DIRS: Subdiretórios de entrada, saída, backup do arquivos originais, temporário e de arquivos com erro | |
| 69 | -# $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2) | |
| 70 | -# $MAX_PGS: Número máximo de páginas que podem ser processadas simultanemante por arquivo de entrada (default: no. de CPUs) | |
| 71 | -# Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS. | |
| 72 | -# Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos. | |
| 73 | -# A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página. | |
| 74 | -# | |
| 75 | -# Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS. | |
| 76 | -# | |
| 77 | -# | |
| 78 | -# ----------------------- COMPILAÇÃO dos pré requisitos (obs.: os comandos de devem ser executados como root) | |
| 79 | -# | |
| 80 | -# | |
| 81 | -# Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS | |
| 82 | -# | |
| 83 | -# RedHat 6.7 e Centos 6.9: | |
| 84 | -yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip | |
| 85 | -yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel | |
| 86 | -cd /tmp | |
| 87 | -wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm | |
| 88 | -rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm | |
| 89 | -rm -f msttcore-fonts-2.0-3.noarch.rpm | |
| 90 | - | |
| 91 | -# Centos 6.9 | |
| 92 | -# \_ autoconf-archive | |
| 93 | -wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm | |
| 94 | -rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm | |
| 95 | -rm autoconf-archive-2012.04.07-7.3.noarch.rpm | |
| 96 | -# \_ GCC 4.8 | |
| 97 | -wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo | |
| 98 | -yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj | |
| 99 | - | |
| 100 | -# Ubuntu 14.04 Server: | |
| 101 | -apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 | |
| 102 | -apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev | |
| 103 | -apt-get install ttf-mscorefonts-installer | |
| 104 | - | |
| 105 | -# Ambas plataformas: | |
| 106 | -cd /usr/local/src | |
| 107 | - | |
| 108 | -for i in \ | |
| 109 | - https://github.com/tesseract-ocr/langdata.git \ | |
| 110 | - https://github.com/DanBloomberg/leptonica.git \ | |
| 111 | - https://github.com/libav/libav.git \ | |
| 112 | - https://github.com/tesseract-ocr/tessdata.git \ | |
| 113 | - https://github.com/tesseract-ocr/tesseract.git \ | |
| 114 | - git://git.freedesktop.org/git/poppler/poppler.git \ | |
| 115 | - git://git.freedesktop.org/git/poppler/test.git \ | |
| 116 | - https://github.com/Flameeyes/unpaper.git \ | |
| 117 | - https://github.com/ocaml/ocaml.git \ | |
| 118 | - https://gitlab.camlcity.org/gerd/lib-findlib.git \ | |
| 119 | - https://github.com/johnwhitington/camlpdf.git \ | |
| 120 | - https://github.com/johnwhitington/cpdf-source.git \ | |
| 121 | -; do git clone $i; done | |
| 122 | - | |
| 123 | -wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip | |
| 124 | -unzip pdftk-2.02-src.zip | |
| 125 | -rm -f pdftk-2.02-src.zip | |
| 126 | - | |
| 127 | -# pdftk, versão 2.02 ou superior | |
| 128 | -cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../.. | |
| 129 | - | |
| 130 | -# Centos 6.9 | |
| 131 | -# \_ Cria um novo shell usando o GCC 4.8 por default | |
| 132 | -scl enable devtoolset-2 bash | |
| 133 | - | |
| 134 | -# Tesseract, versão 3.05-dev ou superior | |
| 135 | -# Bibliotecas para o Tesseract: Leptonica e Libav | |
| 136 | -cd leptonica && ./autobuild && ./configure && make all install && cd .. | |
| 137 | - | |
| 138 | -# Para compilação do Tesseract após a compilação do leptonica | |
| 139 | -export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ | |
| 140 | - | |
| 141 | -cd libav && ./configure --enable-sram && make all install && cd .. | |
| 142 | - | |
| 143 | -# Tesseract | |
| 144 | -cd tesseract && ./autogen.sh && ./configure && make all install && cd .. | |
| 145 | -cp -avR tessdata/* /usr/local/share/tessdata/ | |
| 146 | - | |
| 147 | -# cpdf, versão 2.1 ou superior | |
| 148 | -cd ocaml && ./configure && make world.opt && make install && cd .. | |
| 149 | -mkdir -p /usr/local/man/man5 | |
| 150 | -# lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente | |
| 151 | -cd lib-findlib && ./configure && make all && make install && cd .. | |
| 152 | -cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd .. | |
| 153 | -cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd .. | |
| 154 | - | |
| 155 | -# poppler-utils, versão 0.42.0 ou superior | |
| 156 | -cd poppler && ./autogen.sh && ./configure && make all install && cd .. | |
| 157 | - | |
| 158 | -# Centos 6.9 | |
| 159 | -# \_ Termina o shell usando o GCC 4.8 por default | |
| 160 | -exit | |
| 161 | - | |
| 162 | -# ----------------------- INSTALAÇÃO (obs.: os comandos de devem ser executados como root) | |
| 163 | - | |
| 164 | -## Comandos adicionais para configuração do módulo: | |
| 165 | - | |
| 166 | -# Criação do usuário | |
| 167 | -adduser ocr | |
| 168 | - | |
| 169 | -# Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional | |
| 170 | -cp ./usr/local/bin/ocr /usr/local/bin | |
| 171 | - | |
| 172 | -# Auto start (RedHat 6.7 e CentOs 6.9) | |
| 173 | -cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr | |
| 174 | -mv /etc | |
| 175 | -chkconfig --add ocr | |
| 176 | -chkconfig --level 2345 ocr on | |
| 177 | - | |
| 178 | -# Auto start (Ubuntu 14.04) | |
| 179 | -cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr | |
| 180 | -update-rd.d ocr defaults | |
| 181 | - | |
| 182 | -# Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações | |
| 183 | -cd /home/ocr | |
| 184 | -tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr | |
| 185 | -su | |
| 186 | - | |
| 187 | -# Copie o pacote para os outros servidores e extraia com: | |
| 188 | -cd / | |
| 189 | -tar xovzf pkg-ocr.tgz | |
| 190 | - | |
| 191 | -# Instalando pré-requisitos RUNTIME em servidores adicionais | |
| 192 | - | |
| 193 | -# Redhat 6.7 e CentOS 6.9 | |
| 194 | -yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp | |
| 195 | -yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext | |
| 196 | - | |
| 197 | -# Ubuntu 14.04 | |
| 198 | -apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 | |
| 199 | -apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 | |
| 200 | - | |
| 201 | -# Inicie o serviço com | |
| 202 | -service ocr start |
| ... | ... | @@ -0,0 +1,260 @@ |
| 1 | +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes | |
| 2 | + | |
| 3 | +This script monitors a set of input directories for PDF files. Once a new file is detected, it is processed through Tesseract OCR in order to generate a new file with a hidden, searchable text layer. | |
| 4 | + | |
| 5 | +It may be distributed under the conditions of the LGPL v2.1 license. | |
| 6 | + | |
| 7 | +Author: Guilherme Chehab | |
| 8 | + | |
| 9 | +## Version History: | |
| 10 | + - 0.1 | |
| 11 | + - Initial single server version | |
| 12 | + - 0.2 | |
| 13 | + - Check if page already has the html hidden layer, if so, ignore it | |
| 14 | + - 0.3 | |
| 15 | + - Solved issues with various image encoding types | |
| 16 | + - 0.4 | |
| 17 | + - Added a postnormalization step to ensure all output pdf pages have the same size and orientations as the original files | |
| 18 | + - 0.5 | |
| 19 | + - Used input file renaming as a way to sync multiple parallel instances, which minimizes the risk of the same file being OCRed multiple times. | |
| 20 | + - 0.6 | |
| 21 | + - Added a default handler for unknown image encoding using jpeg encoding | |
| 22 | + - 0.7 | |
| 23 | + - Solved an issue with files with more than 1000 pages | |
| 24 | + - 1.0 | |
| 25 | + - First release version | |
| 26 | + - 1.0.1 Solving error when file has no images | |
| 27 | + - 1.0.2 Fix bug when counting cores for AMD processors | |
| 28 | + - 1.0.3 Added better image type detection | |
| 29 | + - 1.0.4 Fix: added ubuntu init script | |
| 30 | + - 1.0.4b Add Centos 6.9 install instructions | |
| 31 | + - 2.0 | |
| 32 | + - PDF/A output, and better compression with ghostscript | |
| 33 | + - Rewrote the image extraction, processing and transformation process | |
| 34 | + - Check whether the input file is signed; if so, do not change the file contents | |
| 35 | + - Added '-oem 0' option to tesseract (force legacy mode on tesseract 4) | |
| 36 | + - Use operating system packages by default | |
| 37 | + - Changed paths from external programs, instead of using full paths, uses first match from $PATH | |
| 38 | + - Check existence of external programs on path before running | |
| 39 | + - Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings | |
| 40 | + - Fix: create subpaths on error folder | |
| 41 | + - Fix: trying to reduce overhead on temporary folder | |
| 42 | + | |
| 43 | +## TODO: | |
| 44 | + - Change get_imgs and OCR processing to support pages with more than one image -- this did not work on previous versions, which assumed #pages = #imgs. Version 1.0.1 counts them differently but does not handle it adequately -- this will require better handling of the PDF's internal structure | |
| 45 | + - Review poppler and cpdf install instructions | |
| 46 | + - Add better handling of vectorized and non scanned pdf files | |
| 47 | + - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with the current scaling, cropping and rotation handlers | |
| 48 | + - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- added function to analyse image color histogram -> just need to add option to convert it to B&W. | |
| 49 | + - Move all parameters to config file | |
| 50 | + - Add some job control web interface | |
| 51 | + - Add end user interface to submit files through web | |
| 52 | + - Add check external programs version requirements before running | |
| 53 | + | |
| 54 | +## BUGS: | |
| 55 | + - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than original, this is due to using pdftoppm instead of pdfimages | |
| 56 | + | |
| 57 | +## Requirements: | |
| 58 | + - Perl 5.10.1, com os seguintes módulos: | |
| 59 | + - File::Find::Rule | |
| 60 | + - File::Basename | |
| 61 | + - File::Copy | |
| 62 | + - File::Path | |
| 63 | + - File::Touch | |
| 64 | + - Sys::Syslog | |
| 65 | + - Sys::Hostname | |
| 66 | + - IPC::Open3 | |
| 67 | + - IO::Select | |
| 68 | + - POSIX | |
| 69 | + - Tesseract-ocr 3.05, com dicionários inglês e português | |
| 70 | + - Pdftk 2.02 | |
| 71 | + - Poppler-utils 0.42.0 | |
| 72 | + - Cpdf 2.1 | |
| 73 | + - ImageMagick 6.7.2-7 | |
| 74 | + - Ghostscript 9.18 | |
| 75 | + | |
| 76 | +Na ausência deles na distribuição do sistema operacional, o uso de versões antigas desses componentes pode comprometer o correto funcionamento do sistema. | |
| 77 | + | |
| 78 | +Dessa forma, pode ser necessário compilar os componentes faltantes, assim como as bibliotecas necessárias para o seu correto funcionamento. | |
| 79 | + | |
| 80 | +Esse arquivo contém informações quanto aos procedimentos para instalar e configurar o sistema pressupondo o pior caso, qual seja, a necessidade de compilação dos componentes. | |
| 81 | + | |
| 82 | +ATENÇÃO: se algum componente abaixo não estiver disponível no repositório padrão para o Linux utilizado, deve-se proceder com a compilação da versão mais recente do componente disponibilizado em outros repositórios para que seja instalado no Linux a ser utilizado. | |
| 83 | + | |
| 84 | +### Configure o script, alterando as variáveis no arquivo '/usr/local/bin/ocr': | |
| 85 | + | |
| 86 | +- @BASE_DIRS: Lista de diretórios base para a busca de arquivos --> cada diretório base irá ter sua própria instância do script | |
| 87 | +- @SUB_DIRS: Subdiretórios de entrada, saída, backup dos arquivos originais, temporário e de arquivos com erro | |
| 88 | +- $MAX_FILES: Número máximo de arquivos a serem processados simultaneamente por diretório de entrada (default: 2) | |
| 89 | +- $MAX_PGS: Número máximo de páginas que podem ser processadas simultaneamente por arquivo de entrada (default: no. de CPUs) | |
| 90 | + | |
| 91 | +Essas variáveis controlam o número máximo de instâncias de processos simultâneas = Num. de diretorios X MAX_FILES X MAX_PGS. | |
| 92 | + | |
| 93 | +Recomenda-se que o equipamento tenha em torno de 1,5 GB de RAM para cada core de CPU de forma a evitar swap. Se isso não for possível, pode ser reduzido o número de processos ou arquivos simultâneos. | |
| 94 | + | |
| 95 | +A configuração do servidor pode ser dimensionada com base no tempo desejado para processamento de grandes arquivos (> 100 páginas). Cada página tem sua própria thread de processamento, até o limite de $MAX_PGS, cujo default é o no. de cores de CPU. Em média cada página demora em torno de 18 segundos em uma CPU Xeon E5 4670@2.6GHz. Assim, com 16 CPUs, o desempenho agregado é em torno de 1,2 segundos por página. | |
| 96 | + | |
| 97 | +Para operação multi instância, basta instalar quantos servidores forem necessários e eles podem ter acesso aos mesmos diretórios de entrada que podem ser compartilhamentos SAMBA/CIFS/Windows ou NFS. | |
| 98 | + | |
| 99 | + | |
| 100 | +# Container Docker | |
| 101 | + | |
| 102 | + O OCR-Server também está disponível como um container Docker, permitindo o rápido provisionamento da solução em ambiente de produção. Todos os procedimentos para construção da imagem do container podem ser encontrados no arquivo Dockerfile. | |
| 103 | + | |
| 104 | + Para execução do serviço, basta ter o Docker instalado no servidor e executar o seguinte comando: | |
| 105 | + | |
| 106 | + docker run --name <NOME_CONTAINER> -d -v <DIRETORIO_BASE>:/var/ocr-server guilhermeadc/ocr-server | |
| 107 | + | |
| 108 | + Onde: | |
| 109 | + --name : Nome atribuído à instância do container. Ex: ocr-server | |
| 110 | + -d : Indica que o container deve ser executado em background | |
| 111 | + -v : Diretório de compartilhamento entre o servidor host e o container. | |
| 112 | + O parâmetro <DIRETORIO_BASE> deve ser substituído pelo diretório base para busca de arquivos. | |
| 113 | + | |
| 114 | + Para visualizar os logs de processamento do serviço, basta executar o seguinte comando: | |
| 115 | + docker logs <NOME_CONTAINER> | |
| 116 | + | |
| 117 | + | |
| 118 | +# COMPILAÇÃO dos pré requisitos (obs.: os comandos devem ser executados como root) | |
| 119 | + | |
| 120 | +Em servidor Ubuntu 16.04, os pacotes padrão (com exceção do CPDF, que não tem no repositório oficial) | |
| 121 | +são suficientes para executar o aplicativo, não havendo necessidade de compilar todos, assim é a arquitetura recomendada | |
| 122 | + | |
| 123 | +Quanto ao CPDF, é possível baixar a versão binária em: https://github.com/coherentgraphics/cpdf-binaries | |
| 124 | + | |
| 125 | +## Compilando os pré-requisitos: máquina de COMPILAÇÃO APENAS | |
| 126 | + | |
| 127 | + # RedHat 6.7 e Centos 6.9: | |
| 128 | + yum -y install autoconf make gcc-java gcc gcc-c++ subversion pkg-config automake libtool yasm cmake git libgcj unzip | |
| 129 | + yum -y install libtiff-devel libpng-devel openjpeg-devel libjpeg-turbo-devel giflib-devel libwebp-devel zlib-devel libicu-devel pango-devel cairo-devel fontconfig-devel gettext-devel libcurl-devel nss-devel | |
| 130 | + cd /tmp | |
| 131 | + wget http://www.itzgeek.com/msttcore-fonts-2.0-3.noarch.rpm | |
| 132 | + rpm -Uvh msttcore-fonts-2.0-3.noarch.rpm | |
| 133 | + rm -f msttcore-fonts-2.0-3.noarch.rpm | |
| 134 | + | |
| 135 | + # Centos 6.9 | |
| 136 | + # \_ autoconf-archive | |
| 137 | + wget ftp://ftp.pbone.net/mirror/ftp5.gwdg.de/pub/opensuse/repositories/home:/pelliott11:/autoconf-archive/CentOS_CentOS-6/noarch/autoconf-archive-2012.04.07-7.3.noarch.rpm | |
| 138 | + rpm -i autoconf-archive-2012.04.07-7.3.noarch.rpm | |
| 139 | + rm autoconf-archive-2012.04.07-7.3.noarch.rpm | |
| 140 | + # \_ GCC 4.8 | |
| 141 | + wget http://people.centos.org/tru/devtools-2/devtools-2.repo -O /etc/yum.repos.d/devtools-2.repo | |
| 142 | + yum install devtoolset-2-gcc devtoolset-2-binutils devtoolset-2-gcc-c++ devtoolset-2-gcj | |
| 143 | + | |
| 144 | + # Ubuntu 14.04 Server: | |
| 145 | + apt-get install build-essential cmake libtool yasm pkg-config subversion git libgcj14 | |
| 146 | + apt-get install libtiff-dev libpng-dev libopenjpeg-dev libjpeg8-dev libjpeg-turbo8-dev libjpeg-dev libgif-dev zlib1g-dev libicu-dev libpango1.0-dev libcairo2-dev libfontconfig1-dev libgettextpo-dev libcurl-dev libnss3-dev | |
| 147 | + apt-get install ttf-mscorefonts-installer | |
| 148 | + | |
| 149 | + # Ambas plataformas: | |
| 150 | + cd /usr/local/src | |
| 151 | + | |
| 152 | + for i in \ | |
| 153 | + https://github.com/tesseract-ocr/langdata.git \ | |
| 154 | + https://github.com/DanBloomberg/leptonica.git \ | |
| 155 | + https://github.com/libav/libav.git \ | |
| 156 | + https://github.com/tesseract-ocr/tessdata.git \ | |
| 157 | + https://github.com/tesseract-ocr/tesseract.git \ | |
| 158 | + git://git.freedesktop.org/git/poppler/poppler.git \ | |
| 159 | + git://git.freedesktop.org/git/poppler/test.git \ | |
| 160 | + https://github.com/Flameeyes/unpaper.git \ | |
| 161 | + https://github.com/ocaml/ocaml.git \ | |
| 162 | + https://gitlab.camlcity.org/gerd/lib-findlib.git \ | |
| 163 | + https://github.com/johnwhitington/camlpdf.git \ | |
| 164 | + https://github.com/johnwhitington/cpdf-source.git \ | |
| 165 | + http://git.ghostscript.com/ghostpdl.git \ | |
| 166 | + ; do git clone $i; done | |
| 167 | + | |
| 168 | + wget https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/pdftk-2.02-src.zip | |
| 169 | + unzip pdftk-2.02-src.zip | |
| 170 | + rm -f pdftk-2.02-src.zip | |
| 171 | + | |
| 172 | + # pdftk, versão 2.02 ou superior | |
| 173 | + cd pdftk-2.02-dist/pdftk && make -f Makefile.Redhat all install && cd ../.. | |
| 174 | + | |
| 175 | + # Ghostscript 9.18 ou superior | |
| 176 | + #wget http://downloads.ghostscript.com/public/old-gs-releases/ghostscript-9.21.tar.gz | |
| 177 | + #tar xvozf ghostscript-9.21.tar.gz | |
| 178 | + #rm -f ghostscript-9.21.tar.gz | |
| 179 | + #cd ghostscript-9.21 | |
| 180 | + cd ghostpdl | |
| 181 | + ./autogen.sh; ./configure | |
| 182 | + make all install | |
| 183 | + cd .. | |
| 184 | + | |
| 185 | + # Centos 6.9 | |
| 186 | + # \_ Cria um novo shell usando o GCC 4.8 por default | |
| 187 | + scl enable devtoolset-2 bash | |
| 188 | + | |
| 189 | + # Tesseract, versão 3.05-dev ou superior | |
| 190 | + # Bibliotecas para o Tesseract: Leptonica e Libav | |
| 191 | + cd leptonica && ./autobuild && ./configure && make all install && cd .. | |
| 192 | + | |
| 193 | + # Para compilação do Tesseract após a compilação do leptonica | |
| 194 | + export PKG_CONFIG_PATH=/usr/lib:/usr/local/lib:/usr/local/src/leptonica/ | |
| 195 | + | |
| 196 | + cd libav && ./configure --enable-sram && make all install && cd .. | |
| 197 | + | |
| 198 | + # Tesseract | |
| 199 | + cd tesseract && ./autogen.sh && ./configure && make all install && cd .. | |
| 200 | + cp -avR tessdata/* /usr/local/share/tessdata/ | |
| 201 | + | |
| 202 | + # cpdf, versão 2.1 ou superior | |
| 203 | + cd ocaml && ./configure && make world.opt && make install && cd .. | |
| 204 | + mkdir -p /usr/local/man/man5 | |
| 205 | + # lib-findlib -- pode dar erro na instalação de páginas de man... é seguro ignorar, ou basta criar os diretórios faltantes e tentar novamente | |
| 206 | + cd lib-findlib && ./configure && make all && make install && cd .. | |
| 207 | + cd camlpdf && sed -i.bak s/\(uint32\)/\(uint32_t\)/g flatestubs.c && make && make install && cd .. | |
| 208 | + cd cpdf-source && make all && make install && cp cpdf /usr/local/bin && cd .. | |
| 209 | + | |
| 210 | + # poppler-utils, versão 0.42.0 ou superior | |
| 211 | + cd poppler && ./autogen.sh && ./configure --enable-cmyk --enable-libcurl && make all install && cd .. | |
| 212 | + | |
| 213 | + # Centos 6.9 | |
| 214 | + # \_ Termina o shell usando o GCC 4.8 por default | |
| 215 | + exit | |
| 216 | + | |
| 217 | + | |
| 218 | +## Comandos adicionais para configuração do módulo: | |
| 219 | + | |
| 220 | + # Criação do usuário | |
| 221 | + adduser ocr | |
| 222 | + | |
| 223 | + # Copie os arquivos ocr ocr-* para os diretórios corretos, conforme o sistema operacional | |
| 224 | + cp ./usr/local/bin/ocr /usr/local/bin | |
| 225 | + | |
| 226 | + # Auto start (RedHat 6.7 e CentOs 6.9) | |
| 227 | + cp ./usr/local/etc/init.d/ocr-redhat /etc/init.d/ocr | |
| 228 | + mv /etc | |
| 229 | + chkconfig --add ocr | |
| 230 | + chkconfig --level 2345 ocr on | |
| 231 | + | |
| 232 | + # Auto start (Ubuntu 14.04) | |
| 233 | + cp ./usr/local/etc/init.d/ocr-ubuntu /etc/init.d/ocr | |
| 234 | + update-rc.d ocr defaults | |
| 235 | + | |
| 236 | + # Create pkg -- para instalação em outras máquinas sem a necessidade de novas compilações | |
| 237 | + cd /home/ocr | |
| 238 | + tar cvozf pkg-ocr.tgz /usr/local/bin /usr/local/lib* /usr/local/man/ /usr/local/sbin/ /usr/local/share/ /usr/local/etc /usr/local/include/ /home/ocr/ocr* /etc/init.d/ocr /etc/rc*.d/*ocr | |
| 239 | + su | |
| 240 | + | |
| 241 | +# INSTALAÇÃO (obs.: os comandos devem ser executados como root) | |
| 242 | + # Criação do usuário | |
| 243 | + adduser ocr | |
| 244 | + | |
| 245 | + # Copie o pacote para os outros servidores e extraia com: | |
| 246 | + cd / | |
| 247 | + tar xovzf pkg-ocr.tgz | |
| 248 | + | |
| 249 | + # Instalando pré-requisitos RUNTIME em servidores adicionais | |
| 250 | + | |
| 251 | + # Redhat 6.7 e CentOS 6.9 | |
| 252 | + yum -y install perl-File-Find-Rule-Perl perl-File-Touch libtiff libpng openjpeg-libs libjpeg-turbo giflib zlib libicu pango cairo fontconfig ImageMagick gettext libwebp ghostscript | |
| 253 | + yum -y install libtiff libpng openjpeg libjpeg-turbo giflib libwebp zlib libicu pango cairo fontconfig gettext | |
| 254 | + | |
| 255 | + # Ubuntu 14.04 | |
| 256 | + apt-get install libfile-find-rule-perl libfile-find-rule-perl-perl libtiff5 libpng12-0 libopenjpeg2 libjpeg-turbo8 libgif4 zlib1g libicu52 libpango1.0-0 libcairo2 fontconfig imagemagick gettext libwebp5 # libgcj14 | |
| 257 | + apt-get install libtiff5 libpng12-0 libopenjpeg2 libjpeg8 libjpeg-turbo8 libjpeg8 zlib1g libpango1.0-0 libcairo2 libfontconfig1 libgettextpo0 ghostscript | |
| 258 | + | |
| 259 | +# Inicie o serviço com | |
| 260 | + service ocr start | ... | ... |
| ... | ... | @@ -0,0 +1,17 @@ |
| 1 | +#!/usr/bin/env bash | |
| 2 | + | |
| 3 | +# Inicializa serviço de log | |
| 4 | +/etc/init.d/rsyslog start | |
| 5 | + | |
| 6 | +# Cria estrutura de pastas para monitoramento de arquivos | |
| 7 | +mkdir -p /var/ocr-server/ | |
| 8 | +mkdir -p /var/ocr-server/Entrada | |
| 9 | +mkdir -p /var/ocr-server/Saida | |
| 10 | +mkdir -p /var/ocr-server/Originais_Processados | |
| 11 | +mkdir -p /var/ocr-server/Erro | |
| 12 | +chmod -R 777 /var/ocr-server | |
| 13 | + | |
| 14 | +# Iniciar serviço do OCR-Server | |
| 15 | +service ocr start | |
| 16 | + | |
| 17 | +tail -f /var/log/syslog | |
| 0 | 18 | \ No newline at end of file | ... | ... |
usr/local/bin/ocr
| 1 | -#! /usr/bin/perl -w | |
| 1 | +#!/usr/bin/perl -w | |
| 2 | 2 | # |
| 3 | -# OCR Server 1.0.4 - (c) Agencia Nacional de Telecomunicacoes | |
| 3 | +# OCR Server 2.0 - (c) Agencia Nacional de Telecomunicacoes | |
| 4 | 4 | # |
| 5 | 5 | # This script monitors a set of input directories for PDF files |
| 6 | 6 | # once a new file is detected, it is processed through tesseract OCR |
| ... | ... | @@ -24,15 +24,38 @@ |
| 24 | 24 | # 1.0.1 Solving error when file has no images |
| 25 | 25 | # 1.0.2 Fix bug when counting cores for AMD processors |
| 26 | 26 | # 1.0.3 Added better image type detection |
| 27 | -# 1.0.4 Fix: added ubuntu init script | |
| 27 | +# 1.0.4 Fix: added ubuntu init script | |
| 28 | +# 1.0.4b Add Centos 6.9 install instructions | |
| 29 | +# 2.0 PDF/A output, and better compression with ghostscript --> for this to work, Tesseract 4.0 is | |
| 30 | +# strongly recommended | |
| 31 | +# Rewritten image extraction, processing and transformation process | |
| 32 | +# Check if input file is signed, in this case, does not change the file contents | |
| 33 | +# Added '--oem 0' option to tesseract (force legacy mode on tesseract 4) | |
| 34 | +# Use operating system packages by default | |
| 35 | +# Changed paths of external programs: instead of using full paths, uses the first match from $PATH | |
| 36 | +# Check existence of external programs on path before running | |
| 37 | +# Add support for stencil type and image encoding scans, changed default extraction method for unknown types/encodings | |
| 38 | +# Fix: create subpaths on error folder | |
| 39 | +# Fix: trying to reduce overhead on temporary folder | |
| 28 | 40 | # |
| 29 | 41 | # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it |
| 30 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | |
| 31 | -# diferently but does not treat it adequately | |
| 42 | +# would not work on previous versions that assumed #pages = #imgs. Version 1.0.1 counts them | |
| 43 | +# differently but does not treat it adequately -- shall require better handling of the PDF's internal structure | |
| 32 | 44 | # - Review poppler and cpdf install instructions |
| 33 | 45 | # - Add better handling of vectorized and non scanned pdf files |
| 34 | -# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | |
| 35 | -# - Check mean saturation for addiotional colored images detection and automatically convert to B&W if possible | |
| 46 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) -- harder with current | |
| 47 | +# scaling, cropping and rotation handlers | |
| 48 | +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible -- | |
| 49 | +# added function to analyse image color histogram -> just need to add option to convert it to B&W. | |
| 50 | +# - Move all parameters to config file | |
| 51 | +# - Add some job control web interface | |
| 52 | +# - Add end user interface to submit files through web | |
| 53 | +# - Add check external programs version requirements before running | |
| 54 | +# | |
| 55 | +# BUGS: - When image is of type stencil or encoding image, cropping information is lost, and page is shown different than | |
| 56 | +# original, this is due to using pdftoppm instead of pdfimages | |
| 57 | +# - Although not properly a BUG, in the new version, the addition of a step to convert to PDF/A and other evolutions | |
| 58 | +# significantly increased the time to OCR a page, from a mean time of 1 sec/page to 3 secs/page on a 16-core server | |
| 36 | 59 | # |
| 37 | 60 | # Check software requirements in the comments below |
| 38 | 61 | # |
| ... | ... | @@ -55,7 +78,7 @@ use IPC::Open3; |
| 55 | 78 | use IO::Select; |
| 56 | 79 | |
| 57 | 80 | my $DEBUG = 0; |
| 58 | -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); | |
| 81 | +my $MAX_PGS = ($DEBUG==2 ? 1 : 0 + `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); | |
| 59 | 82 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; |
| 60 | 83 | |
| 61 | 84 | my $USER = 'ocr'; |
| ... | ... | @@ -63,39 +86,48 @@ my $CHECK_COLOR = 0; # If it has to check if image is reaaly colored or if it ca |
| 63 | 86 | |
| 64 | 87 | # Command dependencies |
| 65 | 88 | |
| 66 | -# depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher | |
| 67 | -my $TESSERACT = '/usr/local/bin/tesseract -l por+eng'; | |
| 89 | +# depends on tesseract-ocr and tesseract-ocr-por 3.05-dev or higher -- for pdf/a Tesseract 4.0 is recommended | |
| 90 | +my $TESSERACT = 'tesseract --oem 0'; # if Tesseract => 4.0 | |
| 91 | +#my $TESSERACT = 'tesseract'; # if Tesseract < 4.0 | |
| 68 | 92 | |
| 69 | 93 | # Depends on pdftk 2.02 or higher |
| 70 | -my $PDFTK = '/usr/local/bin/pdftk'; | |
| 94 | +my $PDFTK = 'pdftk'; | |
| 71 | 95 | |
| 72 | 96 | # Depends on poppler-utils 0.42.0 or higher |
| 73 | -#my $PDINFO = '/usr/local/bin/pdfinfo'; | |
| 74 | -my $PDFFONTS = '/usr/local/bin/pdffonts'; | |
| 75 | -my $PDFIMAGES = '/usr/local/bin/pdfimages'; | |
| 76 | -my $PDFTOPPM = '/usr/local/bin/pdftoppm'; | |
| 97 | +my $PDFFONTS = 'pdffonts'; | |
| 98 | +my $PDFIMAGES = 'pdfimages'; | |
| 99 | +my $PDFTOPPM = 'pdftoppm'; | |
| 100 | +my $PDFUNITE = 'pdfunite'; | |
| 101 | +my $PDFSIG = 'pdfsig'; | |
| 77 | 102 | |
| 78 | 103 | # Depends on cpdf 2.1 or higher |
| 79 | -my $CPDF = '/usr/local/bin/cpdf'; | |
| 104 | +my $CPDF = 'cpdf'; | |
| 105 | + | |
| 106 | +# Depends on Ghostscript 9.18 | |
| 107 | +my $GS = 'gs'; | |
| 80 | 108 | |
| 81 | 109 | ## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner |
| 82 | -my $CONVERT = '/usr/bin/convert'; | |
| 110 | +my $CONVERT = 'convert'; | |
| 83 | 111 | |
| 84 | 112 | # If it is needed further filtering |
| 85 | 113 | #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; |
| 86 | 114 | |
| 87 | -my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', | |
| 88 | - '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); | |
| 115 | +#my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', | |
| 116 | +# '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); | |
| 117 | + | |
| 118 | +my @BASE_DIRS = ('/var/ocr-server/'); | |
| 89 | 119 | |
| 90 | 120 | my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' ); |
| 91 | 121 | |
| 92 | 122 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); |
| 93 | -%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); | |
| 123 | +%SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG==2); | |
| 94 | 124 | |
| 95 | 125 | # Safeguard in case cpuinfo has not correctly identified the number of CPUs |
| 96 | 126 | $MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; |
| 97 | 127 | |
| 98 | -$ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; | |
| 128 | +$ENV{'PATH'} = '/usr/local/bin:/usr/bin:/bin'; | |
| 129 | +$ENV{'IFS'} = '\t\n'; | |
| 130 | + | |
| 99 | 131 | my ($host) = split/\./,hostname; |
| 100 | 132 | |
| 101 | 133 | use vars qw/*name *dir *prune/; |
| ... | ... | @@ -107,14 +139,15 @@ sub main; |
| 107 | 139 | sub get_pages; |
| 108 | 140 | sub get_rotation; |
| 109 | 141 | sub get_res; |
| 110 | -sub is_ocred; | |
| 111 | 142 | sub is_locked_ex; |
| 112 | 143 | |
| 113 | 144 | |
| 114 | 145 | my $expr = 'use POSIX qw(setsid)'; |
| 115 | 146 | |
| 116 | 147 | my ($dumb1, $dumb2, $uid) = getpwnam ($USER); |
| 117 | -setuid ($uid) or warn "Cant set uid $uid"; | |
| 148 | +if (defined $uid) { | |
| 149 | + setuid ($uid) or warn "Cant set uid $uid"; | |
| 150 | +} | |
| 118 | 151 | |
| 119 | 152 | $SIG{__DIE__} = 'DEFAULT'; |
| 120 | 153 | $SIG{__WARN__} = \&die_when_called; |
| ... | ... | @@ -126,6 +159,11 @@ if ($@) { |
| 126 | 159 | chdir('/') or die "$0: cannot chdir '/': $!\n"; |
| 127 | 160 | open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n"; |
| 128 | 161 | |
| 162 | +foreach my $exec ( $TESSERACT, $PDFTK, $PDFFONTS, $PDFIMAGES, $PDFSIG, $CPDF, $GS, $CONVERT) { | |
| 163 | + die "Error: $exec not found on path: $ENV{PATH}, check dependencies\n" if ( `which $exec | wc -l ` == 0); | |
| 164 | +} | |
| 165 | + | |
| 166 | + | |
| 129 | 167 | foreach my $DIR (@BASE_DIRS) { |
| 130 | 168 | |
| 131 | 169 | defined(my $pid = fork) or die "$0: cannot fork: $!\n"; |
| ... | ... | @@ -135,7 +173,7 @@ foreach my $DIR (@BASE_DIRS) { |
| 135 | 173 | main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR}); |
| 136 | 174 | exit 0; |
| 137 | 175 | last; |
| 138 | - } | |
| 176 | + } | |
| 139 | 177 | } |
| 140 | 178 | |
| 141 | 179 | exit 0; |
| ... | ... | @@ -157,7 +195,7 @@ sub main { |
| 157 | 195 | # remove .tmp file |
| 158 | 196 | unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) ); |
| 159 | 197 | |
| 160 | - # Rename files that were in 'processig' back | |
| 198 | + # Rename files that were in 'processing' state back | |
| 161 | 199 | foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) { |
| 162 | 200 | my $old_name = $file; |
| 163 | 201 | $old_name =~ s/\.${host}\.processing$//g; |
| ... | ... | @@ -177,12 +215,14 @@ sub main { |
| 177 | 215 | # Main loop |
| 178 | 216 | while ( 1 ) { |
| 179 | 217 | select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced |
| 218 | + | |
| 180 | 219 | $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} )); |
| 181 | 220 | print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in; |
| 182 | 221 | $count = scalar keys %files_in; |
| 183 | - foreach my $file (keys %files_in) { | |
| 184 | 222 | |
| 185 | - next if ( glob ("$file.*.tmp")); | |
| 223 | + foreach my $file (sort { ((-f $a) ? (stat $a)[9] : 0) <=> ((-f $b) ? (stat $b)[9] : 0)} keys %files_in ) { | |
| 224 | + | |
| 225 | + next if ( glob ("\"$file.*.tmp\"")); | |
| 186 | 226 | |
| 187 | 227 | select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds |
| 188 | 228 | next if (!defined $files_in{$file}); # continue only if it is still valid |
| ... | ... | @@ -255,7 +295,7 @@ sub ocr { |
| 255 | 295 | remove_tree ($tmpdir,{ error=> \my $dumb }); |
| 256 | 296 | unlink ("$in_file.$host.tmp"); |
| 257 | 297 | move ( "$in_file.$host.processing", $in_file); |
| 258 | - exit 0; | |
| 298 | + exit 1; | |
| 259 | 299 | }; |
| 260 | 300 | |
| 261 | 301 | my $out_path = $in_path; |
| ... | ... | @@ -271,7 +311,7 @@ sub ocr { |
| 271 | 311 | my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); |
| 272 | 312 | |
| 273 | 313 | print "\twritting to $out_file\n" if $DEBUG; |
| 274 | - | |
| 314 | + | |
| 275 | 315 | my $stime = time; |
| 276 | 316 | my %pids; |
| 277 | 317 | |
| ... | ... | @@ -291,8 +331,26 @@ sub ocr { |
| 291 | 331 | remove_tree ($tmpdir,{ error=> \my $dumb }); |
| 292 | 332 | unlink ("$in_file.$host.tmp"); |
| 293 | 333 | move ( "$in_file.$host.processing", $in_file); |
| 334 | + print "Error: cannot copy $in_file to temp dir \n" if $DEBUG; | |
| 335 | + syslog ("error","cannot copy $in_file to temp dir") if !$DEBUG; | |
| 336 | + exit 1; | |
| 294 | 337 | }; |
| 295 | 338 | |
| 339 | + # Check if file was signed | |
| 340 | + if (get_sign($tmp_file)) { | |
| 341 | + if (!copy ("$in_file.$host.processing", $proc_file)) { | |
| 342 | + remove_tree ($tmpdir,{ error=> \my $dumb }); | |
| 343 | + unlink ("$in_file.$host.tmp"); | |
| 344 | + move ( "$in_file.$host.processing", $in_file); | |
| 345 | + }; | |
| 346 | + move ("$in_file.$host.processing", $out_file); | |
| 347 | + unlink ("$in_file.$host.tmp"); | |
| 348 | + print "OCR processed: $in_file not OCRed due to having a signature within" if $DEBUG; | |
| 349 | + syslog ("info","OCR processed: $in_file not OCRed due to having a signature within") if !$DEBUG; | |
| 350 | + | |
| 351 | + exit 0; | |
| 352 | + } | |
| 353 | + | |
| 296 | 354 | # Extract pages |
| 297 | 355 | ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf"); |
| 298 | 356 | if ($DEBUG) { |
| ... | ... | @@ -301,12 +359,13 @@ sub ocr { |
| 301 | 359 | print "\t\t\t$_" for @err ; |
| 302 | 360 | }; |
| 303 | 361 | |
| 362 | + my ($pages, @pg_w, @pg_h, @pg_r, @pg_crop_x1, @pg_crop_y1, @pg_crop_x2, @pg_crop_y2); | |
| 363 | + $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r, \@pg_crop_x1, \@pg_crop_y1, \@pg_crop_x2, \@pg_crop_y2); | |
| 304 | 364 | |
| 305 | - my ($pages, @pg_w, @pg_h, @pg_r); | |
| 306 | - $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r); | |
| 365 | + my ($imgs,@page_img, @img_w, @img_h, @img_t, @img_xppi, @img_yppi); | |
| 366 | + $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t, \@img_xppi, \@img_yppi); | |
| 307 | 367 | |
| 308 | - my ($imgs,@page_img, @img_w, @img_h, @img_t); | |
| 309 | - $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t); | |
| 368 | + unlink ($tmp_file) if (!$DEBUG); | |
| 310 | 369 | |
| 311 | 370 | for ( my $i=0; $i< $pages; $i++ ) { |
| 312 | 371 | my $pg = sprintf ("pg_%06d", $i+1); |
| ... | ... | @@ -333,25 +392,29 @@ sub ocr { |
| 333 | 392 | if (! defined $img_t[$i] ) { |
| 334 | 393 | move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); |
| 335 | 394 | print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG; |
| 336 | - exit 0; | |
| 395 | + exit -1; | |
| 337 | 396 | } |
| 338 | 397 | |
| 339 | - print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i]\n" if $DEBUG; | |
| 398 | + print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i] " if $DEBUG; | |
| 399 | + print "(cropbox: $pg_crop_x1[$i] x $pg_crop_y1[$i] - $pg_crop_x2[$i] x $pg_crop_y2[$i])\n" if (defined $pg_crop_x1[$i] && $DEBUG); | |
| 400 | + print "\n" if ($DEBUG); | |
| 340 | 401 | |
| 402 | + # Extract images from page; the tool and output format (JPEG or TIFF) are chosen below according to the detected image type | |
| 341 | 403 | undef $cmd; |
| 342 | 404 | |
| 343 | - if ($img_t[$i] eq "gray") { | |
| 344 | - $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
| 405 | + # Use PDFIMAGES and JPEG by default | |
| 406 | + $cmd = "${PDFIMAGES} -j \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
| 407 | + | |
| 408 | + if ($img_t[$i] eq "stencil") { | |
| 409 | + $cmd = "${PDFTOPPM} -tiff -tiffcompression deflate -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
| 345 | 410 | } |
| 346 | 411 | |
| 347 | - if ($img_t[$i] eq "rgb") { | |
| 348 | - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
| 349 | - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM | |
| 412 | + if ($img_t[$i] eq "gray") { | |
| 413 | + $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
| 350 | 414 | } |
| 351 | 415 | |
| 352 | - if (!defined $cmd) { | |
| 353 | - $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
| 354 | - $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM | |
| 416 | + if ($img_t[$i] !~ /gray|rgb|stencil/) { | |
| 417 | + $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; | |
| 355 | 418 | } |
| 356 | 419 | |
| 357 | 420 | ($exit,$cmd,@out,@err) = exec_cmd($cmd); |
| ... | ... | @@ -362,7 +425,13 @@ sub ocr { |
| 362 | 425 | }; |
| 363 | 426 | |
| 364 | 427 | # Process each resulting image for page pdf |
| 365 | - my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif)/i , in => ${tmpdir} )) ; | |
| 428 | + my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif|tiff|jpeg|jp2|jb2|png)/i , in => ${tmpdir} )) ; | |
| 429 | + | |
| 430 | + if (scalar @images == 0) { | |
| 431 | + move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); | |
| 432 | + print "\t\t${in_file}: ".(${i}+1)." / $pages: Page was not exported as a tesseract supported format -- not OCRing\n" if $DEBUG; | |
| 433 | + exit 0; | |
| 434 | + } | |
| 366 | 435 | |
| 367 | 436 | foreach my $image (@images) { |
| 368 | 437 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; |
| ... | ... | @@ -378,43 +447,65 @@ sub ocr { |
| 378 | 447 | print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; |
| 379 | 448 | } |
| 380 | 449 | } |
| 381 | - | |
| 382 | - # Check if page was rotated | |
| 383 | - if ($pg_r[$i]) { | |
| 384 | - print "\t\t\t${image} unrotate: $pg_r[$i] graus ".(${i}+1)." / $pages\n" if $DEBUG; | |
| 385 | - ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate $pg_r[$i] \"$image\""); | |
| 450 | + | |
| 451 | + # Check if page was rotated and extracted with pdftoppm | |
| 452 | + if ($cmd =~ /\Q$PDFTOPPM/ && $pg_r[$i]) { | |
| 453 | + print "\t\t\t${image} unrotate: -$pg_r[$i] degs ".(${i}+1)." / $pages\n" if $DEBUG; | |
| 454 | + ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate ". (360 - $pg_r[$i])." \"$image\""); | |
| 386 | 455 | if ($DEBUG) { |
| 387 | 456 | print "\t\t\t${image} -> $cmd: $exit\n"; |
| 388 | 457 | print "\t\t\t\t$_" for @out ; |
| 389 | 458 | print "\t\t\t\t$_" for @err ; |
| 390 | 459 | }; |
| 391 | 460 | } |
| 392 | - | |
| 461 | + | |
| 393 | 462 | # Filter ppm images, if needed |
| 394 | 463 | |
| 395 | 464 | # OCR ppm images to pdf pages |
| 396 | - ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} \"${image}\" \"${image}\" pdf"); | |
| 465 | + ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} -l por+eng \"${image}\" \"${image}\" pdf"); | |
| 397 | 466 | if ($DEBUG) { |
| 398 | 467 | print "\t\t\t${image} -> $cmd: $exit\n"; |
| 399 | 468 | print "\t\t\t\t$_" for @out ; |
| 400 | 469 | print "\t\t\t\t$_" for @err ; |
| 401 | 470 | }; |
| 471 | + unlink ("$image") if (!$DEBUG); | |
| 402 | 472 | |
| 403 | - # Scale to fit pdf | |
| 404 | - ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); | |
| 473 | + # Scale, crop and rotate to fit pdf | |
| 474 | + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); | |
| 405 | 475 | if ($DEBUG) { |
| 406 | 476 | print "\t\t\t${image} -> $cmd: $exit\n"; |
| 407 | 477 | print "\t\t\t\t$_" for @out ; |
| 408 | 478 | print "\t\t\t\t$_" for @err ; |
| 409 | 479 | }; |
| 480 | + unlink ("$image.pdf") if (!$DEBUG); | |
| 410 | 481 | |
| 482 | + if (defined $pg_crop_x1[$i]) { | |
| 483 | + # adjust cropbox | |
| 484 | + ($pg_crop_x1[$i], $pg_crop_y1[$i],$pg_crop_x2[$i],$pg_crop_y2[$i]) = ( | |
| 485 | + ($pg_crop_x1[$i]<$pg_crop_x2[$i]?$pg_crop_x1[$i]:$pg_crop_x2[$i]), | |
| 486 | + ($pg_crop_y1[$i]<$pg_crop_y2[$i]?$pg_crop_y1[$i]:$pg_crop_y2[$i]), | |
| 487 | + abs($pg_crop_x2[$i]-$pg_crop_x1[$i]),abs($pg_crop_y2[$i]- $pg_crop_y1[$i]) | |
| 488 | + ); | |
| 489 | + | |
| 490 | + ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -crop \"$pg_crop_x1[$i] $pg_crop_y1[$i] $pg_crop_x2[$i] $pg_crop_y2[$i]\" \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf"); | |
| 491 | + if ($DEBUG) { | |
| 492 | + print "\t\t\t${image} -> $cmd: $exit\n"; | |
| 493 | + print "\t\t\t\t$_" for @out ; | |
| 494 | + print "\t\t\t\t$_" for @err ; | |
| 495 | + }; | |
| 496 | + } | |
| 497 | + | |
| 498 | + if ($pg_r[$i]) { | |
| 499 | + ($exit,$cmd, @out,@err) = exec_cmd( "${CPDF} -rotate $pg_r[$i] \"${image}\"-cpdf.pdf -o \"${image}\"-cpdf.pdf"); | |
| 500 | + if ($DEBUG) { | |
| 501 | + print "\t\t\t${image} -> $cmd: $exit\n"; | |
| 502 | + print "\t\t\t\t$_" for @out ; | |
| 503 | + print "\t\t\t\t$_" for @err ; | |
| 504 | + }; | |
| 505 | + } | |
| 411 | 506 | |
| 412 | - unlink ("${tmpdir}/${pg}.pdf") if (!$DEBUG); | |
| 413 | - unlink ("$image.pdf") if (!$DEBUG); | |
| 414 | - move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}.pdf.old") if ($DEBUG); | |
| 415 | - unlink ("$image") if (!$DEBUG); | |
| 416 | 507 | } |
| 417 | - exit 0; | |
| 508 | + exit 1; | |
| 418 | 509 | } |
| 419 | 510 | } |
| 420 | 511 | |
| ... | ... | @@ -427,28 +518,51 @@ sub ocr { |
| 427 | 518 | |
| 428 | 519 | if (scalar @new_pages != $pages) { |
| 429 | 520 | print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); |
| 430 | - syslog ("info","OCR: $in_file, number of output pages differ") if (!$DEBUG); | |
| 521 | + syslog ("err","OCR: $in_file, number of output pages differ") if (!$DEBUG); | |
| 431 | 522 | unlink "$in_file.$host.tmp"; |
| 523 | + make_path ($error_path) if ( ! -d $error_path); | |
| 432 | 524 | move ("$in_file.$host.processing", $error_file); |
| 433 | - exit (0); | |
| 525 | + exit (1); | |
| 434 | 526 | } |
| 435 | 527 | |
| 436 | - # Merge resulting pdf pages to a single pdf | |
| 528 | + # Merge resulting pdf pages to a single pdf, convert to PDF/A and copy to output | |
| 437 | 529 | make_path ($out_path) if ( ! -d $out_path); |
| 438 | 530 | unlink $out_file if ( -f $out_file ); |
| 439 | - ($exit, $cmd, @out,@err) = exec_cmd("${PDFTK} \"${tmpdir}\"/pg_*-cpdf.pdf cat output \"${out_file}.tmp\" compress"); | |
| 531 | + | |
| 532 | + chdir (${tmpdir}); | |
| 533 | + ($exit, $cmd, @out,@err) = exec_cmd("${GS} -dQUIET -dBATCH -dNOPAUSE -dNOINTERPOLATE -dCompatibilityLevel=1.7 -dNumRenderingThreads=${MAX_PGS} -sDEVICE=pdfwrite -dAutoRotatePages=/None -sColorConversionStrategy=/RGB -sProcessColorModel=DeviceRGB -dAutoFilterColorImages=true -dAutoFilterGrayImages=true -dJPEGQ=95 -dPDFA=2 -dPDFACompatibilityPolicy=1 -sOutputFile=\"${tmp_file}\" pg_*-cpdf.pdf "); | |
| 440 | 534 | if ($DEBUG) { |
| 441 | 535 | print "\t\t${out_file} -> $cmd: $exit\n"; |
| 442 | 536 | print "\t\t\t$_" for @out ; |
| 443 | 537 | print "\t\t\t$_" for @err ; |
| 444 | 538 | }; |
| 539 | + if ($exit) { | |
| 540 | + unlink "$in_file.$host.tmp"; | |
| 541 | + unlink $out_file; | |
| 542 | + make_path ($error_path) if ( ! -d $error_path); | |
| 543 | + move ("$in_file.$host.processing", $error_file); | |
| 544 | + print "\t\t${out_file} -> Error concatenating pages and converting to PDF/A (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); | |
| 545 | + syslog ("err","OCR: $in_file, error concatenating pages and converting to PDF/A") if (!$DEBUG); | |
| 546 | + exit (1); | |
| 547 | + } | |
| 548 | + chdir ("/"); | |
| 549 | + | |
| 550 | + if (!copy (${tmp_file}, $out_file)) { | |
| 551 | + remove_tree ($tmpdir,{ error=> \my $dumb }); | |
| 552 | + unlink ("$in_file.$host.tmp"); | |
| 553 | + unlink $out_file; | |
| 554 | + make_path ($error_path) if ( ! -d $error_path); | |
| 555 | + move ("$in_file.$host.processing", $error_file); | |
| 556 | + print "Error: cannot copy temp file to $out_file \n" if $DEBUG; | |
| 557 | + syslog ("error","cannot copy temp file to $out_file") if !$DEBUG; | |
| 558 | + exit 1; | |
| 559 | + }; | |
| 445 | 560 | |
| 446 | 561 | make_path ($proc_path) if ( ! -d $proc_path); |
| 447 | 562 | unlink $proc_file if ( -f $proc_file ); |
| 448 | 563 | move ("$in_file.$host.processing", $proc_file); |
| 449 | 564 | move ("${out_file}.tmp", ${out_file}); |
| 450 | 565 | |
| 451 | - | |
| 452 | 566 | # Remove temp dir |
| 453 | 567 | remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG); |
| 454 | 568 | unlink $tmp_file if (!$DEBUG); |
| ... | ... | @@ -471,7 +585,7 @@ sub is_ocred { |
| 471 | 585 | } |
| 472 | 586 | |
| 473 | 587 | sub get_pages { |
| 474 | - my ($in_file, $w, $h, $r) = @_; | |
| 588 | + my ($in_file, $w, $h, $r, $x1, $y1, $x2, $y2) = @_; | |
| 475 | 589 | |
| 476 | 590 | my $pages=0; |
| 477 | 591 | my $i=0; |
| ... | ... | @@ -485,29 +599,35 @@ sub get_pages { |
| 485 | 599 | ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ ); |
| 486 | 600 | ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ ); |
| 487 | 601 | ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ ); |
| 488 | - ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); | |
| 602 | + ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); | |
| 603 | + ($dumb, @$x1[$i-1], @$y1[$i-1], @$x2[$i-1], @$y2[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaCropRect:/ ); | |
| 489 | 604 | } |
| 490 | 605 | |
| 491 | 606 | return $pages; |
| 492 | 607 | } |
| 493 | 608 | |
| 494 | 609 | sub get_imgs { |
| 495 | - my ($in_file, $page_img, $w, $h, $t) = @_; | |
| 496 | - my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc); | |
| 610 | + my ($in_file, $page_img, $w, $h, $t, $x_ppi, $y_ppi) = @_; | |
| 611 | + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi ); | |
| 497 | 612 | |
| 498 | 613 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); |
| 614 | + $i = 0; | |
| 499 | 615 | |
| 500 | 616 | foreach my $line (@lines) { |
| 501 | 617 | chomp $line; |
| 502 | 618 | $line =~ s/^ {1,}//; |
| 503 | - if ( $line =~ /image|mask/ ) { | |
| 504 | - ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line; | |
| 619 | + if ( $line !~ /^page|^----/ ) { | |
| 620 | + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc, $int, $obj, $id, $xppi, $yppi) = split / {1,}/,$line; | |
| 505 | 621 | @$page_img[$page-1]=$i; |
| 506 | 622 | @$w[$page-1] = $width; |
| 507 | 623 | @$h[$page-1] = $height; |
| 508 | 624 | @$t[$page-1] = "rgb"; # Default is color |
| 509 | - @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | |
| 510 | 625 | @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); |
| 626 | + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | |
| 627 | + @$t[$page-1] = ( $type eq "stencil" ? $type : @$t[$page-1]); | |
| 628 | + @$t[$page-1] = ( $enc eq "image" ? $enc : @$t[$page-1]); | |
| 629 | + @$x_ppi[$page-1] = $xppi; | |
| 630 | + @$y_ppi[$page-1] = $yppi; | |
| 511 | 631 | } |
| 512 | 632 | } |
| 513 | 633 | return $i+1; |
| ... | ... | @@ -542,6 +662,19 @@ sub get_res { |
| 542 | 662 | return ($res_x,$res_y); |
| 543 | 663 | } |
| 544 | 664 | |
| 665 | +sub get_sign { | |
| 666 | + my ($in_file) = @_; | |
| 667 | + my @lines = `${PDFSIG} \"${in_file}\" 2>/dev/null`; | |
| 668 | + | |
| 669 | + foreach (@lines) { | |
| 670 | + chomp; | |
| 671 | + if ( $_ =~ /^Signature/ ) { | |
| 672 | + return 1; | |
| 673 | + } | |
| 674 | + } | |
| 675 | + return 0; | |
| 676 | +} | |
| 677 | + | |
| 545 | 678 | sub is_locked_ex { |
| 546 | 679 | my ($path) = @_; |
| 547 | 680 | ... | ... |
workflow.pdf
No preview for this file type
workflow.vsd
No preview for this file type