From 740cf3943a002c48823f498675e6b2927e4b90b3 Mon Sep 17 00:00:00 2001 From: Nei Jobson Date: Wed, 28 Sep 2016 10:01:57 -0300 Subject: [PATCH] Correções afetas e melhorias. --- INSTALL.txt | 14 ++++++++++---- usr/local/bin/ocr | 40 +++++++++++++++++++++++++++++++--------- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/INSTALL.txt b/INSTALL.txt index 0972b6a..cb55cc5 100644 --- a/INSTALL.txt +++ b/INSTALL.txt @@ -20,10 +20,16 @@ # 0.7 Solved an issue with files with more than 1000 pages # 1.0 First release version # 1.0.1 Solving error when file has no images -# -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them -# diferently but does not treat it adequately +# 1.0.2 Fix bug when counting cores for AMD processors +# 1.0.3 Added better image type detection +# +# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it +# would not work on previous versions that assumed #pages = #imgs. 
Version 1.1 counts them +# diferently but does not treat it adequately +# - Review poppler and cpdf install instructions +# - Add better handling of vectorized and non scanned pdf files +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible # # Check software requirements on the comments bellow # diff --git a/usr/local/bin/ocr b/usr/local/bin/ocr index e024841..3b7b25f 100644 --- a/usr/local/bin/ocr +++ b/usr/local/bin/ocr @@ -22,10 +22,16 @@ # 0.7 Solved an issue with files with more than 1000 pages # 1.0 First release version # 1.0.1 Solving error when file has no images +# 1.0.2 Fix bug when counting cores for AMD processors +# 1.0.3 Added better image type detection # # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it # would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them # diferently but does not treat it adequately +# - Review poppler and cpdf install instructions +# - Add better handling of vectorized and non scanned pdf files +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible # # Check software requirements on the comments bellow # @@ -48,10 +54,11 @@ use IPC::Open3; use IO::Select; my $DEBUG = 0; -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep CPU | wc -l`); +my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); my $MAX_FILES = ( !$DEBUG ? 
2 : 1) ; my $USER = 'ocr'; +my $CHECK_COLOR = 0; # Whether to check if the image is really colored or if it can be converted to gray scale or B&W # Command dependencies @@ -70,11 +77,12 @@ my $PDFTOPPM = '/usr/local/bin/pdftoppm'; # Depends on cpdf 2.1 or higher my $CPDF = '/usr/local/bin/cpdf'; -# Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner -#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; - +## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner my $CONVERT = '/usr/bin/convert'; +# If further filtering is needed +#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; + my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); @@ -83,6 +91,8 @@ my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); +# Safeguard in case cpuinfo has not correctly identified the number of CPUs +$MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; $ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; my ($host) = split/\./,hostname; @@ -355,6 +365,18 @@ sub ocr { foreach my $image (@images) { print "\t\t\t${image}: ".(${i}+1)." 
/ $pages\n" if $DEBUG; + + # Check if image can be safely colour reduced + if ($CHECK_COLOR) { + $cmd = "${CONVERT} ${image} \Q(\E -clone 0 -colorspace gray \Q)\E -compose difference -composite -separate -evaluate-sequence mean -threshold 4% -format \"%[fx:mean]\" info:"; + ($exit, $cmd, @out, @err) = exec_cmd ($cmd); + if ($DEBUG) { + print "\t\t\t${image}-> ${cmd}: $exit\n"; + #print "\t\t\t\t$_" for @out ; + print "\t\t\t\t$_" for @err ; + print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; + } + } # Check if page was rotated if ($pg_r[$i]) { @@ -470,7 +492,7 @@ sub get_pages { sub get_imgs { my ($in_file, $page_img, $w, $h, $t) = @_; - my ($dumb, $i, $page, $width, $height, $type); + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc); my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); @@ -478,13 +500,13 @@ sub get_imgs { chomp $line; $line =~ s/^ {1,}//; if ( $line =~ /image|mask/ ) { - ($page, $i , $dumb, $width, $height, $type) = split / {1,}/,$line; + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line; @$page_img[$page-1]=$i; @$w[$page-1] = $width; @$h[$page-1] = $height; - @$t[$page-1] = ( $type eq "-" ? "rgb" : $type ); - @$t[$page-1] = ( $type eq "icc" ? "rgb" : $type ); - @$t[$page-1] = ( $type eq "index" ? "rgb" : $type ); + @$t[$page-1] = "rgb"; # Default is color + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); + @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); } } return $i+1; -- libgit2 0.21.2