Commit 740cf3943a002c48823f498675e6b2927e4b90b3

Authored by Nei Jobson da Costa Carneiro
1 parent 7fe23df8
Exists in master

Correções afetas e melhorias.

Showing 2 changed files with 41 additions and 13 deletions   Show diff stats
INSTALL.txt
... ... @@ -20,10 +20,16 @@
20 20 # 0.7 Solved an issue with files with more than 1000 pages
21 21 # 1.0 First release version
22 22 # 1.0.1 Solving error when file has no images
23   -#
24   -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it
25   -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
26   -# diferently but does not treat it adequately
  23 +# 1.0.2 Fix bug when counting cores for AMD processors
  24 +# 1.0.3 Added better image type detection
  25 +#
  26 +# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it
  27 +# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
  28 +# differently but does not treat it adequately
  29 +# - Review poppler and cpdf install instructions
  30 +# - Add better handling of vectorized and non scanned pdf files
  31 +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
  32 +# - Check mean saturation for additional colored image detection and automatically convert to B&W if possible
27 33 #
28 34 # Check software requirements in the comments below
29 35 #
... ...
usr/local/bin/ocr
... ... @@ -22,10 +22,16 @@
22 22 # 0.7 Solved an issue with files with more than 1000 pages
23 23 # 1.0 First release version
24 24 # 1.0.1 Solving error when file has no images
  25 +# 1.0.2 Fix bug when counting cores for AMD processors
  26 +# 1.0.3 Added better image type detection
25 27 #
26 28 # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it
27 29 # would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
28 30 # differently but does not treat it adequately
  31 +# - Review poppler and cpdf install instructions
  32 +# - Add better handling of vectorized and non scanned pdf files
  33 +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
  34 +# - Check mean saturation for additional colored image detection and automatically convert to B&W if possible
29 35 #
30 36 # Check software requirements in the comments below
31 37 #
... ... @@ -48,10 +54,11 @@ use IPC::Open3;
48 54 use IO::Select;
49 55  
50 56 my $DEBUG = 0;
51   -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep CPU | wc -l`);
  57 +my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`);
52 58 my $MAX_FILES = ( !$DEBUG ? 2 : 1) ;
53 59  
54 60 my $USER = 'ocr';
  61 +my $CHECK_COLOR = 0; # Whether to check if an image is really colored or can be converted to gray scale or B&W
55 62  
56 63 # Command dependencies
57 64  
... ... @@ -70,11 +77,12 @@ my $PDFTOPPM = '/usr/local/bin/pdftoppm';
70 77 # Depends on cpdf 2.1 or higher
71 78 my $CPDF = '/usr/local/bin/cpdf';
72 79  
73   -# Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner
74   -#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 ';
75   -
  80 +## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner
76 81 my $CONVERT = '/usr/bin/convert';
77 82  
  83 +# If further filtering is needed
  84 +#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 ';
  85 +
78 86 my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/',
79 87 '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' );
80 88  
... ... @@ -83,6 +91,8 @@ my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados
83 91 @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2);
84 92 %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG);
85 93  
  94 +# Safeguard in case cpuinfo did not correctly identify the number of CPUs
  95 +$MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS;
86 96  
87 97 $ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin';
88 98 my ($host) = split/\./,hostname;
... ... @@ -355,6 +365,18 @@ sub ocr {
355 365  
356 366 foreach my $image (@images) {
357 367 print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG;
  368 +
  369 + # Check if image can be safely colour reduced
  370 + if ($CHECK_COLOR) {
  371 + $cmd = "${CONVERT} ${image} \Q(\E -clone 0 -colorspace gray \Q)\E -compose difference -composite -separate -evaluate-sequence mean -threshold 4% -format \"%[fx:mean]\" info:";
  372 + ($exit, $cmd, @out, @err) = exec_cmd ($cmd);
  373 + if ($DEBUG) {
  374 + print "\t\t\t${image}-> ${cmd}: $exit\n";
  375 + #print "\t\t\t\t$_" for @out ;
  376 + print "\t\t\t\t$_" for @err ;
  377 + print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n";
  378 + }
  379 + }
358 380  
359 381 # Check if page was rotated
360 382 if ($pg_r[$i]) {
... ... @@ -470,7 +492,7 @@ sub get_pages {
470 492  
471 493 sub get_imgs {
472 494 my ($in_file, $page_img, $w, $h, $t) = @_;
473   - my ($dumb, $i, $page, $width, $height, $type);
  495 + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc);
474 496  
475 497 my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\"");
476 498  
... ... @@ -478,13 +500,13 @@ sub get_imgs {
478 500 chomp $line;
479 501 $line =~ s/^ {1,}//;
480 502 if ( $line =~ /image|mask/ ) {
481   - ($page, $i , $dumb, $width, $height, $type) = split / {1,}/,$line;
  503 + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line;
482 504 @$page_img[$page-1]=$i;
483 505 @$w[$page-1] = $width;
484 506 @$h[$page-1] = $height;
485   - @$t[$page-1] = ( $type eq "-" ? "rgb" : $type );
486   - @$t[$page-1] = ( $type eq "icc" ? "rgb" : $type );
487   - @$t[$page-1] = ( $type eq "index" ? "rgb" : $type );
  507 + @$t[$page-1] = "rgb"; # Default is color
  508 + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]);
  509 + @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]);
488 510 }
489 511 }
490 512 return $i+1;
... ...