Commit 740cf3943a002c48823f498675e6b2927e4b90b3

Authored by Nei Jobson da Costa Carneiro
1 parent 7fe23df8
Exists in master

Correções afetas e melhorias.

Showing 2 changed files with 41 additions and 13 deletions   Show diff stats
@@ -20,10 +20,16 @@ @@ -20,10 +20,16 @@
20 # 0.7 Solved an issue with files with more than 1000 pages 20 # 0.7 Solved an issue with files with more than 1000 pages
21 # 1.0 First release version 21 # 1.0 First release version
22 # 1.0.1 Solving error when file has no images 22 # 1.0.1 Solving error when file has no images
23 -#  
24 -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it  
25 -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them  
26 -# diferently but does not treat it adequately 23 +# 1.0.2 Fix bug when counting cores for AMD processors
  24 +# 1.0.3 Added better image type detection
  25 +#
  26 +# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it
  27 +# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
  28 +# differently but does not treat it adequately
  29 +# - Review poppler and cpdf install instructions
  30 +# - Add better handling of vectorized and non scanned pdf files
  31 +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
  32 +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible
27 # 33 #
28 # Check software requirements on the comments below 34 # Check software requirements on the comments below
29 # 35 #
usr/local/bin/ocr
@@ -22,10 +22,16 @@ @@ -22,10 +22,16 @@
22 # 0.7 Solved an issue with files with more than 1000 pages 22 # 0.7 Solved an issue with files with more than 1000 pages
23 # 1.0 First release version 23 # 1.0 First release version
24 # 1.0.1 Solving error when file has no images 24 # 1.0.1 Solving error when file has no images
  25 +# 1.0.2 Fix bug when counting cores for AMD processors
  26 +# 1.0.3 Added better image type detection
25 # 27 #
26 # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it 28 # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it
27 # would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them 29 # would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them
28 # differently but does not treat it adequately 30 # differently but does not treat it adequately
  31 +# - Review poppler and cpdf install instructions
  32 +# - Add better handling of vectorized and non scanned pdf files
  33 +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core)
  34 +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible
29 # 35 #
30 # Check software requirements on the comments below 36 # Check software requirements on the comments below
31 # 37 #
@@ -48,10 +54,11 @@ use IPC::Open3; @@ -48,10 +54,11 @@ use IPC::Open3;
48 use IO::Select; 54 use IO::Select;
49 55
50 my $DEBUG = 0; 56 my $DEBUG = 0;
51 -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep CPU | wc -l`); 57 +my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`);
52 my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; 58 my $MAX_FILES = ( !$DEBUG ? 2 : 1) ;
53 59
54 my $USER = 'ocr'; 60 my $USER = 'ocr';
  61 +my $CHECK_COLOR = 0; # If set, check whether the image is really colored or can be converted to gray scale or B&W
55 62
56 # Command dependencies 63 # Command dependencies
57 64
@@ -70,11 +77,12 @@ my $PDFTOPPM = '/usr/local/bin/pdftoppm'; @@ -70,11 +77,12 @@ my $PDFTOPPM = '/usr/local/bin/pdftoppm';
70 # Depends on cpdf 2.1 or higher 77 # Depends on cpdf 2.1 or higher
71 my $CPDF = '/usr/local/bin/cpdf'; 78 my $CPDF = '/usr/local/bin/cpdf';
72 79
73 -# Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner  
74 -#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 ';  
75 - 80 +## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner
76 my $CONVERT = '/usr/bin/convert'; 81 my $CONVERT = '/usr/bin/convert';
77 82
  83 +# If it is needed further filtering
  84 +#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 ';
  85 +
78 my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', 86 my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/',
79 '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); 87 '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' );
80 88
@@ -83,6 +91,8 @@ my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados @@ -83,6 +91,8 @@ my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados
83 @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); 91 @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2);
84 %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); 92 %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG);
85 93
  94 +# Safeguard in case cpuinfo has not correctly identified the number of CPUs
  95 +$MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS;
86 96
87 $ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; 97 $ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin';
88 my ($host) = split/\./,hostname; 98 my ($host) = split/\./,hostname;
@@ -355,6 +365,18 @@ sub ocr { @@ -355,6 +365,18 @@ sub ocr {
355 365
356 foreach my $image (@images) { 366 foreach my $image (@images) {
357 print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; 367 print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG;
  368 +
  369 + # Check if image can be safely colour reduced
  370 + if ($CHECK_COLOR) {
  371 + $cmd = "${CONVERT} ${image} \Q(\E -clone 0 -colorspace gray \Q)\E -compose difference -composite -separate -evaluate-sequence mean -threshold 4% -format \"%[fx:mean]\" info:";
  372 + ($exit, $cmd, @out, @err) = exec_cmd ($cmd);
  373 + if ($DEBUG) {
  374 + print "\t\t\t${image}-> ${cmd}: $exit\n";
  375 + #print "\t\t\t\t$_" for @out ;
  376 + print "\t\t\t\t$_" for @err ;
  377 + print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n";
  378 + }
  379 + }
358 380
359 # Check if page was rotated 381 # Check if page was rotated
360 if ($pg_r[$i]) { 382 if ($pg_r[$i]) {
@@ -470,7 +492,7 @@ sub get_pages { @@ -470,7 +492,7 @@ sub get_pages {
470 492
471 sub get_imgs { 493 sub get_imgs {
472 my ($in_file, $page_img, $w, $h, $t) = @_; 494 my ($in_file, $page_img, $w, $h, $t) = @_;
473 - my ($dumb, $i, $page, $width, $height, $type); 495 + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc);
474 496
475 my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); 497 my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\"");
476 498
@@ -478,13 +500,13 @@ sub get_imgs { @@ -478,13 +500,13 @@ sub get_imgs {
478 chomp $line; 500 chomp $line;
479 $line =~ s/^ {1,}//; 501 $line =~ s/^ {1,}//;
480 if ( $line =~ /image|mask/ ) { 502 if ( $line =~ /image|mask/ ) {
481 - ($page, $i , $dumb, $width, $height, $type) = split / {1,}/,$line; 503 + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line;
482 @$page_img[$page-1]=$i; 504 @$page_img[$page-1]=$i;
483 @$w[$page-1] = $width; 505 @$w[$page-1] = $width;
484 @$h[$page-1] = $height; 506 @$h[$page-1] = $height;
485 - @$t[$page-1] = ( $type eq "-" ? "rgb" : $type );  
486 - @$t[$page-1] = ( $type eq "icc" ? "rgb" : $type );  
487 - @$t[$page-1] = ( $type eq "index" ? "rgb" : $type ); 507 + @$t[$page-1] = "rgb"; # Default is color
  508 + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]);
  509 + @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]);
488 } 510 }
489 } 511 }
490 return $i+1; 512 return $i+1;