Commit 740cf3943a002c48823f498675e6b2927e4b90b3
1 parent
7fe23df8
Exists in
master
and in
1 other branch
Correções afetas e melhorias.
Showing
2 changed files
with
41 additions
and
13 deletions
Show diff stats
INSTALL.txt
| @@ -20,10 +20,16 @@ | @@ -20,10 +20,16 @@ | ||
| 20 | # 0.7 Solved an issue with files with more than 1000 pages | 20 | # 0.7 Solved an issue with files with more than 1000 pages |
| 21 | # 1.0 First release version | 21 | # 1.0 First release version |
| 22 | # 1.0.1 Solving error when file has no images | 22 | # 1.0.1 Solving error when file has no images |
| 23 | -# | ||
| 24 | -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it | ||
| 25 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | ||
| 26 | -# diferently but does not treat it adequately | 23 | +# 1.0.2 Fix bug when counting cores for AMD processors |
| 24 | +# 1.0.3 Added better image type detection | ||
| 25 | +# | ||
| 26 | +# TODO: - Change get_imgs and OCR processing to enable pages with more than one image -- it | ||
| 27 | +# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | ||
| 28 | +# differently but does not treat it adequately | ||
| 29 | +# - Review poppler and cpdf install instructions | ||
| 30 | +# - Add better handling of vectorized and non scanned pdf files | ||
| 31 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | ||
| 32 | +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible | ||
| 27 | # | 33 | # |
| 28 | # Check software requirements on the comments below | 34 | # Check software requirements on the comments below |
| 29 | # | 35 | # |
usr/local/bin/ocr
| @@ -22,10 +22,16 @@ | @@ -22,10 +22,16 @@ | ||
| 22 | # 0.7 Solved an issue with files with more than 1000 pages | 22 | # 0.7 Solved an issue with files with more than 1000 pages |
| 23 | # 1.0 First release version | 23 | # 1.0 First release version |
| 24 | # 1.0.1 Solving error when file has no images | 24 | # 1.0.1 Solving error when file has no images |
| 25 | +# 1.0.2 Fix bug when counting cores for AMD processors | ||
| 26 | +# 1.0.3 Added better image type detection | ||
| 25 | # | 27 | # |
| 26 | # TODO: - Change get_imgs and OCR processing to enable pages with more than one image -- it | 28 | # TODO: - Change get_imgs and OCR processing to enable pages with more than one image -- it |
| 27 | # would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | 29 | # would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them |
| 28 | # differently but does not treat it adequately | 30 | # differently but does not treat it adequately |
| 31 | +# - Review poppler and cpdf install instructions | ||
| 32 | +# - Add better handling of vectorized and non scanned pdf files | ||
| 33 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | ||
| 34 | +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible | ||
| 29 | # | 35 | # |
| 30 | # Check software requirements on the comments below | 36 | # Check software requirements on the comments below |
| 31 | # | 37 | # |
| @@ -48,10 +54,11 @@ use IPC::Open3; | @@ -48,10 +54,11 @@ use IPC::Open3; | ||
| 48 | use IO::Select; | 54 | use IO::Select; |
| 49 | 55 | ||
| 50 | my $DEBUG = 0; | 56 | my $DEBUG = 0; |
| 51 | -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep CPU | wc -l`); | 57 | +my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); |
| 52 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; | 58 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; |
| 53 | 59 | ||
| 54 | my $USER = 'ocr'; | 60 | my $USER = 'ocr'; |
| 61 | +my $CHECK_COLOR = 0; # Whether to check if the image is really colored or if it can be converted to gray scale or B&W | ||
| 55 | 62 | ||
| 56 | # Command dependencies | 63 | # Command dependencies |
| 57 | 64 | ||
| @@ -70,11 +77,12 @@ my $PDFTOPPM = '/usr/local/bin/pdftoppm'; | @@ -70,11 +77,12 @@ my $PDFTOPPM = '/usr/local/bin/pdftoppm'; | ||
| 70 | # Depends on cpdf 2.1 or higher | 77 | # Depends on cpdf 2.1 or higher |
| 71 | my $CPDF = '/usr/local/bin/cpdf'; | 78 | my $CPDF = '/usr/local/bin/cpdf'; |
| 72 | 79 | ||
| 73 | -# Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner | ||
| 74 | -#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; | ||
| 75 | - | 80 | +## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner |
| 76 | my $CONVERT = '/usr/bin/convert'; | 81 | my $CONVERT = '/usr/bin/convert'; |
| 77 | 82 | ||
| 83 | +# If further filtering is needed | ||
| 84 | +#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; | ||
| 85 | + | ||
| 78 | my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', | 86 | my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', |
| 79 | '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); | 87 | '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); |
| 80 | 88 | ||
| @@ -83,6 +91,8 @@ my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados | @@ -83,6 +91,8 @@ my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados | ||
| 83 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); | 91 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); |
| 84 | %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); | 92 | %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); |
| 85 | 93 | ||
| 94 | +# Safeguard in case cpuinfo has not correctly identified the number of CPUs | ||
| 95 | +$MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; | ||
| 86 | 96 | ||
| 87 | $ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; | 97 | $ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; |
| 88 | my ($host) = split/\./,hostname; | 98 | my ($host) = split/\./,hostname; |
| @@ -355,6 +365,18 @@ sub ocr { | @@ -355,6 +365,18 @@ sub ocr { | ||
| 355 | 365 | ||
| 356 | foreach my $image (@images) { | 366 | foreach my $image (@images) { |
| 357 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; | 367 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; |
| 368 | + | ||
| 369 | + # Check if image can be safely colour reduced | ||
| 370 | + if ($CHECK_COLOR) { | ||
| 371 | + $cmd = "${CONVERT} ${image} \Q(\E -clone 0 -colorspace gray \Q)\E -compose difference -composite -separate -evaluate-sequence mean -threshold 4% -format \"%[fx:mean]\" info:"; | ||
| 372 | + ($exit, $cmd, @out, @err) = exec_cmd ($cmd); | ||
| 373 | + if ($DEBUG) { | ||
| 374 | + print "\t\t\t${image}-> ${cmd}: $exit\n"; | ||
| 375 | + #print "\t\t\t\t$_" for @out ; | ||
| 376 | + print "\t\t\t\t$_" for @err ; | ||
| 377 | + print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; | ||
| 378 | + } | ||
| 379 | + } | ||
| 358 | 380 | ||
| 359 | # Check if page was rotated | 381 | # Check if page was rotated |
| 360 | if ($pg_r[$i]) { | 382 | if ($pg_r[$i]) { |
| @@ -470,7 +492,7 @@ sub get_pages { | @@ -470,7 +492,7 @@ sub get_pages { | ||
| 470 | 492 | ||
| 471 | sub get_imgs { | 493 | sub get_imgs { |
| 472 | my ($in_file, $page_img, $w, $h, $t) = @_; | 494 | my ($in_file, $page_img, $w, $h, $t) = @_; |
| 473 | - my ($dumb, $i, $page, $width, $height, $type); | 495 | + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc); |
| 474 | 496 | ||
| 475 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); | 497 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); |
| 476 | 498 | ||
| @@ -478,13 +500,13 @@ sub get_imgs { | @@ -478,13 +500,13 @@ sub get_imgs { | ||
| 478 | chomp $line; | 500 | chomp $line; |
| 479 | $line =~ s/^ {1,}//; | 501 | $line =~ s/^ {1,}//; |
| 480 | if ( $line =~ /image|mask/ ) { | 502 | if ( $line =~ /image|mask/ ) { |
| 481 | - ($page, $i , $dumb, $width, $height, $type) = split / {1,}/,$line; | 503 | + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line; |
| 482 | @$page_img[$page-1]=$i; | 504 | @$page_img[$page-1]=$i; |
| 483 | @$w[$page-1] = $width; | 505 | @$w[$page-1] = $width; |
| 484 | @$h[$page-1] = $height; | 506 | @$h[$page-1] = $height; |
| 485 | - @$t[$page-1] = ( $type eq "-" ? "rgb" : $type ); | ||
| 486 | - @$t[$page-1] = ( $type eq "icc" ? "rgb" : $type ); | ||
| 487 | - @$t[$page-1] = ( $type eq "index" ? "rgb" : $type ); | 507 | + @$t[$page-1] = "rgb"; # Default is color |
| 508 | + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | ||
| 509 | + @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); | ||
| 488 | } | 510 | } |
| 489 | } | 511 | } |
| 490 | return $i+1; | 512 | return $i+1; |