Commit 740cf3943a002c48823f498675e6b2927e4b90b3
1 parent
7fe23df8
Exists in
master
Correções afetas e melhorias.
Showing
2 changed files
with
41 additions
and
13 deletions
Show diff stats
INSTALL.txt
| ... | ... | @@ -20,10 +20,16 @@ |
| 20 | 20 | # 0.7 Solved an issue with files with more than 1000 pages |
| 21 | 21 | # 1.0 First release version |
| 22 | 22 | # 1.0.1 Solving error when file has no images |
| 23 | -# | |
| 24 | -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it | |
| 25 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | |
| 26 | -# diferently but does not treat it adequately | |
| 23 | +# 1.0.2 Fix bug when counting cores for AMD processors | |
| 24 | +# 1.0.3 Added better image type detection | |
| 25 | +# | |
| 26 | +# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it | |
| 27 | +# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | |
| 28 | +# differently but does not treat it adequately | |
| 29 | +# - Review poppler and cpdf install instructions | |
| 30 | +# - Add better handling of vectorized and non scanned pdf files | |
| 31 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | |
| 32 | +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible | |
| 27 | 33 | # |
| 28 | 34 | # Check software requirements on the comments below |
| 29 | 35 | # | ... | ... |
usr/local/bin/ocr
| ... | ... | @@ -22,10 +22,16 @@ |
| 22 | 22 | # 0.7 Solved an issue with files with more than 1000 pages |
| 23 | 23 | # 1.0 First release version |
| 24 | 24 | # 1.0.1 Solving error when file has no images |
| 25 | +# 1.0.2 Fix bug when counting cores for AMD processors | |
| 26 | +# 1.0.3 Added better image type detection | |
| 25 | 27 | # |
| 26 | 28 | # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it |
| 27 | 29 | # would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them |
| 28 | 30 | # differently but does not treat it adequately |
| 31 | +# - Review poppler and cpdf install instructions | |
| 32 | +# - Add better handling of vectorized and non scanned pdf files | |
| 33 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | |
| 34 | +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible | |
| 29 | 35 | # |
| 30 | 36 | # Check software requirements on the comments below |
| 31 | 37 | # |
| ... | ... | @@ -48,10 +54,11 @@ use IPC::Open3; |
| 48 | 54 | use IO::Select; |
| 49 | 55 | |
| 50 | 56 | my $DEBUG = 0; |
| 51 | -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep CPU | wc -l`); | |
| 57 | +my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); | |
| 52 | 58 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; |
| 53 | 59 | |
| 54 | 60 | my $USER = 'ocr'; |
| 61 | +my $CHECK_COLOR = 0; # If it has to check if image is really colored or if it can be converted to gray scale or B&W | |
| 55 | 62 | |
| 56 | 63 | # Command dependencies |
| 57 | 64 | |
| ... | ... | @@ -70,11 +77,12 @@ my $PDFTOPPM = '/usr/local/bin/pdftoppm'; |
| 70 | 77 | # Depends on cpdf 2.1 or higher |
| 71 | 78 | my $CPDF = '/usr/local/bin/cpdf'; |
| 72 | 79 | |
| 73 | -# Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner | |
| 74 | -#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; | |
| 75 | - | |
| 80 | +## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner | |
| 76 | 81 | my $CONVERT = '/usr/bin/convert'; |
| 77 | 82 | |
| 83 | +# If it is needed further filtering | |
| 84 | +#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; | |
| 85 | + | |
| 78 | 86 | my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', |
| 79 | 87 | '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); |
| 80 | 88 | |
| ... | ... | @@ -83,6 +91,8 @@ my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados |
| 83 | 91 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); |
| 84 | 92 | %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); |
| 85 | 93 | |
| 94 | +# Safeguard in case cpuinfo has not correctly identified the number of CPUs | |
| 95 | +$MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; | |
| 86 | 96 | |
| 87 | 97 | $ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; |
| 88 | 98 | my ($host) = split/\./,hostname; |
| ... | ... | @@ -355,6 +365,18 @@ sub ocr { |
| 355 | 365 | |
| 356 | 366 | foreach my $image (@images) { |
| 357 | 367 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; |
| 368 | + | |
| 369 | + # Check if image can be safely colour reduced | |
| 370 | + if ($CHECK_COLOR) { | |
| 371 | + $cmd = "${CONVERT} ${image} \Q(\E -clone 0 -colorspace gray \Q)\E -compose difference -composite -separate -evaluate-sequence mean -threshold 4% -format \"%[fx:mean]\" info:"; | |
| 372 | + ($exit, $cmd, @out, @err) = exec_cmd ($cmd); | |
| 373 | + if ($DEBUG) { | |
| 374 | + print "\t\t\t${image}-> ${cmd}: $exit\n"; | |
| 375 | + #print "\t\t\t\t$_" for @out ; | |
| 376 | + print "\t\t\t\t$_" for @err ; | |
| 377 | + print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; | |
| 378 | + } | |
| 379 | + } | |
| 358 | 380 | |
| 359 | 381 | # Check if page was rotated |
| 360 | 382 | if ($pg_r[$i]) { |
| ... | ... | @@ -470,7 +492,7 @@ sub get_pages { |
| 470 | 492 | |
| 471 | 493 | sub get_imgs { |
| 472 | 494 | my ($in_file, $page_img, $w, $h, $t) = @_; |
| 473 | - my ($dumb, $i, $page, $width, $height, $type); | |
| 495 | + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc); | |
| 474 | 496 | |
| 475 | 497 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); |
| 476 | 498 | |
| ... | ... | @@ -478,13 +500,13 @@ sub get_imgs { |
| 478 | 500 | chomp $line; |
| 479 | 501 | $line =~ s/^ {1,}//; |
| 480 | 502 | if ( $line =~ /image|mask/ ) { |
| 481 | - ($page, $i , $dumb, $width, $height, $type) = split / {1,}/,$line; | |
| 503 | + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line; | |
| 482 | 504 | @$page_img[$page-1]=$i; |
| 483 | 505 | @$w[$page-1] = $width; |
| 484 | 506 | @$h[$page-1] = $height; |
| 485 | - @$t[$page-1] = ( $type eq "-" ? "rgb" : $type ); | |
| 486 | - @$t[$page-1] = ( $type eq "icc" ? "rgb" : $type ); | |
| 487 | - @$t[$page-1] = ( $type eq "index" ? "rgb" : $type ); | |
| 507 | + @$t[$page-1] = "rgb"; # Default is color | |
| 508 | + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | |
| 509 | + @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); | |
| 488 | 510 | } |
| 489 | 511 | } |
| 490 | 512 | return $i+1; | ... | ... |