Commit 740cf3943a002c48823f498675e6b2927e4b90b3
1 parent
7fe23df8
Exists in
master
Correções afetas e melhorias.
Showing
2 changed files
with
41 additions
and
13 deletions
Show diff stats
INSTALL.txt
@@ -20,10 +20,16 @@ | @@ -20,10 +20,16 @@ | ||
20 | # 0.7 Solved an issue with files with more than 1000 pages | 20 | # 0.7 Solved an issue with files with more than 1000 pages |
21 | # 1.0 First release version | 21 | # 1.0 First release version |
22 | # 1.0.1 Solving error when file has no images | 22 | # 1.0.1 Solving error when file has no images |
23 | -# | ||
24 | -# TODO: - Change get_imgs and OCR processing to enable pages with more than one image -- it | ||
25 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | ||
26 | -# differently but does not treat it adequately | 23 | +# 1.0.2 Fix bug when counting cores for AMD processors |
24 | +# 1.0.3 Added better image type detection | ||
25 | +# | ||
26 | +# TODO: - Change get_imgs and OCR processing to enable pages with more than one image -- it | ||
27 | +# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | ||
28 | +# differently but does not treat it adequately | ||
29 | +# - Review poppler and cpdf install instructions | ||
30 | +# - Add better handling of vectorized and non scanned pdf files | ||
31 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | ||
32 | +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible | ||
27 | # | 33 | # |
28 | # Check software requirements on the comments below | 34 | # Check software requirements on the comments below |
29 | # | 35 | # |
usr/local/bin/ocr
@@ -22,10 +22,16 @@ | @@ -22,10 +22,16 @@ | ||
22 | # 0.7 Solved an issue with files with more than 1000 pages | 22 | # 0.7 Solved an issue with files with more than 1000 pages |
23 | # 1.0 First release version | 23 | # 1.0 First release version |
24 | # 1.0.1 Solving error when file has no images | 24 | # 1.0.1 Solving error when file has no images |
25 | +# 1.0.2 Fix bug when counting cores for AMD processors | ||
26 | +# 1.0.3 Added better image type detection | ||
25 | # | 27 | # |
26 | # TODO: - Change get_imgs and OCR processing to enable pages with more than one image -- it | 28 | # TODO: - Change get_imgs and OCR processing to enable pages with more than one image -- it |
27 | # would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | 29 | # would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them |
28 | # differently but does not treat it adequately | 30 | # differently but does not treat it adequately |
31 | +# - Review poppler and cpdf install instructions | ||
32 | +# - Add better handling of vectorized and non scanned pdf files | ||
33 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | ||
34 | +# - Check mean saturation for additional colored images detection and automatically convert to B&W if possible | ||
29 | # | 35 | # |
30 | # Check software requirements on the comments below | 36 | # Check software requirements on the comments below |
31 | # | 37 | # |
@@ -48,10 +54,11 @@ use IPC::Open3; | @@ -48,10 +54,11 @@ use IPC::Open3; | ||
48 | use IO::Select; | 54 | use IO::Select; |
49 | 55 | ||
50 | my $DEBUG = 0; | 56 | my $DEBUG = 0; |
51 | -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep CPU | wc -l`); | 57 | +my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); |
52 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; | 58 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; |
53 | 59 | ||
54 | my $USER = 'ocr'; | 60 | my $USER = 'ocr'; |
61 | +my $CHECK_COLOR = 0; # If it has to check if image is really colored or if it can be converted to gray scale or B&W | ||
55 | 62 | ||
56 | # Command dependencies | 63 | # Command dependencies |
57 | 64 | ||
@@ -70,11 +77,12 @@ my $PDFTOPPM = '/usr/local/bin/pdftoppm'; | @@ -70,11 +77,12 @@ my $PDFTOPPM = '/usr/local/bin/pdftoppm'; | ||
70 | # Depends on cpdf 2.1 or higher | 77 | # Depends on cpdf 2.1 or higher |
71 | my $CPDF = '/usr/local/bin/cpdf'; | 78 | my $CPDF = '/usr/local/bin/cpdf'; |
72 | 79 | ||
73 | -# Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner | ||
74 | -#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; | ||
75 | - | 80 | +## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner |
76 | my $CONVERT = '/usr/bin/convert'; | 81 | my $CONVERT = '/usr/bin/convert'; |
77 | 82 | ||
83 | +# If further filtering is needed | ||
84 | +#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; | ||
85 | + | ||
78 | my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', | 86 | my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', |
79 | '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); | 87 | '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); |
80 | 88 | ||
@@ -83,6 +91,8 @@ my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados | @@ -83,6 +91,8 @@ my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados | ||
83 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); | 91 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); |
84 | %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); | 92 | %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); |
85 | 93 | ||
94 | +# Safeguard in case cpuinfo has not correctly identified the number of CPUs | ||
95 | +$MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; | ||
86 | 96 | ||
87 | $ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; | 97 | $ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; |
88 | my ($host) = split/\./,hostname; | 98 | my ($host) = split/\./,hostname; |
@@ -355,6 +365,18 @@ sub ocr { | @@ -355,6 +365,18 @@ sub ocr { | ||
355 | 365 | ||
356 | foreach my $image (@images) { | 366 | foreach my $image (@images) { |
357 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; | 367 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; |
368 | + | ||
369 | + # Check if image can be safely colour reduced | ||
370 | + if ($CHECK_COLOR) { | ||
371 | + $cmd = "${CONVERT} ${image} \Q(\E -clone 0 -colorspace gray \Q)\E -compose difference -composite -separate -evaluate-sequence mean -threshold 4% -format \"%[fx:mean]\" info:"; | ||
372 | + ($exit, $cmd, @out, @err) = exec_cmd ($cmd); | ||
373 | + if ($DEBUG) { | ||
374 | + print "\t\t\t${image}-> ${cmd}: $exit\n"; | ||
375 | + #print "\t\t\t\t$_" for @out ; | ||
376 | + print "\t\t\t\t$_" for @err ; | ||
377 | + print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; | ||
378 | + } | ||
379 | + } | ||
358 | 380 | ||
359 | # Check if page was rotated | 381 | # Check if page was rotated |
360 | if ($pg_r[$i]) { | 382 | if ($pg_r[$i]) { |
@@ -470,7 +492,7 @@ sub get_pages { | @@ -470,7 +492,7 @@ sub get_pages { | ||
470 | 492 | ||
471 | sub get_imgs { | 493 | sub get_imgs { |
472 | my ($in_file, $page_img, $w, $h, $t) = @_; | 494 | my ($in_file, $page_img, $w, $h, $t) = @_; |
473 | - my ($dumb, $i, $page, $width, $height, $type); | 495 | + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc); |
474 | 496 | ||
475 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); | 497 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); |
476 | 498 | ||
@@ -478,13 +500,13 @@ sub get_imgs { | @@ -478,13 +500,13 @@ sub get_imgs { | ||
478 | chomp $line; | 500 | chomp $line; |
479 | $line =~ s/^ {1,}//; | 501 | $line =~ s/^ {1,}//; |
480 | if ( $line =~ /image|mask/ ) { | 502 | if ( $line =~ /image|mask/ ) { |
481 | - ($page, $i , $dumb, $width, $height, $type) = split / {1,}/,$line; | 503 | + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line; |
482 | @$page_img[$page-1]=$i; | 504 | @$page_img[$page-1]=$i; |
483 | @$w[$page-1] = $width; | 505 | @$w[$page-1] = $width; |
484 | @$h[$page-1] = $height; | 506 | @$h[$page-1] = $height; |
485 | - @$t[$page-1] = ( $type eq "-" ? "rgb" : $type ); | ||
486 | - @$t[$page-1] = ( $type eq "icc" ? "rgb" : $type ); | ||
487 | - @$t[$page-1] = ( $type eq "index" ? "rgb" : $type ); | 507 | + @$t[$page-1] = "rgb"; # Default is color |
508 | + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | ||
509 | + @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); | ||
488 | } | 510 | } |
489 | } | 511 | } |
490 | return $i+1; | 512 | return $i+1; |