Commit 740cf3943a002c48823f498675e6b2927e4b90b3
1 parent
7fe23df8
Exists in
master
Correções afetas e melhorias.
Showing
2 changed files
with
41 additions
and
13 deletions
Show diff stats
INSTALL.txt
... | ... | @@ -20,10 +20,16 @@ |
20 | 20 | # 0.7 Solved an issue with files with more than 1000 pages |
21 | 21 | # 1.0 First release version |
22 | 22 | # 1.0.1 Solving error when file has no images |
23 | -# | |
24 | -# TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it | |
25 | -# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | |
26 | -# diferently but does not treat it adequately | |
23 | +# 1.0.2 Fix bug when counting cores for AMD processors | |
24 | +# 1.0.3 Added better image type detection | |
25 | +# | |
26 | +# TODO: - Change get_imgs and OCR processing to support pages with more than one image -- it | |
27 | +# would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them | |
28 | +# differently but does not handle it adequately | |
29 | +# - Review poppler and cpdf install instructions | |
30 | +# - Add better handling of vectorized and non scanned pdf files | |
31 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | |
32 | +# - Check mean saturation for additional detection of colored images and automatically convert to B&W if possible | |
27 | 33 | # |
28 | 34 | # Check software requirements in the comments below |
29 | 35 | # | ... | ... |
usr/local/bin/ocr
... | ... | @@ -22,10 +22,16 @@ |
22 | 22 | # 0.7 Solved an issue with files with more than 1000 pages |
23 | 23 | # 1.0 First release version |
24 | 24 | # 1.0.1 Solving error when file has no images |
25 | +# 1.0.2 Fix bug when counting cores for AMD processors | |
26 | +# 1.0.3 Added better image type detection | |
25 | 27 | # |
26 | 28 | # TODO: - Change get_imgs and OCR processing to support pages with more than one image -- it |
27 | 29 | # would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them |
28 | 30 | # differently but does not handle it adequately |
31 | +# - Review poppler and cpdf install instructions | |
32 | +# - Add better handling of vectorized and non scanned pdf files | |
33 | +# - Add option to generate multi-page tiff files to reduce overhead (one for each CPU core) | |
34 | +# - Check mean saturation for additional detection of colored images and automatically convert to B&W if possible | |
29 | 35 | # |
30 | 36 | # Check software requirements in the comments below |
31 | 37 | # |
... | ... | @@ -48,10 +54,11 @@ use IPC::Open3; |
48 | 54 | use IO::Select; |
49 | 55 | |
50 | 56 | my $DEBUG = 0; |
51 | -my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep CPU | wc -l`); | |
57 | +my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep -e '^processor' | wc -l`); | |
52 | 58 | my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; |
53 | 59 | |
54 | 60 | my $USER = 'ocr'; |
61 | +my $CHECK_COLOR = 0; # Whether to check if an image is really colored or if it can be converted to gray scale or B&W | |
55 | 62 | |
56 | 63 | # Command dependencies |
57 | 64 | |
... | ... | @@ -70,11 +77,12 @@ my $PDFTOPPM = '/usr/local/bin/pdftoppm'; |
70 | 77 | # Depends on cpdf 2.1 or higher |
71 | 78 | my $CPDF = '/usr/local/bin/cpdf'; |
72 | 79 | |
73 | -# Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner | |
74 | -#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; | |
75 | - | |
80 | +## Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner | |
76 | 81 | my $CONVERT = '/usr/bin/convert'; |
77 | 82 | |
83 | +# If further filtering is needed | |
84 | +#my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; | |
85 | + | |
78 | 86 | my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', |
79 | 87 | '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); |
80 | 88 | |
... | ... | @@ -83,6 +91,8 @@ my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados |
83 | 91 | @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); |
84 | 92 | %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); |
85 | 93 | |
94 | +# Safeguard in case cpuinfo did not correctly identify the number of CPUs | |
95 | +$MAX_PGS = ($MAX_PGS==0) ? 4 : $MAX_PGS; | |
86 | 96 | |
87 | 97 | $ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; |
88 | 98 | my ($host) = split/\./,hostname; |
... | ... | @@ -355,6 +365,18 @@ sub ocr { |
355 | 365 | |
356 | 366 | foreach my $image (@images) { |
357 | 367 | print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; |
368 | + | |
369 | + # Check if image can be safely colour reduced | |
370 | + if ($CHECK_COLOR) { | |
371 | + $cmd = "${CONVERT} ${image} \Q(\E -clone 0 -colorspace gray \Q)\E -compose difference -composite -separate -evaluate-sequence mean -threshold 4% -format \"%[fx:mean]\" info:"; | |
372 | + ($exit, $cmd, @out, @err) = exec_cmd ($cmd); | |
373 | + if ($DEBUG) { | |
374 | + print "\t\t\t${image}-> ${cmd}: $exit\n"; | |
375 | + #print "\t\t\t\t$_" for @out ; | |
376 | + print "\t\t\t\t$_" for @err ; | |
377 | + print "\t\t\t\tImage is probably " . ( $out[0]<0.1 ? "B&W" : "Colored") . " (mean saturation: $out[0])\n"; | |
378 | + } | |
379 | + } | |
358 | 380 | |
359 | 381 | # Check if page was rotated |
360 | 382 | if ($pg_r[$i]) { |
... | ... | @@ -470,7 +492,7 @@ sub get_pages { |
470 | 492 | |
471 | 493 | sub get_imgs { |
472 | 494 | my ($in_file, $page_img, $w, $h, $t) = @_; |
473 | - my ($dumb, $i, $page, $width, $height, $type); | |
495 | + my ($dumb, $i, $page, $type, $width, $height, $color, $comp, $bpc, $enc); | |
474 | 496 | |
475 | 497 | my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); |
476 | 498 | |
... | ... | @@ -478,13 +500,13 @@ sub get_imgs { |
478 | 500 | chomp $line; |
479 | 501 | $line =~ s/^ {1,}//; |
480 | 502 | if ( $line =~ /image|mask/ ) { |
481 | - ($page, $i , $dumb, $width, $height, $type) = split / {1,}/,$line; | |
503 | + ($page, $i , $type, $width, $height, $color, $comp, $bpc, $enc) = split / {1,}/,$line; | |
482 | 504 | @$page_img[$page-1]=$i; |
483 | 505 | @$w[$page-1] = $width; |
484 | 506 | @$h[$page-1] = $height; |
485 | - @$t[$page-1] = ( $type eq "-" ? "rgb" : $type ); | |
486 | - @$t[$page-1] = ( $type eq "icc" ? "rgb" : $type ); | |
487 | - @$t[$page-1] = ( $type eq "index" ? "rgb" : $type ); | |
507 | + @$t[$page-1] = "rgb"; # Default is color | |
508 | + @$t[$page-1] = ( $comp == 3 || $bpc > 1 || $enc eq "jpeg" || $color eq "-" || $color eq "icc" ? "rgb" : @$t[$page-1]); | |
509 | + @$t[$page-1] = ( $comp == 1 || $bpc == 1 || $enc eq "ccitt"|| $color eq "gray" || $type eq "mask" ? "gray" : @$t[$page-1]); | |
488 | 510 | } |
489 | 511 | } |
490 | 512 | return $i+1; | ... | ... |