#! /usr/bin/perl -w # # OCR Server 1.0.1 - (c) Agencia Nacional de Telecomunicacoes # # This script monitors a set of input directories for PDF files # once a new file is detected, it is processes through tesseract OCR # in order to generate a new file with a hidden searchable text layer # # It may be distributed under the conditions of the LGPL v2.1 license. # # Author: Guilherme Chehab # # Version History: # 0.1 Initial single server version # 0.2 Check if page already has the html hidden layer, if so, ignore it # 0.3 Solved issues about various image enconding types # 0.4 Added a postnormalization step to ensure all output pdf pages have # the same size and orientations as the original files # 0.5 Used input file renaming as a way to sync multiple parallel instances, # that way, it is minimized the risk of same file being OCRed multiple times. # 0.6 Added a default handler for unknown image encoding using jpeg encoding # 0.7 Solved an issue with files with more than 1000 pages # 1.0 First release version # 1.0.1 Solving error when file has no images # # TODO: - Changes get_imgs and OCR processing to enable pages with more than one image -- it # would not work on previous versions that assumed #pages = #imgs. Version 1.1 counts them # diferently but does not treat it adequately # # Check software requirements on the comments bellow # # To configure input dirs change @BASE_DIRS and @SUB_DIRS variables use warnings; use strict; use File::Find::Rule; use File::Basename; use File::Copy; use File::Path qw (remove_tree make_path); use File::Touch; use Fcntl qw( :flock ); use Sys::Syslog; use POSIX; #use POSIX ":sys_wait_h"; use Sys::Hostname; use IPC::Open3; use IO::Select; my $DEBUG = 0; my $MAX_PGS = ($DEBUG==2 ? 1 : `cat /proc/cpuinfo | grep CPU | wc -l`); my $MAX_FILES = ( !$DEBUG ? 2 : 1) ; my $USER = 'ocr'; # Command dependencies # depends on tesseract-ocr an tesseract-ocr-por 3.05-dev or higher my $TESSERACT = '/usr/local/bin/tesseract -l por+eng'; # Depends on pdftk 2.02 or higher my $PDFTK = '/usr/local/bin/pdftk'; # Depends on poppler-utils 0.42.0 or higher #my $PDINFO = '/usr/local/bin/pdfinfo'; my $PDFFONTS = '/usr/local/bin/pdffonts'; my $PDFIMAGES = '/usr/local/bin/pdfimages'; my $PDFTOPPM = '/usr/local/bin/pdftoppm'; # Depends on cpdf 2.1 or higher my $CPDF = '/usr/local/bin/cpdf'; # Depends on ImageMagick and http://www.fmwconcepts.com/imagemagick/downloadcounter.php?scriptname=textcleaner&dirname=textcleaner #my $FILTER = '/usr/local/bin/textcleaner -g -e stretch -f 25 -o 10 -u -s 1 -T -p 10 '; my $CONVERT = '/usr/bin/convert'; my @BASE_DIRS = ( '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS PROTOCOLO/OCR/', '/mnt/protocolo_sede/DIGITALIZAÇÃO/ARQUIVOS_PROCESSOS/OCR/' ); my %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_tmp', 'ERROR' => 'Erro' ); @BASE_DIRS = ( '/tmp/ocr_dev/') if ($DEBUG==2); %SUB_DIRS = ( 'IN'=>'Entrada', 'OUT'=>'Saida', 'PROC'=>'Originais_Processados', 'TEMP'=>'/tmp/ocr_dev/tmp', 'ERROR' => 'Erro' ) if ($DEBUG); $ENV{'PATH'} = '/usr/local/bin:/bin:/usr/bin'; my ($host) = split/\./,hostname; use vars qw/*name *dir *prune/; *name = *File::Find::name; *dir = *File::Find::dir; *prune = *File::Find::prune; sub main; sub get_pages; sub get_rotation; sub get_res; sub is_ocred; sub is_locked_ex; my $expr = 'use POSIX qw(setsid)'; my ($dumb1, $dumb2, $uid) = getpwnam ($USER); setuid ($uid) or warn "Cant set uid $uid"; $SIG{__DIE__} = 'DEFAULT'; $SIG{__WARN__} = \&die_when_called; #$SIG{CHLD} = 'IGNORE'; eval $expr; if ($@) { die "$0: cannot get POSIX::setsid since eval '$expr' failed: $@\n"; } chdir('/') or die "$0: cannot chdir '/': $!\n"; open(STDIN, '/dev/null') or die "$0: cannot open '/dev/null': $!\n"; foreach my $DIR (@BASE_DIRS) { defined(my $pid = fork) or die "$0: cannot fork: $!\n"; if (!$pid) { #$SIG{CHLD} = 'IGNORE'; POSIX::setsid() or die "$0: cannot start a new session: $!\n"; main ($DIR, $DIR.$SUB_DIRS{IN}, $DIR.$SUB_DIRS{OUT}, $DIR.$SUB_DIRS{PROC}, $SUB_DIRS{TEMP}, $DIR.$SUB_DIRS{ERROR}); exit 0; last; } } exit 0; sub main { my ($DIR, $IN, $OUT, $PROC, $TEMP, $ERRO) = @_; my %files_in; my %pids; my $count=0; $0 = 'ocr_dev' if ($DEBUG); # Clean previous executions -- must be rewritten to permit multiple daemons running on same dir # Remove old temp files remove_tree (${TEMP},{ keep_root=>1 , error=> \my $dumb }); # remove .tmp file unlink ( find ( file => name => qr/\.${host}\.tmp$/i , in => ${IN} ) ); # Rename files that were in 'processig' back foreach my $file ( find ( file => name => qr/\.${host}\.processing$/i , in => ${IN} ) ) { my $old_name = $file; $old_name =~ s/\.${host}\.processing$//g; move ($file, $old_name); } # Open log file openlog ("ocr","ndelay,pid","local0") if !$DEBUG; syslog ("info","OCR started, monitoring: ".$DIR) if (!$DEBUG); $SIG {TERM} = $SIG{HUP} = sub { syslog ("info","OCR stopped monitor: ".$DIR) if (!$DEBUG); exit 0; }; # Main loop while ( 1 ) { select (undef, undef, undef, rand 3); # Random sleep so multiple instances dont get synced $files_in {$_} = (!defined $files_in {$_} ? 1 : $files_in {$_}) for ( find ( file => name => qr/\.pdf$/i , in => ${IN} )); print "\nFound ", scalar keys %files_in, " in $IN\n" if $DEBUG && $count != scalar keys %files_in; $count = scalar keys %files_in; foreach my $file (keys %files_in) { next if ( glob ("$file.*.tmp")); select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds next if (!defined $files_in{$file}); # continue only if it is still valid next if ($files_in{$file}==2); # if in file already is in processing if ($files_in {$file} == 1) { # Skip locked files -- should try again on next iteration if ( is_locked_ex ($file)) { delete $files_in {$file}; next; } # Cleanup deleted pids before forking again while (scalar keys %pids >= $MAX_FILES) { my @ended = child_wait (\%pids); foreach my $ended_pid (@ended) { delete $files_in{$pids{$ended_pid}}; delete $pids{$ended_pid}; } }; if (!glob( "$file.*.tmp")) { # Do nothing with files in processing by another process if (my $pid = fork) { while (! glob ("$file.$host.tmp")) {sleep 1;}; # Wait until temp file is created $files_in{$file} = 2; $pids{$pid}=$file; } else { touch ("$file.$host.tmp"); while (! glob ("$file.$host.tmp")) {sleep 1;}; # Wait until temp file is created select (undef, undef, undef, 1 + rand 2); # sleep between 1 and 3 seconds ocr ( $DIR, $IN, $OUT, $PROC, $TEMP, $ERRO, $file); exit (0); # It is never executed } } else { next; } } } # Cleanup periodicaly foreach my $pid (keys (%pids)) { if (waitpid ($pid,WNOHANG)==-1) { delete $files_in{$pids{$pid}}; delete $pids{$pid}; } } #if ($DEBUG==2) { # child_wait (\%pids); # exit; #} } } sub ocr { my ($DIR, $IN, $OUT, $PROC, $TMP, $ERROR, $in_file) = @_; my ($in_name, $in_path, $in_suffix) = fileparse ($in_file); my ($exit, $cmd, @out,@err); my $tmpdir = $TMP .'/'.$in_name.'.' . $$; touch ("$in_file.$host.tmp"); print "Will ocr $in_file\n" if ($DEBUG); $0 = "ocr $in_name" if(!$DEBUG); $SIG {TERM} = $SIG{HUP} = sub { while (wait () != -1) { sleep 1;}; remove_tree ($tmpdir,{ error=> \my $dumb }); unlink ("$in_file.$host.tmp"); move ( "$in_file.$host.processing", $in_file); exit 0; }; my $out_path = $in_path; $out_path =~ s{${IN}}{$OUT}; my $out_file = $out_path.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); my $proc_path = $in_path; $proc_path =~ s{${IN}}{$PROC}; my $proc_file = $proc_path.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); my $error_path = $in_path; $error_path =~ s{${IN}}{$ERROR}; my $error_file = $error_path.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); print "\twritting to $out_file\n" if $DEBUG; my $stime = time; my %pids; if (!move ($in_file, "$in_file.$host.processing")) { unlink ("$in_file.$host.tmp"); exit 0; } sleep 1 if (!$DEBUG); select (undef, undef, undef, 2) if ($DEBUG); # Create temp dir make_path $tmpdir; my $tmp_file = $tmpdir.'/'.$in_name.($in_suffix ne ""? ".".$in_suffix: ""); if (!copy ("$in_file.$host.processing", $tmp_file)) { remove_tree ($tmpdir,{ error=> \my $dumb }); unlink ("$in_file.$host.tmp"); move ( "$in_file.$host.processing", $in_file); }; # Extract pages ($exit, $cmd, @out,@err) = exec_cmd ("${PDFTK} \"${tmp_file}\" burst output \"${tmpdir}\"/pg_\%06d.pdf"); if ($DEBUG) { print "\t\t${tmp_file} -> ${cmd}: $exit\n"; print "\t\t\t$_" for @out ; print "\t\t\t$_" for @err ; }; my ($pages, @pg_w, @pg_h, @pg_r); $pages = get_pages ($tmp_file, \@pg_w, \@pg_h, \@pg_r); my ($imgs,@page_img, @img_w, @img_h, @img_t); $imgs = get_imgs ( $tmp_file, \@page_img, \@img_w, \@img_h, \@img_t); for ( my $i=0; $i< $pages; $i++ ) { my $pg = sprintf ("pg_%06d", $i+1); # Enforce fork limit while (scalar keys %pids >= $MAX_PGS ) { my @ended = child_wait (\%pids); foreach my $ended_pid (@ended) { delete $pids{$ended_pid}; } } if (my $pid=fork) { $pids{$pid}=$pg; } else { $0 = "ocr $in_name (".($i+1)."/$pages)" if(!$DEBUG); if (is_ocred ("${tmpdir}/${pg}.pdf")) { move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); print "\t\t${in_file}: ".(${i}+1)." / $pages: Page already has text layer, ignoring page\n" if $DEBUG; exit 0; } if (! defined $img_t[$i] ) { move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}-cpdf.pdf"); print "\t\t${in_file}: ".(${i}+1)." / $pages: Undefined image type on page, ignoring page\n" if $DEBUG; exit 0; } print "\t\t${in_file}: ".(${i}+1)." / $pages: $pg_w[$i] x $pg_h[$i] - $pg_r[$i] & $img_w[$i] x $img_h[$i], $img_t[$i]\n" if $DEBUG; undef $cmd; if ($img_t[$i] eq "gray") { $cmd = "${PDFIMAGES} -tiff \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; } if ($img_t[$i] eq "rgb") { $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM } if (!defined $cmd) { $cmd = "${PDFTOPPM} -jpeg -scale-to-x $img_w[$i] -scale-to-y $img_h[$i] \"${tmpdir}\"/${pg}.pdf \"${tmpdir}\"/${pg}"; $pg_r[$i] = 0; # Do not rotate if it was extracted with PDFTOPPM } ($exit,$cmd,@out,@err) = exec_cmd($cmd); if ($DEBUG) { print "\t\t\t${pg}.pdf -> ${cmd}: $exit\n"; print "\t\t\t\t$_" for @out ; print "\t\t\t\t$_" for @err ; }; # Process each resulting image for page pdf my @images = ( find ( file => name => qr/${pg}.*\.(jpg|tif)/i , in => ${tmpdir} )) ; foreach my $image (@images) { print "\t\t\t${image}: ".(${i}+1)." / $pages\n" if $DEBUG; # Check if page was rotated if ($pg_r[$i]) { print "\t\t\t${image} unrotate: $pg_r[$i] graus ".(${i}+1)." / $pages\n" if $DEBUG; ($exit,$cmd,@out,@err) = exec_cmd("${CONVERT} \"$image\" -rotate $pg_r[$i] \"$image\""); if ($DEBUG) { print "\t\t\t${image} -> $cmd: $exit\n"; print "\t\t\t\t$_" for @out ; print "\t\t\t\t$_" for @err ; }; } # Filter ppm images, if needed # OCR ppm images to pdf pages ($exit,$cmd, @out,@err) = exec_cmd("${TESSERACT} \"${image}\" \"${image}\" pdf"); if ($DEBUG) { print "\t\t\t${image} -> $cmd: $exit\n"; print "\t\t\t\t$_" for @out ; print "\t\t\t\t$_" for @err ; }; # Scale to fit pdf ($exit,$cmd, @out,@err) = exec_cmd("${CPDF} -scale-to-fit \"$pg_w[$i] $pg_h[$i]\" \"${image}\".pdf -o \"${image}\"-cpdf.pdf"); if ($DEBUG) { print "\t\t\t${image} -> $cmd: $exit\n"; print "\t\t\t\t$_" for @out ; print "\t\t\t\t$_" for @err ; }; unlink ("${tmpdir}/${pg}.pdf") if (!$DEBUG); unlink ("$image.pdf") if (!$DEBUG); move ("${tmpdir}/${pg}.pdf","${tmpdir}/${pg}.pdf.old") if ($DEBUG); unlink ("$image") if (!$DEBUG); } exit 0; } } # Wait all pages to complete while (wait () != -1) { sleep 1;}; # Check if all pages where converted. my @new_pages = ( find ( file => name => qr/pg_.*-cpdf.pdf$/i , in => ${tmpdir} )) ; if (scalar @new_pages != $pages) { print "\t\t${out_file} -> Number of output pages differ (Orig.: $pages x New: ".scalar @new_pages."): $exit\n" if ($DEBUG); syslog ("info","OCR: $in_file, number of output pages differ") if (!$DEBUG); unlink "$in_file.$host.tmp"; move ("$in_file.$host.processing", $error_file); exit (0); } # Merge resulting pdf pages to a single pdf make_path ($out_path) if ( ! -d $out_path); unlink $out_file if ( -f $out_file ); ($exit, $cmd, @out,@err) = exec_cmd("${PDFTK} \"${tmpdir}\"/pg_*-cpdf.pdf cat output \"${out_file}.tmp\" compress"); if ($DEBUG) { print "\t\t${out_file} -> $cmd: $exit\n"; print "\t\t\t$_" for @out ; print "\t\t\t$_" for @err ; }; make_path ($proc_path) if ( ! -d $proc_path); unlink $proc_file if ( -f $proc_file ); move ("$in_file.$host.processing", $proc_file); move ("${out_file}.tmp", ${out_file}); # Remove temp dir remove_tree ($tmpdir,{ error=> \my $dumb }) if (!$DEBUG); unlink $tmp_file if (!$DEBUG); unlink "$in_file.$host.tmp"; unlink "$in_file.png"; my $etime = time; print "OCR processed: $in_file OCRed (${pages} pages in ".($etime-$stime)." segs - ". sprintf ("%.2f",($etime-$stime)/$pages)." segs/page)\n" if $DEBUG; syslog ("info","OCR processed: $in_file(${pages} pages in ".($etime-$stime)." segs - ". sprintf ("%.2f",($etime-$stime)/$pages)." segs/page)") if !$DEBUG; exit (0); } sub is_ocred { my ($in_file) = @_; my @fonts = `${PDFFONTS} -l 10 \"${in_file}\" 2>/dev/null`; return ( scalar @fonts > 2 ? 1 :0 ); } sub get_pages { my ($in_file, $w, $h, $r) = @_; my $pages=0; my $i=0; my $dumb; my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFTK} \"${in_file}\" dump_data"); foreach (@lines) { chomp; ($dumb, $pages) = split / {1,}/ if ( $_ =~ /NumberOfPages:/ ); ($dumb, $i ) = split / {1,}/ if ( $_ =~ /PageMediaNumber:/ ); ($dumb, @$r[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaRotation:/ ); ($dumb, @$w[$i-1], @$h[$i-1]) = split / {1,}/ if ( $_ =~ /PageMediaDimensions:/ ); } return $pages; } sub get_imgs { my ($in_file, $page_img, $w, $h, $t) = @_; my ($dumb, $i, $page, $width, $height, $type); my ($exit, $cmd, @lines, @err) = exec_cmd("${PDFIMAGES} -list \"${in_file}\""); foreach my $line (@lines) { chomp $line; $line =~ s/^ {1,}//; if ( $line =~ /image|mask/ ) { ($page, $i , $dumb, $width, $height, $type) = split / {1,}/,$line; @$page_img[$page-1]=$i; @$w[$page-1] = $width; @$h[$page-1] = $height; @$t[$page-1] = ( $type eq "-" ? "rgb" : $type ); @$t[$page-1] = ( $type eq "icc" ? "rgb" : $type ); @$t[$page-1] = ( $type eq "index" ? "rgb" : $type ); } } return $i+1; } sub get_rotation { my ($in_file) = @_; my $rotation=0; my @lines = `${PDFTK} \"${in_file}\" dump_data 2>/dev/null`; foreach (@lines) { chomp; (my $dumb, $rotation) = split / / if ( $_ =~ /PageMediaRotation:/ ); } return $rotation; } sub get_res { my ($in_file) = @_; my $res_x=0; my $res_y=0; my @lines = `${PDFTK} \"${in_file}\" dump_data 2>/dev/null`; foreach (@lines) { chomp; (my $dumb, $res_x, $res_y) = split / / if ( $_ =~ /PageMediaDimensions:/ ); $res_x=sprintf ("%.f", $res_x); $res_y=sprintf ("%.f", $res_y); } return ($res_x,$res_y); } sub is_locked_ex { my ($path) = @_; warn "Not a plain file: '$path'" if ( $DEBUG & (! -f $path )) ; return 1 unless open my $fh, '<', $path; my $ret = not flock $fh, LOCK_SH | LOCK_NB; close $fh or warn "Cannot close '$path': $!"; return $ret; } sub child_wait { my ($pids,$time) = @_; my $count = scalar keys (%$pids); my @ended; my $found=0; while (!$found) { foreach my $pid (keys (%$pids)) { if (waitpid ($pid,WNOHANG)==-1) { $found++; push @ended, $pid; next; } } sleep 1 if (!$found); } return @ended; } sub exec_cmd { my ($cmd) = @_; my $rc; my @out=(); my @err=(); my $pid = open3(*CMD_IN, *CMD_OUT, *CMD_ERR, $cmd); $SIG{CHLD} = sub { $rc = ($? >>8); }; print CMD_IN ""; close(CMD_IN); my $selector = IO::Select->new(); $selector->add(*CMD_ERR, *CMD_OUT); while (my @ready = $selector->can_read) { foreach my $fh (@ready) { if (fileno($fh) == fileno(CMD_ERR)) { while () { push @err,$_; } } else { while () { push @out,$_; } } if (eof ($fh)) { $rc = ($? >>8) if (waitpid($pid, 0)>0); $selector->remove($fh); } } } $rc = ($? >>8) if (waitpid($pid, 0)>0); close(CMD_OUT); close(CMD_ERR); $SIG{CHLD} = 'IGNORE'; return $rc,$cmd,@out,@err; }