# For licensing info read LICENSE file in the TWiki root.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details, published at
# http://www.gnu.org/copyleft/gpl.html

package TWiki::Contrib::SearchEngineKinoSearchAddOn::StringifyPlugins::Tika;

use strict;
use warnings;

use base 'TWiki::Contrib::SearchEngineKinoSearchAddOn::StringifyBase';
use TWiki::Contrib::SearchEngineKinoSearchAddOn::Stringifier;
use File::Temp qw/tmpnam/;

# use full pathname if tika2txt not in cron's path
# Only if program exists, I register myself for various file types.
# NOTE(review): the existence check looks up the bare name "tika2txt",
# while stringForFile below runs /usr/local/bin/tika2txt -- confirm the
# two always refer to the same program on the target hosts.
if (__PACKAGE__->_programExists("tika2txt")) {
    # (MIME type, file extension) pairs, registered in this order.
    my @handlers = (
        [ "application/word",  ".doc"  ],
        [ "text/docx",         ".docx" ],
        [ "text/html",         ".html" ],
        [ "application/pdf",   ".pdf"  ],
        [ "text/ppt",          ".ppt"  ],
        [ "text/pptx",         ".pptx" ],
        [ "application/excel", ".xls"  ],
        [ "application/vnd.openxmlformats-officedocument.spreadsheet.sheet", ".xlsx" ],
    );
    __PACKAGE__->register_handler(@$_) for @handlers;
}

# Wraps a string in single quotes for /bin/sh, escaping embedded single
# quotes, so the value always reaches the command as exactly one word.
# Kept as a lexical code ref so no extra method is added to this class.
my $shell_quote = sub {
    my ($arg) = @_;
    $arg =~ s/'/'\\''/g;
    return "'$arg'";
};

# stringForFile($filename) -> string
#
# Runs /usr/local/bin/tika2txt on $filename, with its output redirected
# to a temporary file, and returns that output with its lines joined by
# a single space.
#
# Returns "" when no filename is given, when the command exits with a
# non-zero status, or when its output cannot be read. The temporary
# file is removed on every path out of this sub.
sub stringForFile {
    my ($self, $filename) = @_;
    return "" unless defined $filename && length $filename;

    # File::Temp->new creates the file itself (tmpnam() only returned a
    # name) and removes it when $tmp goes out of scope.
    my $tmp      = File::Temp->new(UNLINK => 1);
    my $tmp_file = $tmp->filename;

    # use full path - cron may not have /usr/local/bin in path
    # Both paths are shell-quoted: the filename may contain quotes or
    # other shell metacharacters.
    my $cmd = sprintf "/usr/local/bin/tika2txt %s > %s 2>/dev/null",
        $shell_quote->($filename), $shell_quote->($tmp_file);
    return "" unless system($cmd) == 0 && -f $tmp_file;

    # use the text stringifier to rework encoding (from utf to iso-8859-15)
    # tika2txt handles this, so don't repeat
    # my $text = TWiki::Contrib::SearchEngineKinoSearchAddOn::Stringifier->stringFor($tmp_file);
    open(my $in, '<', $tmp_file) or return "";
    my $text = join(" ", <$in>);
    close($in);

    # remove all numbers
    # (A commented-out substitution followed here. Its pattern is
    # truncated in the source as received: `$text =~ s/(?`.)

    return $text;
}

# FIXME(review): the source this file was recovered from is damaged at
# this point. Everything between the truncated substitution noted in
# stringForFile and the fragment below is missing, including the header of
# the routine the fragment belongs to. The surviving text is kept verbatim
# and commented out; restore the routine from version control if it is
# still needed.
#
#   ... '127.0.0.1', PeerPort => '20000', Proto => 'tcp')
#       or die "Couldn't connect to Tika server\n";
#   $data = <$in>;
#   print $socket $data;
#   $text = <$socket>;
#   close($socket);
#   return $text;
#   }

1;