#!/usr/bin/perl -w
#
# Copyright (C) 2004 Joan M Vigo, imnaoj@yahoo.es
#
# For licensing info read license.txt file in the TWiki root.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details, published at 
# http://www.gnu.org/copyleft/gpl.html

# Set library paths in @INC, at compile time
BEGIN { unshift @INC, '/home/httpd/twiki/bin'; require 'setlib.cfg'; }

use TWiki;
use Plucene::QueryParser;
use Plucene::Analysis::SimpleAnalyzer;
use Plucene::Search::HitCollector;
use Plucene::Search::IndexSearcher;
use Plucene::Document;
use Plucene::Document::Field;
use Plucene::Index::Writer;
use Plucene::Analysis::SimpleAnalyzer;
use Plucene::Document::DateSerializer;
use Plucene::SearchEngine::Index;
use Plucene::SearchEngine::Index::File;

# Progress output is on unless the first command-line argument is -q.
my $debug = ! ( @ARGV && $ARGV[0] eq "-q" );

# Turn on autoflush for the currently selected output handle.
$|=1;

### Webs and attachments listed in dontindex.cfg are left out of the index.
### Recognised line forms:
###   web:<WebName>
###   attachment:<text compared against "Web:Topic:filename">
my $config = "dontindex.cfg";
my @skipwebs = ();
my @skiptopics = ();        # declared but never filled or read in this script
my @skipattachments = ();

# A missing or unreadable config file is reported but is not fatal: in that
# case nothing is skipped. The file is only read when the open succeeded.
if ( open( my $config_fh, '<', $config ) ) {
    while ( my $line = <$config_fh> ) {
        chomp $line;
        if ( $line =~ /^web:([^:]*)/ ) {
            # Only the text up to the next ':' is taken as the web name;
            # a "web:" line without a name is ignored.
            push @skipwebs, $1 if length $1;
        }
        elsif ( $line =~ /^attachment:(.*)/ ) {
            push @skipattachments, $1;
        }
    }
    close( $config_fh );
}
else {
    print "Error: Unable to open the config file : $config: $!\n";
}

## Log file: "update" . <YYYYMM> . <day of month> . ".log", created in the
## current directory. The day of month is appended without zero padding.
my ( $sec, $min, $hour, $mday, $mon, $year ) = localtime( time() );
$year = sprintf( "%.4u", $year + 1900 );
my $yearmonth = sprintf( "%.4u%.2u", $year, $mon+1 );
my $logfile = "update".$yearmonth.$mday.".log";

# First empty any log already written under the same name, then reopen the
# file in append mode. LOGFILE stays a bareword handle because the subs
# below print to it by that name. A failure is reported but not fatal.
if ( open( LOGFILE, '>', $logfile ) ) {
    close(LOGFILE);
}
else {
    print "ERROR: Unable to open Logfile : $logfile to write: $!\n";
}
open( LOGFILE, '>>', $logfile )
    or print "ERROR: Unable to open Logfile : $logfile to write: $!\n";

print LOGFILE "The index updating started at ".get_timestamp()."\n";


main();

print LOGFILE "The index updating finished at ".get_timestamp()."\n";
close (LOGFILE);
# Find the topics changed since the previous run and refresh their index
# entries.
#
# For each entry of the TWiki data directory that is not listed in @skipwebs,
# is a directory, passes TWiki::isWebName and exists as a web:
#   - the web's .changes file is walked from its last line to its first;
#   - every topic changed after the time stored in the web's .plucupdate
#     file is queued once as [ $web, $topic ];
#   - .plucupdate is rewritten with the change time of the newest entry.
# The queued topics are then passed to removeTopics() and addTopics().
#
# Takes no arguments. Sets the package variable $idxpath, which
# removeTopics() and addTopics() read. Dies if the data directory cannot
# be opened.
sub main
{
    # The &-call form used by the original script is kept for all TWiki subs.
    &TWiki::basicInitialize();
    &TWiki::Prefs::initializePrefs( "TWiki" );

    $debug && print "Plucene index files update\n";
    $debug && print "- to suppress all normal output: plucupdate -q\n";

    my @topicsToUpdate = ();

    my $dataDir = &TWiki::getDataDir();
    # Deliberately NOT lexical: the other subs in this file use $idxpath.
    $idxpath = &TWiki::Prefs::getPreferencesValue( "PLUCENEINDEXPATH" );

    opendir( DIR, "$dataDir" ) or die "could not open $dataDir";
    my @weblist = grep !/^\.\.?$/, readdir DIR;
    closedir DIR;

    WEB: foreach my $webName ( @weblist ) {

        foreach my $skipweb (@skipwebs) {
            if ($webName eq $skipweb) {
                print "Skiping the web : $webName\n";
                # Write the log entry before leaving this iteration; placed
                # after "next" it would never be executed.
                print LOGFILE "Index updating skipped web : $webName\n";
                next WEB;
            }
        }

        # Only process directories that TWiki accepts as web names
        # (original note: "normal names, i.e. not starting with '_'").
        if( -d "$dataDir/$webName" && &TWiki::isWebName($webName) ) {

            $debug && print "Checking $webName\n";

            if( &TWiki::Store::webExists( $webName ) ) {

                my $changes = &TWiki::Store::readFile( "$dataDir/$webName/.changes" );

                # do not process the same topic twice
                my %exclude;
                # Change time recorded by the previous run; "0" when absent.
                my $prevLastmodify = &TWiki::Store::readFile( "$dataDir/$webName/.plucupdate" ) || "0";
                my $currLastmodify = "";

                foreach my $changeLine ( reverse split( /\n/, $changes ) ) {
                    # Parse lines from .changes:
                    # <topic>	<user>		<change time>	<revision>
                    # WebHome	FredBloggs	1014591347	21

                    my ($topicName, $userName, $changeTime, $revision) = split( /\t/, $changeLine );

                    if( ( ! %exclude ) || ( ! $exclude{ $topicName } ) ) {

                        if( ! $currLastmodify ) {
                            # newest entry
                            my $time = &TWiki::formatTime( $prevLastmodify );
                            $currLastmodify = $changeTime;
                            if( $prevLastmodify eq $changeTime ) {
                                # newest entry is same as at time of previous run
                                $debug && print "- Note: No topics changed since $time\n";
                                last;
                            }
                            $debug && print "- Changed topics since $time:\n";
                        }

                        if( $prevLastmodify >= $changeTime ) {
                            # reached an entry already handled by the previous run
                            last;
                        }

                        $exclude{ $topicName } = "1";
                        $debug && print "   * $topicName\n";

                        push( @topicsToUpdate, [ $webName, $topicName ] );

                    }
                }

                &TWiki::Store::saveFile( "$dataDir/$webName/.plucupdate", $currLastmodify );
                $debug && print "$webName .plucupdate saved\n";

            } else {
                print STDERR "* ERROR: Plucene index files update does not find web $webName\n";
            }
        }
    }

    if (@topicsToUpdate > 0) {
        # Drop the stale documents first, then index the current versions.
        removeTopics(@topicsToUpdate);
        addTopics(@topicsToUpdate);
    } else {
        $debug && print "No topic to reindex\n";
    }

    $debug && print "End Plucene index files update\n";
}

# Delete from the Plucene index every document matching the given topics.
#
# Parameters: a list of array references, each holding [ $web, $topic ].
# Reads the package variable $idxpath (set in main) as the index location.
#
# All topics are combined into a single query so that the index is searched
# only once; the original author notes that individual searches resulted in
# a "too many files open" error.
sub removeTopics
{
    my( @topicsList ) = @_;

    $debug && print "Removing old topics\n";

    # With no topics there is no condition to build, so skip the search
    # instead of handing an empty query string to the parser.
    unless ( @topicsList ) {
        $debug && print "Removing of old topics finished\n";
        return;
    }

    my $parser = Plucene::QueryParser->new({
               analyzer => Plucene::Analysis::SimpleAnalyzer->new(),
               default  => "text" # Default field for non-specified queries
    });

    my $searcher = Plucene::Search::IndexSearcher->new($idxpath);

    # One "(web:W and topic:T)" clause per topic, joined with " or ".
    my $condition = join( " or ",
        map { my ($web, $topic) = @$_; "(web:$web and topic:$topic)" } @topicsList );

    # just one search to retrieve all the docs to be removed from index
    my $query = $parser->parse("$condition");
    my @docs;
    my $hc = Plucene::Search::HitCollector->new(collect => sub {
           my ($self, $doc, $score)= @_;
           push @docs, $doc;
    });

    $searcher->search_hc($query, $hc);
    foreach my $doc (@docs) {
       $searcher->reader->delete($doc);
    }
    # Release the search objects before closing the reader, as the
    # original code did.
    undef $query; undef @docs; undef $hc;

    $searcher->reader->close;
    undef $searcher;

    $debug && print "Removing of old topics finished\n";

}

# Index the given topics and their indexable attachments.
#
# Parameters: a list of array references, each holding [ $web, $topic ].
# Reads the package variable $idxpath (set in main) as the index location,
# and @skipattachments for "Web:Topic:filename" entries to leave out.
# Returns 1.
#
# Declared without a prototype: the sub takes a list, which an empty ()
# prototype would contradict.
#
# Steps:
#   1. one document per topic (web, topic name, topic info, form fields,
#      text) is added through a Plucene::Index::Writer;
#   2. attachments whose name tail appears in the extension list are
#      examined and indexed through Plucene::SearchEngine::Index;
#   3. the index is optimized.
sub addTopics
{
    my( @topicsList ) = @_;

    $debug && print "Indexing new topics\n";

    my @attachmentList = ();

    my $analyser = Plucene::Analysis::SimpleAnalyzer->new();
    my $writer = Plucene::Index::Writer->new($idxpath, $analyser, 0);

    # get attachments extension list
    &TWiki::Prefs::initializePrefs( "TWiki" );
    #my $extensions = &TWiki::Prefs::getPreferencesValue( "PLUCENEINDEXEXTENSIONS" ) || ".pdf,.html,.txt,.doc,.xls,.ppt,";
    my $extensions = ".pdf,.html,.txt,.doc,.xls,.ppt,";
    $debug && print "Attachment extensions to be indexed: $extensions\n";

    foreach my $topicsDefP (@topicsList) {
        my ($web,$topic) = @$topicsDefP;
        my ($meta, $text) = &TWiki::Store::readTopic($web, $topic, 1);

        # The "topic" field holds the raw topic name followed by its
        # space-separated form.
        my $topicname = $topic . " " . SplitTheTopicName($topic);

        my $doc = Plucene::Document->new;
        $doc->add(Plucene::Document::Field->Text("web",$web));
        $doc->add(Plucene::Document::Field->Text("topic",$topicname));

        my %topicinfo = ();
        my %form;
        my @fields;
        my @attachments;
        if( $meta ) {
            %topicinfo = $meta->findOne( "TOPICINFO" );
            %form = $meta->findOne( "FORM" );
            if ( %form ) {
                @fields = $meta->find( "FIELD" );
            }
            @attachments = $meta->find( "FILEATTACHMENT" );
        }

        my( $date, $author, $rev );
        if( %topicinfo ) {
            # Stored as meta data in topic for faster access
            $date = TWiki::formatTime( $topicinfo{"date"} ); # FIXME deal with changeToIsoDate
            $author = $topicinfo{"author"};
            # Take the part after "1." as the revision. When the version
            # does not have that shape, use it unchanged instead of reading
            # a capture left over from an earlier match.
            my $version = defined $topicinfo{"version"} ? $topicinfo{"version"} : "";
            $rev = ( $version =~ /1\.(.*)/ ) ? $1 : $version;
            $doc->add(Plucene::Document::Field->Text("author","$author"));
            $doc->add(Plucene::Document::Field->UnIndexed("version","$rev"));
            $doc->add(Plucene::Document::Field->UnIndexed("date","$date"));
        }

        if ( %form ) {
            $doc->add(Plucene::Document::Field->UnStored("form","$form{'name'}"));
            foreach my $field (@fields) {
                my $name = $field->{"name"};
                my $value = $field->{"value"};
                $doc->add(Plucene::Document::Field->UnStored("$name","$value"));
            }
        }

        foreach my $attachment (@attachments) {
            my $name = $attachment->{'name'};
            # Last (up to) four characters of the attachment name.
            my $extension = substr($name, length($name)-4, 4);
            # Literal substring test against the extension list. The name
            # tail is not used as a regex, so '.' is not a wildcard and
            # metacharacters in a file name cannot break the match.
            if ( defined $extension && length $extension
                 && index( $extensions, $extension ) >= 0 ) {
                $author = $attachment->{'user'};
                $rev = $attachment->{'version'};
                $date = TWiki::formatTime( $attachment->{'date'} ); # FIXME deal with changeToIsoDate
                push( @attachmentList, [ $web, $topic, $name, $author, $rev, $date ] );
            }
        }

        $doc->add(Plucene::Document::Field->UnStored("text",$text));
        $writer->add_document($doc);
    }

    # Release the writer before the attachment indexer works on the index.
    undef $writer;

    my $pubpath = TWiki::getPubDir();
    my $indexer = Plucene::SearchEngine::Index->new(
          dir => "$idxpath"
      );

    ATTACHMENT: foreach my $attachDefP (@attachmentList) {
        my( $web, $topic, $name, $author, $rev, $date ) = @$attachDefP;

        print "Indexing attachment Web: $web  Topic: $topic  Name : $name \n";

        # Attachments listed in the skip configuration are logged and left out.
        my $attachment_string = $web.":".$topic.":".$name;
        foreach my $attach (@skipattachments) {
            if ($attachment_string eq $attach) {
                print LOGFILE "Skiping the attachment $attach\n";
                next ATTACHMENT;
            }
        }
        print LOGFILE "Updating index of attachment : $web:$topic:$name\n";

        my @documents = map { $_->document }
            Plucene::SearchEngine::Index::File->examine("$pubpath/$web/$topic/$name");
        foreach my $doc (@documents) {
            $doc->add(Plucene::Document::Field->Text("web",$web));
            $doc->add(Plucene::Document::Field->Text("topic",$topic));
            $doc->add(Plucene::Document::Field->Keyword("attachment","yes"));
            $doc->add(Plucene::Document::Field->Text("name",$name));
            $doc->add(Plucene::Document::Field->Text("author",$author));
            $doc->add(Plucene::Document::Field->UnIndexed("version",$rev));
            $doc->add(Plucene::Document::Field->UnIndexed("date","$date"));
        }
        $indexer->index($_) for @documents;
    }
    undef $indexer;

    # Reopen a writer only to optimize the updated index.
    $writer = Plucene::Index::Writer->new($idxpath, $analyser, 0);
    $writer->optimize;
    undef $writer;

    $debug && print "Indexing of new topics finished\n";

    return 1;
}

## Collapse a run of the same character at the start of a string into a
## single occurrence, e.g. "TTTTheTopic" becomes "TheTopic".
##
## Parameter: the string to process.
## Returns:   the processed string; an empty or undefined argument is
##            returned unchanged.
sub TripFirstchar {
    my $string = shift;

    # Nothing to collapse; also avoids building a pattern from undef.
    return $string unless defined $string && length $string;

    my $firstchar = substr( $string, 0, 1 );

    # \Q...\E makes the character literal, so strings starting with a regex
    # metacharacter ('.', '(', '*', ...) are handled like any other.
    $string =~ s/^\Q$firstchar\E+/$firstchar/;
    return $string;
}



## Split a topic name into words, e.g. "TheTopic" returns "The Topic".
## A blank is inserted in front of every capital letter (A-Z) that directly
## follows a character which is not a capital letter; runs of capitals stay
## together.
##
## Parameter: the string to split.
## Returns:   the string with the blanks inserted.
sub SplitTopicName {
    my ($word) = @_;

    my @pieces;
    my $after_non_capital = 0;    # true once the previous char was not A-Z

    for my $char ( split //, $word ) {
        if ( $char =~ /[A-Z]/ ) {
            push @pieces, " " if $after_non_capital;
            $after_non_capital = 0;
        }
        else {
            $after_non_capital = 1;
        }
        push @pieces, $char;
    }

    return join( "", @pieces );
}

## Turn a topic name into a blank-separated list of words, e.g.
## "TheTopic--NNNName" returns "The Topic Name " (every word, including the
## last one, is followed by one blank).
##
## Parameter: the topic name.
## Returns:   the word list as a single string.
sub SplitTheTopicName {
    my ($topic) = @_;

    # Anything that is not an ASCII letter or digit becomes a blank.
    my $cleaned = join( "",
        map { /[A-Z]|[a-z]|[0-9]/ ? $_ : " " } split( //, $topic ) );

    # Squeeze runs of blanks down to one.
    $cleaned =~ s/\s+/ /g;

    # First pass: break each chunk at its lower-to-upper case boundaries.
    my $spaced = join( "",
        map { SplitTopicName($_) . " " } split( / /, $cleaned ) );

    # Second pass: collapse a repeated first character in every word.
    return join( "",
        map { TripFirstchar($_) . " " } split( / /, $spaced ) );
}


### Build the timestamp used for the entries written to the log file.
### Returns the current local time as "DD-MM-YYYY hh:mm:ss".
sub get_timestamp {
    my @now = localtime(time);
    my ( $sec, $min, $hr, $day ) = @now[ 0 .. 3 ];

    # localtime counts months from 0 and years from 1900.
    my $month = $now[4] + 1;
    my $year  = $now[5] + 1900;

    return sprintf("%02s-%02s-%04s %02s:%02s:%02s", $day, $month, $year, $hr, $min, $sec);
}




# EOF
