#!/usr/bin/perl -w
#
# Copyright (C) 2004 Joan M Vigo, imnaoj@yahoo.es
# Copyright (C) 2005,2006 Sopan Shewale, Sopan_Shewale@persistent.co.in
#                         Joan M Vigo, imnaoj@yahoo.es
#
# For licensing info read LICENSE file in the TWiki root.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details, published at 
# http://www.gnu.org/copyleft/gpl.html

# Set library paths in @INC, at compile time
BEGIN { unshift @INC, '.'; require '../../bin/setlib.cfg' }

use TWiki;

use Time::Local;

use Plucene::QueryParser;
use Plucene::Analysis::SimpleAnalyzer;
use Plucene::Search::HitCollector;
use Plucene::Search::IndexSearcher;
use Plucene::Document;
use Plucene::Document::Field;
use Plucene::Index::Writer;
use Plucene::Analysis::SimpleAnalyzer;
use Plucene::Document::DateSerializer;
use Plucene::SearchEngine::Index;
use Plucene::SearchEngine::Index::File;

# Progress output on STDOUT is enabled unless the first argument is "-q"
my $debug = ! ( @ARGV && $ARGV[0] eq "-q" );

# Log stuff: opening the log file (its name carries the current server date)
my $time = TWiki::Func::formatTime( time(), '$year$mo$day', 'servertime');
my $logfile = "../logs/update-".$time.".log";

# Append mode creates the file when it does not exist yet, so a single open
# covers both the "new file" and the "existing file" case.  The three-argument
# form keeps the file name out of the mode string.  LOGFILE stays a bareword
# handle because the subroutines below print to it by that name.
# A failure is reported (including the system error) but is not fatal.
eval { open (LOGFILE, '>>', $logfile) or die "Unable to open Logfile : $logfile to write: $!"; };
if ($@) { print "ERROR: $@\n"; }
# End of opening the LOG File ...

# The main subroutine
# (the & call form is kept on purpose: main is defined further down in the
# file and this form is not affected by any prototype declared on it)
&main();

# The job is done here.
#
# Collects, for every web that is not in PLUCENEINDEXSKIPWEBS, the topics that
# changed since the previous run (newest entries of the store's 'changes' meta
# data, compared against the time stamp kept in the 'plucupdate' meta data) and
# hands the resulting list to removeTopics() and addTopics().
# Takes no arguments.  Progress is printed unless -q was given and one row per
# step is written to LOGFILE.
sub main
{
  # to call TWiki::Func subroutines, we need this
  $TWiki::Plugins::SESSION = TWiki->new();
  
  $debug && print "Plucene index files update\n";
  $debug && print "- to suppress all normal output: plucupdate -q\n";
  my $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' ); 
  print LOGFILE  "| $logtime | Index update started | | |\n";

  # where the index data is located
  my $idxpath = TWiki::Func::getPreferencesValue( "PLUCENEINDEXPATH" );

  # get the list of webs not to be indexed
  # NOTE if this preference is changed, you should consider to create your index again!!
  my $to_skip = TWiki::Func::getPreferencesValue( "PLUCENEINDEXSKIPWEBS" );
  # the preference may be unset; items may be separated by "," with or without blanks
  $to_skip = "" unless defined $to_skip;
  $to_skip =~ s/^\s+//;
  $to_skip =~ s/\s+$//;
  my %skipwebs;
  foreach my $tmpweb ( split( /\s*,\s*/, $to_skip ) ) {
    $skipwebs{$tmpweb} = 1 if length( $tmpweb );
  }

  # the list of topics to update, by default empty
  my @topicsToUpdate = ();

  # retrieve the full list of the webs this TWiki have
  my @webList = TWiki::Func::getListOfWebs( "user" );

  # process each web
  foreach my $web (@webList) {

    # skip this web if it's in list of ones not to be indexed
    if ($skipwebs{$web}) {
      $debug && print "Skipping $web topics\n";
      $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
      print LOGFILE  "| $logtime | Skipping web | $web | |\n";
      next;
    }

    $debug && print "Checking $web ...";

    # NOTE violates store encapsulation, possible compatibility issue with future releases
    my $store = $TWiki::Plugins::SESSION->{store};
    my $changes = $store->readMetaData( $web, 'changes' );
    $changes = "" unless defined $changes;
    my $prevLastmodify = $store->readMetaData( $web, 'plucupdate' ) || "0";
    my $currLastmodify = "";
    
    # do not process the same topic twice
    my %exclude;

    # process the web changes, newest entry first
    foreach my $line ( reverse split( /\n/, $changes ) ) {
      # Parse lines from .changes:
      # <topic>	<user>		<change time>	<revision>
      # only the first and the third tab separated field are needed here
      my ($topicName, $changeTime) = ( split( /\t/, $line ) )[0, 2];

      # ignore blank or truncated lines instead of queueing an undefined topic
      next unless ( defined( $topicName ) && length( $topicName ) && defined( $changeTime ) );
      next if $exclude{ $topicName };

      if( ! $currLastmodify ) {
        # newest entry
        my $since = TWiki::Func::formatTime( $prevLastmodify );
        $currLastmodify = $changeTime;
        if( $prevLastmodify eq $changeTime ) {
          # newest entry is same as at time of previous update
          $debug && print "-> no topics new/changed since $since\n";
          last;
        }
        $debug && print "-> changed topics since $since:\n";
      }
      if( $prevLastmodify >= $changeTime ) {
        # found item of last update
        last;
      }
      $exclude{ $topicName } = "1";
      $debug && print "   * $topicName\n";
      push( @topicsToUpdate, [ $web, $topicName ] );
      if( $topicName eq "WebPreferences" && ! $exclude{ "WebHome" } ) {
        # maybe INDEXVARIABLES contents have changed, so update of WebHome topic is required
        # NOTE again, if the VARIABLES themselves have changes, a new index should be created!
        # WebHome is marked as handled so that it is queued only once, even
        # when it has a change entry of its own
        $exclude{ "WebHome" } = "1";
        push( @topicsToUpdate, [ $web, "WebHome" ] );
      }
    }

    if( $currLastmodify ) {
      # NOTE violates store encapsulation, possible compatibility issue with future releases
      # NOTE the time stamp is stored before the index itself is updated below
      $store->saveMetaData( $web, 'plucupdate', $currLastmodify );
      $debug && print "$web .plucupdate saved\n";
    } else {
      # no usable change entry was read: keep the time stamp of the previous
      # update instead of overwriting it with an empty value
      $debug && print "-> no change entries found\n";
    }
  }

  if (@topicsToUpdate > 0) {

    # remove topics changed
    removeTopics($idxpath, @topicsToUpdate);

    # add topics changed
    addTopics($idxpath, @topicsToUpdate);

  } else {

    # nothing was indexed in this run, so the log row says so
    $debug && print "No index update necessary\n";
    $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
    print LOGFILE  "| $logtime | No index update necessary | | |\n";

  }

  $debug && print "Updating index complete.\n";
  $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' ); 
  print LOGFILE  "| $logtime | Updating index finished | | |\n"; 
  close(LOGFILE);
}

# Removes from the index the documents that belong to the given topics.
#
#   $idxpath    - index location, passed to Plucene::Search::IndexSearcher
#   @topicsList - array references of the form [ $web, $topic ]
#
# One combined query collects the candidate documents; each hit is then
# checked against the exact "web.topic" names before it is deleted.
sub removeTopics
{
  my( $idxpath, @topicsList ) = @_;

  # nothing to do; an empty condition would not make a usable query
  return unless @topicsList;

  $debug && print "Now removing old topics\n";
  my $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
  print LOGFILE  "| $logtime | Removing old topics from index | | |\n";

  my $parser = Plucene::QueryParser->new({
               analyzer => Plucene::Analysis::SimpleAnalyzer->new(),
               default  => "text" # Default field for non-specified queries
  });
  my $searcher = Plucene::Search::IndexSearcher->new($idxpath);

  my @clauses;
  my %to_delete;
  foreach my $topicsDefP (@topicsList) {
    my ($web,$topic) = @$topicsDefP;

    # a topic listed twice needs only one clause
    next if $to_delete{"$web.$topic"};
    push( @clauses, "(web:$web and topic:$topic)" );
    $to_delete{"$web.$topic"} = 1;
  }
  my $condition = join( " or ", @clauses );

  # just one search to retrieve all the docs to be removed from index
  # (individual searches resulted in "too many files open" error)
  # if the web is heavily updated, time between index updates should be reduced
  # so the query can be executed in a reasonable time ...
  my $query = $parser->parse("$condition"); 
  my @docs;
  my $hc = Plucene::Search::HitCollector->new(collect => sub {
           my ($self, $doc, $score)= @_;
           push @docs, $doc;
  });

  $searcher->search_hc($query, $hc);
  foreach my $docid (@docs) {
    my $plucdoc = $searcher->doc($docid);
    my $docweb = $plucdoc->get('web')->string;
    # addTopics stores "<topic> <split topic name>" for topic documents and
    # the bare topic name for attachment documents, so the first blank
    # separated word is always the real topic name.  Rebuilding the name from
    # the split words instead does not work for names such as "TTopic" or
    # "Release_2", whose split form cannot be joined back to the original.
    my ($doctopic) = split( ' ', $plucdoc->get('topic')->string );
    next unless defined $doctopic;
    if ($to_delete{"$docweb.$doctopic"}) {
       $searcher->reader->delete($docid);
    }
  }
  undef $query; undef @docs; undef $hc;

  $searcher->reader->close;    
  undef $searcher; 

  $debug && print "Removing of old topics finished\n";
  $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
  print LOGFILE  "| $logtime | Old topics removed succesfully | | |\n";
}

# Adds the given topics, and the attachments they list, to the index.
#
#   $idxpath    - index location, passed to the Plucene writer / indexer
#   @topicsList - array references of the form [ $web, $topic ]
#
# Returns 1.  The sub is declared without a prototype because it is called
# with arguments.
sub addTopics
{
  my( $idxpath, @topicsList ) = @_;

  $debug && print "Indexing new and changed topics\n";
  my $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
  print LOGFILE  "| $logtime | Indexing new/changed topics | | |\n";

  # turns a comma separated preference value (possibly unset) into the list
  # of its trimmed, non-empty items
  my $listItems = sub {
    my $value = shift;
    my @items;
    return @items unless defined $value;
    foreach my $item ( split( /,/, $value ) ) {
      $item =~ s/^\s+//;
      $item =~ s/\s+$//;
      push( @items, $item ) if length( $item );
    }
    return @items;
  };

  my @attachmentList = ();

  my $analyser = Plucene::Analysis::SimpleAnalyzer->new();
  my $writer = Plucene::Index::Writer->new($idxpath, $analyser, 0);

  # get the list of attachments not to be indexed
  my $to_skip = TWiki::Func::getPreferencesValue( "PLUCENEINDEXSKIPATTACHMENTS" );
  my %skipattachments;
  foreach my $tmpattachment ( $listItems->( $to_skip ) ) {
    $skipattachments{$tmpattachment} = 1;
  }

  # get attachments extension list
  # the keys are kept lower case and without leading dot, so that "pdf",
  # ".pdf" and ".PDF" all select the same files (this includes the built-in
  # default below, which is written without dots)
  my $extensions = TWiki::Func::getPreferencesValue( "PLUCENEINDEXEXTENSIONS" ) || "pdf, html, txt";
  my %indexextensions;
  foreach my $tmpextension ( $listItems->( $extensions ) ) {
    my $bare = lc $tmpextension;
    $bare =~ s/^\.//;
    $indexextensions{$bare} = 1 if length( $bare );
  }
  $debug && print "Attachment extensions to be indexed: $extensions\n";

  # get variable names to be indexed
  my $vars2beIdxd = TWiki::Func::getPreferencesValue( "PLUCENEINDEXVARIABLES" );
  $vars2beIdxd = "" unless defined $vars2beIdxd;
  $debug && print "Variables to be indexed: $vars2beIdxd\n";

  # a topic that appears twice in the list must not be indexed twice
  my %seen;

  foreach my $topicsDefP (@topicsList) {
    my ($web,$topic) = @$topicsDefP;
    next if $seen{"$web.$topic"}++;

    # a change entry can name a topic that does not exist (any more); there is
    # nothing to index for it, its old documents are handled by removeTopics
    unless ( TWiki::Func::topicExists( $web, $topic ) ) {
      $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
      print LOGFILE  "| $logtime | Skipping missing topic | $web.$topic | |\n";
      next;
    }

    $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
    print LOGFILE  "| $logtime | Reindexing topic | $web.$topic | |\n";

    my ($meta, $text) = TWiki::Func::readTopic($web, $topic, undef);
    $text = "" unless defined $text;

    # "TheTopic--NNNName" will return the "The Topic Name" string
    # the real topic name is kept as first word of the field value
    my $topicname = SplitTheTopicName($topic);
    $topicname = $topic ." ". $topicname;

    # new Plucene document for the current topic
    my $doc = Plucene::Document->new;

    # web can be used as a search criteria
    $doc->add(Plucene::Document::Field->Text("web",$web));
    # topic can be used as a search criteria
    $doc->add(Plucene::Document::Field->Text("topic",$topicname));

    # processing the topic meta info
    my ( $date, $author, $rev ) = TWiki::Func::getRevisionInfo( $web, $topic );
    $date = TWiki::Func::formatTime( $date );
    $author = "" unless defined $author;
    $rev = "" unless defined $rev;
    # the author can be used as a search criteria
    $doc->add(Plucene::Document::Field->Text("author","$author"));
    # version and date are stored as meta data in the doc
    # just for showing them when displaying the hits collection
    $doc->add(Plucene::Document::Field->UnIndexed("version","$rev"));
    $doc->add(Plucene::Document::Field->UnIndexed("date","$date"));

    # get all the meta info
    my $form; my @fields; my @attachments;
    if( $meta ) {
      $form = $meta->get( 'FORM' );
      if ( $form ) {
        @fields = $meta->find( 'FIELD' );
      }
      @attachments = $meta->find( 'FILEATTACHMENT' );
    }

    # processing the form meta info
    if ( $form ) {
      # the form and all of its fields can be used as a search criteria
      my $formname = defined $form->{name} ? $form->{name} : "";
      $doc->add(Plucene::Document::Field->UnStored("form","$formname"));
      foreach my $field (@fields) {
        my $name = $field->{"name"};
        # a field without name cannot be searched for
        next unless ( defined( $name ) && length( $name ) );
        my $value = defined $field->{"value"} ? $field->{"value"} : "";
        $doc->add(Plucene::Document::Field->UnStored("$name","$value"));
      }
    }

    # processing the attachments meta info
    # append any attachments to a list, which is processed later
    foreach my $attachment (@attachments) {
      my $name = $attachment->{'name'};
      next unless ( defined( $name ) && length( $name ) );
      # the attachment extension has to be checked
      # (a name without dot has no extension and is never indexed)
      my ($extension) = $name =~ /\.([^.]+)$/;
      $extension = defined $extension ? lc $extension : "";
      # also, is the attachment is the skip list?
      if (($indexextensions{$extension})&&(!$skipattachments{"$web.$topic.$name"})) {
        # own variables, so the topic's revision info above is left untouched
        my $attauthor = defined $attachment->{'user'} ? $attachment->{'user'} : "";
        my $attrev = defined $attachment->{'version'} ? $attachment->{'version'} : "";
        my $attdate = TWiki::Func::formatTime( $attachment->{'date'} );
        my $comment = defined $attachment->{'comment'} ? $attachment->{'comment'} : "";
        push( @attachmentList, [ $web, $topic, $name, $attauthor, $comment, $attrev, $attdate ] );
      } else {
        $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
        print LOGFILE  "| $logtime | Skipping attachment | $web.$topic | $name |\n";
      }
    }

    # variables to be indexed for web (defined in WebPreferences topic) are
    # assigned to the WebHome topic
    if (($topic eq "WebHome")&&($vars2beIdxd)) {
      $debug && print " -> assigning web variables to WebHome topic ... ";
      foreach my $var2beIdxd ( $listItems->( $vars2beIdxd ) ) {
        my $var2beIdxd_value = TWiki::Func::getPreferencesValue( "$var2beIdxd", "$web" );
        if ($var2beIdxd_value) {
          $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
          print LOGFILE  "| $logtime | Reindexing variable | $web | $var2beIdxd |\n";
          $doc->add(Plucene::Document::Field->UnStored( "$var2beIdxd", "$var2beIdxd_value"));
        }
      }
      $debug && print "done\n";
    }

    # add the document to the index
    $doc->add(Plucene::Document::Field->UnStored("text",$text));
    $writer->add_document($doc);
  }

  undef $writer;

  # now, process the attachments
  $debug && print "Reindexing attachments ...\n";

  # NOTE the file path to attachment is required
  my $pubpath = TWiki::Func::getPreferencesValue( "PLUCENEATTACHMENTSPATH" );
  $pubpath = "" unless defined $pubpath;

  # open the index to process files
  my $indexer = Plucene::SearchEngine::Index->new(
        dir => "$idxpath" 
    );

  foreach my $attachDefP (@attachmentList) {
    my ( $web, $topic, $name, $author, $comment, $rev, $date ) = @$attachDefP;
    my $file = "$pubpath/$web/$topic/$name";

    # meta data can list a file that is not (or no longer) on disk
    unless ( -f $file ) {
      $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
      print LOGFILE  "| $logtime | Skipping attachment | $web.$topic | $name |\n";
      next;
    }

    $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
    print LOGFILE  "| $logtime | Reindexing attachment | $web.$topic | $name |\n";

    # process file
    my @documents = map { $_->document } 
      Plucene::SearchEngine::Index::File->examine($file);
    # for the Plucene document(s) associated to the file, assign
    # some attributes, so it can be easily recovered when searching
    foreach my $doc (@documents) {
      $doc->add(Plucene::Document::Field->Text("web",$web));
      $doc->add(Plucene::Document::Field->Text("topic",$topic));
      $doc->add(Plucene::Document::Field->Text("name",$name));
      $doc->add(Plucene::Document::Field->Text("author",$author));
      $doc->add(Plucene::Document::Field->Text("comment",$comment));
      # this attribute is useful if you want to list all the attachments indexed,
      # just search using attachment:yes as search criteria
      $doc->add(Plucene::Document::Field->Keyword("attachment","yes"));
      # version and date are stored as meta data in the doc
      # just for showing them when displaying the hits collection
      $doc->add(Plucene::Document::Field->UnIndexed("version",$rev));
      $doc->add(Plucene::Document::Field->UnIndexed("date","$date"));
    }
    # add the document(s) to the index
    $indexer->index($_) for @documents;
  }
  # close the indexer
  undef $indexer;

  # just optimize the new created index
  $debug && print "Optimizing index ...\n";
  $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
  print LOGFILE  "| $logtime | Optimizing index ... | | |\n";
  $writer = Plucene::Index::Writer->new($idxpath, $analyser, 0);
  $writer->optimize;
  undef $writer;

  $debug && print "Indexing complete.\n";
  $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
  print LOGFILE  "| $logtime | New/changed topics indexed succesfully | | |\n";

  return 1;
}



## Collapses a run of the same character at the start of the string into a
## single one, e.g. "TTTTheTopic" input will return "TheTopic".
## An empty (or undefined) string is returned as it is.
sub TripFirstchar {
  my $string = shift;
  return $string unless ( defined( $string ) && length( $string ) );
  my $firstchar = substr( $string, 0, 1 );
  # \Q..\E: the character is matched literally, even if it is one that has a
  # special meaning in a regular expression
  $string =~ s/^(?:\Q$firstchar\E)+/$firstchar/;
  return $string;
}


## Splitting a topic name into words: a blank is put in front of every
## upper case letter (A-Z) that directly follows a character which is not
## an upper case letter, e.g. "TheTopic" will return "The Topic"
sub SplitTopicName {
  my $string = shift;
  my $spaced = "";
  # true while the previously copied character was not an upper case letter
  my $after_other = 0;
  for my $char ( split //, $string ) {
    if ( $char =~ /[A-Z]/ ) {
      $spaced .= " " if $after_other;
      $after_other = 0;
    }
    else {
      $after_other = 1;
    }
    $spaced .= $char;
  }
  return $spaced;
}


## Splitting a topic name into a string of searchable words,
## e.g. "TheTopic--NNNName" will return the "The Topic Name " string
## (every word of the result is followed by one blank)
sub SplitTheTopicName {
  my $string = shift;

  # anything that is not a letter or a digit counts as a word separator
  my $spaced = join "", map { /[A-Z]|[a-z]|[0-9]/ ? $_ : " " } split //, $string;
  ## Now trim the more than one spaces in the string.
  $spaced =~ s/\s+/ /g;

  # break every part at its upper case letters ...
  my $words = join "", map { SplitTopicName($_) . " " } split / /, $spaced;

  # ... and reduce a repeated first character of every word to a single one
  return join "", map { TripFirstchar($_) . " " } split / /, $words;
}

# EOF
