#!/usr/bin/perl -w
#
# Copyright (C) 2004 Joan M Vigo, imnaoj@yahoo.es
# Copyright (C) 2005,2006 Sopan Shewale, Sopan_Shewale@persistent.co.in
#                         Joan M Vigo, imnaoj@yahoo.es
#
# For licensing info read LICENSE file in the TWiki root.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details, published at 
# http://www.gnu.org/copyleft/gpl.html

# Set library paths in @INC, at compile time
BEGIN { unshift @INC, '.'; require '../../bin/setlib.cfg' } 
                                                                                
use TWiki;

use Time::Local;

use Plucene::Document;
use Plucene::Document::Field;
use Plucene::Index::Writer;
use Plucene::Analysis::SimpleAnalyzer;
use Plucene::Document::DateSerializer;

use Plucene::SearchEngine::Index;
use Plucene::SearchEngine::Index::File;

use strict;

# Progress messages go to STDOUT unless the first argument is "-q".
my $debug = ! ( @ARGV && $ARGV[0] eq "-q" );

# The log file name embeds the server date (year, month, day).
my $time = TWiki::Func::formatTime( time(), '$year$mo$day', 'servertime');
my $logfile = "../logs/index-".$time.".log";

# Append mode creates the file when it does not exist yet, so a single
# three-argument open covers both the "new log" and "existing log" cases.
# LOGFILE stays a package filehandle because main() prints to it by name.
# A failure is reported (with the system error) but is not fatal.
open( LOGFILE, '>>', $logfile )
  or print "ERROR: Unable to open Logfile : $logfile to write: $!\n";

# The main subroutine. The & form is kept on purpose: main() is declared
# with a prototype further down, after this call has been compiled.
&main();

# The job is done here: rebuilds the Plucene index.
#
# Steps, in order:
#   1. read the PLUCENE* preferences (index path, webs/attachments to skip,
#      attachment extensions, variables to index);
#   2. create a new index and add one document per topic of every web
#      returned by getListOfWebs( "user" ) that is not in the skip list;
#   3. index the attachments collected during step 2;
#   4. optimize the index.
# Progress is printed when $debug is set and logged through _logRow().
sub main() {

  # to call TWiki::Func subroutines, we need this
  $TWiki::Plugins::SESSION = TWiki->new();

  $debug && print "Plucene index files init\n";
  $debug && print "- to suppress all normal output: plucindex -q\n";
  _logRow( "Indexing started | | |" );

  # where the index data is located
  my $idxpath = TWiki::Func::getPreferencesValue( "PLUCENEINDEXPATH" );

  # get the list of webs not to be indexed
  my $to_skip = TWiki::Func::getPreferencesValue( "PLUCENEINDEXSKIPWEBS" );
  my %skipwebs;
  foreach my $tmpweb ( _splitPreferenceList( $to_skip ) ) {
    $skipwebs{$tmpweb} = 1;
  }

  # get the list of attachments not to be indexed ("Web.Topic.file" entries)
  $to_skip = TWiki::Func::getPreferencesValue( "PLUCENEINDEXSKIPATTACHMENTS" );
  my %skipattachments;
  foreach my $tmpattachment ( _splitPreferenceList( $to_skip ) ) {
    $skipattachments{$tmpattachment} = 1;
  }

  # get attachments extension list
  my $extensions = TWiki::Func::getPreferencesValue( "PLUCENEINDEXEXTENSIONS" ) || "pdf, html, txt";
  my %indexextensions;
  foreach my $tmpextension ( _splitPreferenceList( $extensions ) ) {
    # Keys are stored lower-cased and without a leading dot, so "pdf",
    # ".pdf" and ".PDF" all select the same files. In particular the
    # built-in default list above (written without dots) now matches.
    ( my $key = lc $tmpextension ) =~ s/^\.//;
    $indexextensions{$key} = 1;
  }
  $debug && print "Attachment extensions to be indexed: $extensions\n";

  # get variable names to be indexed
  my $vars2beIdxd = TWiki::Func::getPreferencesValue( "PLUCENEINDEXVARIABLES" );
  $vars2beIdxd = '' unless defined $vars2beIdxd;
  my @varsToIndex = _splitPreferenceList( $vars2beIdxd );
  $debug && print "Variables to be indexed: $vars2beIdxd\n";

  # all the attachments will be added here
  # indexing of attachments is the last thing to be done
  my @attachmentList = ();

  # init the Plucene analyzer & indexer
  # (third argument 1: create the index; the optimize step below passes 0)
  my $analyser = Plucene::Analysis::SimpleAnalyzer->new();
  my $writer = Plucene::Index::Writer->new($idxpath, $analyser, 1);

  # retrieve the full list of the webs this TWiki has
  my @webList = TWiki::Func::getListOfWebs( "user" );

  # process each web
  foreach my $web (@webList) {

    # skip this web if it's in list of ones not to be indexed
    if ($skipwebs{$web}) {
       $debug && print "Skipping $web topics\n";
       _logRow( "Skipping web | $web | |" );
       next;
    }

    $debug && print "Indexing $web topics\n";
    _logRow( "Indexing web | $web | |" );

    # NOTE violates store encapsulation, possible compatibility issue with future releases
    $TWiki::Plugins::SESSION->{store}->saveMetaData( $web, 'plucupdate', time() );

    # get the list of topics
    my @topics = TWiki::Func::getTopicList( $web );

    foreach my $topic (@topics) {

      _logRow( "Indexing topic | $web.$topic | |" );

      my ($meta, $text) = TWiki::Func::readTopic($web, $topic, undef);

      # the raw topic name followed by its space-separated words,
      # e.g. "TheTopic--NNNName" also gets "The Topic Name"
      my $topicname = SplitTheTopicName($topic);
      $topicname = $topic ." ". $topicname;

      # new Plucene document for the current topic
      my $doc = Plucene::Document->new;

      # web can be used as a search criteria
      $doc->add(Plucene::Document::Field->Text("web",$web));
      # topic can be used as a search criteria
      $doc->add(Plucene::Document::Field->Text("topic",$topicname));

      # processing the topic meta info
      my ( $date, $author, $rev ) = TWiki::Func::getRevisionInfo( $web, $topic );
      $date = TWiki::Func::formatTime( $date );
      # the author can be used as a search criteria
      $doc->add(Plucene::Document::Field->Text("author","$author"));
      # version and date are stored as meta data in the doc
      # just for showing them when displaying the hits collection
      $doc->add(Plucene::Document::Field->UnIndexed("version","$rev"));
      $doc->add(Plucene::Document::Field->UnIndexed("date","$date"));

      # get all the meta info
      my $form; my @fields; my @attachments;
      if( $meta ) {
        $form = $meta->get( 'FORM' );
        if ( $form ) {
          @fields = $meta->find( 'FIELD' );
        }
        @attachments = $meta->find( 'FILEATTACHMENT' );
      }

      # processing the form meta info
      if ( $form ) {
        # the form and all of its fields can be used as a search criteria
        my $name = $form->{name};
        $doc->add(Plucene::Document::Field->UnStored("form","$name"));
        foreach my $field (@fields) {
          my $fieldname = $field->{"name"};
          my $value = $field->{"value"};
          $doc->add(Plucene::Document::Field->UnStored("$fieldname","$value"));
        }
      }

      # processing the attachments meta info:
      # append any attachments to a list, which is processed later
      foreach my $attachment (@attachments) {
        my $name = $attachment->{'name'};
        # the extension is whatever follows the last dot, lower-cased;
        # a name without a dot has no extension
        my $extension = '';
        if ( defined $name && $name =~ /\.([^.]+)\z/ ) {
          $extension = lc $1;
        }
        # also, is the attachment in the skip list?
        if ( $indexextensions{$extension} && !$skipattachments{"$web.$topic.$name"} ) {
          # author, revision and date of the attachment itself, kept
          # apart from the topic's own values read above
          push( @attachmentList, [
            $web, $topic, $name,
            $attachment->{'user'},
            $attachment->{'comment'},
            $attachment->{'version'},
            TWiki::Func::formatTime( $attachment->{'date'} ),
          ] );
        } else {
          _logRow( "Skipping attachment | $web.$topic | $name |" );
        }
      }

      # variables to be indexed for web (defined in WebPreferences topic) are
      # assigned to the WebHome topic
      if (($topic eq "WebHome")&&($vars2beIdxd)) {
        $debug && print " -> assigning web variables to WebHome topic ... ";
        foreach my $var2beIdxd (@varsToIndex) {
          my $var2beIdxd_value = TWiki::Func::getPreferencesValue( "$var2beIdxd", "$web" );
          if ($var2beIdxd_value) {
            _logRow( "Indexing variable | $web | $var2beIdxd |" );
            $doc->add(Plucene::Document::Field->UnStored( "$var2beIdxd", "$var2beIdxd_value"));
          }
        }
        $debug && print "done\n";
      }

      # add the document to the index
      $doc->add(Plucene::Document::Field->UnStored("text",$text));
      $writer->add_document($doc);
    }
  }

  # close the index data
  undef $writer;

  # now, process the attachments
  $debug && print "Indexing attachments ...\n";

  # NOTE the file path to attachment is required
  my $pubpath = TWiki::Func::getPreferencesValue( "PLUCENEATTACHMENTSPATH" );

  # open the index to process files
  my $indexer = Plucene::SearchEngine::Index->new(
        dir => "$idxpath"
    );

  foreach my $attachDefP (@attachmentList) {
    my ( $web, $topic, $name, $author, $comment, $rev, $date ) = @$attachDefP;
    # NOTE(review): attachments may carry no comment - confirm; an empty
    # string keeps undef out of the "comment" field
    $comment = '' unless defined $comment;

    _logRow( "Indexing attachment | $web.$topic | $name |" );

    # process file
    my @documents = map { $_->document }
      Plucene::SearchEngine::Index::File->examine("$pubpath/$web/$topic/$name");
    # for the Plucene document(s) associated to the file, assign
    # some attributes, so it can be easily recovered when searching
    foreach my $doc (@documents) {
      $doc->add(Plucene::Document::Field->Text("web",$web));
      $doc->add(Plucene::Document::Field->Text("topic",$topic));
      $doc->add(Plucene::Document::Field->Text("name",$name));
      $doc->add(Plucene::Document::Field->Text("author",$author));
      $doc->add(Plucene::Document::Field->Text("comment",$comment));
      # this attribute is useful if you want to list all the attachments indexed,
      # just search using attachment:yes as search criteria
      $doc->add(Plucene::Document::Field->Keyword("attachment","yes"));
      # version and date are stored as meta data in the doc
      # just for showing them when displaying the hits collection
      $doc->add(Plucene::Document::Field->UnIndexed("version",$rev));
      $doc->add(Plucene::Document::Field->UnIndexed("date","$date"));
    }
    # add the document(s) to the index
    $indexer->index($_) for @documents;
  }
  # close the indexer
  undef $indexer;

  # just optimize the new created index
  $debug && print "Optimizing index ...\n";
  _logRow( "Optimizing index ... | | |" );
  $writer = Plucene::Index::Writer->new($idxpath, $analyser, 0);
  $writer->optimize;
  undef $writer;

  $debug && print "Indexing complete.\n";
  _logRow( "Indexing finished | | |" );
  close(LOGFILE) if defined fileno(LOGFILE);
}

# Appends one row to the log, prefixed with the current server time.
# $cells is the remainder of the row, already laid out as
# "action | subject | detail |". Nothing is written when the LOGFILE
# handle is not open, so a log that could not be opened does not
# produce a warning for every row.
sub _logRow {
  my ($cells) = @_;
  return unless defined fileno(LOGFILE);
  my $logtime = TWiki::Func::formatTime( time(), '$rcs', 'servertime' );
  print LOGFILE "| $logtime | $cells\n";
  return;
}

# Splits a preference value into its items; items are separated by a
# comma followed by whitespace. Returns an empty list for an undefined
# or empty value instead of handing undef to split.
sub _splitPreferenceList {
  my ($value) = @_;
  return () unless defined $value && length $value;
  return split( /,\s+/, $value );
}


## Collapses a run of the same character at the start of the string into a
## single occurrence, e.g. "TTTTheTopic" returns "TheTopic".
## The first character is matched literally, so characters that are special
## in a regex (".", "+", ...) are handled like any other.
## An empty or undefined argument is returned unchanged.
sub TripFirstchar {
  my $string = shift;
  return $string unless defined $string && length $string;
  my $firstchar = substr( $string, 0, 1 );
  # \Q...\E keeps the character from being read as a regex metacharacter
  $string =~ s/^\Q$firstchar\E+/$firstchar/;
  return $string;
}


## Splits a topic name into words, e.g. "TheTopic" returns "The Topic".
## A space is inserted in front of every capital letter (A-Z) that directly
## follows a character which is not a capital letter; runs of capitals are
## left together.
sub SplitTopicName {
  my ($name) = @_;
  my $spaced = "";
  # true when the previously seen character was not a capital letter
  my $afterNonCapital = 0;
  for my $char ( split //, $name ) {
    if ( $char =~ /[A-Z]/ ) {
      $spaced .= " " if $afterNonCapital;
      $afterNonCapital = 0;
    }
    else {
      $afterNonCapital = 1;
    }
    $spaced .= $char;
  }
  return $spaced;
}


## Turns a topic name into a string of space-separated words, e.g.
## "TheTopic--NNNName" returns "The Topic Name " (every word, including the
## last one, is followed by a single space).
## Steps: characters other than A-Z, a-z and 0-9 act as separators; each
## remaining fragment is split with SplitTopicName; each resulting word is
## passed through TripFirstchar.
## Returns "" for an undefined or empty name, or one without any
## letter or digit.
sub SplitTheTopicName {
  my $string = shift;
  return "" unless defined $string;

  # replace every run of separator characters by a single space
  ( my $cleaned = $string ) =~ tr/A-Za-z0-9/ /cs;

  # split ' ' ignores leading whitespace, so a name that starts with a
  # separator does not yield an empty fragment for the helpers below
  my @words;
  foreach my $fragment ( split ' ', $cleaned ) {
    push @words, split( ' ', SplitTopicName($fragment) );
  }

  my $finalstr = "";
  foreach my $word (@words) {
    $finalstr .= TripFirstchar($word) . " ";
  }
  return $finalstr;
}

# EOF
