#!/usr/bin/perl -w
#
# Copyright (C) 2004 Joan M Vigo, imnaoj@yahoo.es
#
# For licensing info read license.txt file in the TWiki root.
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details, published at 
# http://www.gnu.org/copyleft/gpl.html

# Set library paths in @INC, at compile time
#BEGIN { unshift @INC, '../bin-new'; require 'setlib.cfg' }
BEGIN { unshift @INC, '/home/httpd/twiki/bin'; require 'setlib.cfg' }
                                                                                
use TWiki;
use Time::Local;
use Plucene::Document;
use Plucene::Document::Field;
use Plucene::Index::Writer;
use Plucene::Analysis::SimpleAnalyzer;
use Plucene::Document::DateSerializer;
use Plucene::SearchEngine::Index;
use Plucene::SearchEngine::Index::File;

use strict;

# Verbose unless the first command-line argument is "-q".
my $debug = ! ( @ARGV && $ARGV[0] eq "-q" );
# Index directory; main() assigns it from the PLUCENEINDEXPATH preference.
my $idxpath;

# Unbuffered STDOUT so progress messages show up as they are printed.
$|=1;

### Webs and attachments that must not be indexed.
# The exclusion list is read from dontindex.cfg in the current directory.
# Recognised lines:
#   web:<WebName>
#   attachment:<Web>:<Topic>:<file name>
# Every other line is ignored. A missing or unreadable file is reported and
# indexing continues with empty exclusion lists.
my $config = "dontindex.cfg";
my @skipwebs = ();
my @skiptopics = ();   # declared, but no "topic:" lines are parsed here
my @skipattachments = ();

if ( open( my $cfg, '<', $config ) ) {
    while ( my $line = <$cfg> ) {
        # Drop the line terminator (LF or CRLF) so entries compare cleanly.
        $line =~ s/[\r\n]+\z//;

        # Only the text between the first and second colon names the web.
        if ( $line =~ /^web:([^:]*)/ ) {
            my $web = $1;
            push @skipwebs, $web if length $web;
        }
        # Attachment entries contain colons themselves, so keep the whole rest.
        if ( $line =~ /^attachment:(.*)/ ) {
            my $attachment = $1;
            push @skipattachments, $attachment if length $attachment;
        }
    }
    close($cfg);
}
else {
    print "Error: Unable to open the config file : $config: $!\n";
}

### Log file setup
# One log per day, named <YYYY><MM><D>.log in the current directory. The day of
# the month is not zero-padded (1 Nov 2004 gives "2004111.log").
my ( $sec, $min, $hour, $mday, $mon, $year ) = localtime( time() );
$year = sprintf( "%.4u", $year + 1900 );
my $yearmonth = sprintf( "%.4u%.2u", $year, $mon+1 );
my $logfile = $yearmonth.$mday.".log";

# Empty any log left by an earlier run on the same day, then reopen it in
# append mode. LOGFILE stays a bareword handle because main() and the
# end-of-run code print to it by that name.
my $log_ready = 0;
if ( open( LOGFILE, '>', $logfile ) ) {
    close(LOGFILE);
    $log_ready = open( LOGFILE, '>>', $logfile ) ? 1 : 0;
}
# Best effort: report the failure once and carry on without a log.
print "ERROR: Unable to open Logfile : $logfile to write: $!\n" unless $log_ready;

print LOGFILE "The indexing started at ".get_timestamp()."\n";

# main is declared further down with an empty prototype; the & form keeps
# perl from warning that it was called too early to check that prototype.
&main();

# Rebuild the Plucene index for every web found in the TWiki data directory.
#
# Steps:
#   1. initialise TWiki, read the data directory and the PLUCENEINDEXPATH
#      preference;
#   2. list the webs: sub-directories of the data directory whose names do
#      not start with "." or "_";
#   3. add one document per topic of every web not listed in @skipwebs,
#      collecting attachments whose name suffix appears in the extension list;
#   4. index the collected attachments (minus @skipattachments) through
#      Plucene::SearchEngine::Index;
#   5. reopen the index and optimize it.
#
# Reads the file-level $debug, @skipwebs and @skipattachments, sets the
# file-level $idxpath, and writes progress lines to LOGFILE.
# Dies when PLUCENEINDEXPATH is not set or the data directory cannot be read,
# before the index is touched.
sub main() {

  &TWiki::basicInitialize();
  &TWiki::Prefs::initializePrefs( "TWiki" );

  $debug && print "Plucene index files init\n";
  $debug && print "- to suppress all normal output: plucindex -q\n";

  my @attachmentList = ();

  my $dtpath = &TWiki::getDataDir();
  $idxpath = &TWiki::Prefs::getPreferencesValue( "PLUCENEINDEXPATH" );
  unless ( defined $idxpath && length $idxpath ) {
    die "PLUCENEINDEXPATH preference is not set\n";
  }

  # retrieve web list
  # Fail here rather than go on with an empty web list, which would leave a
  # freshly created index with nothing in it.
  opendir( my $datadir, $dtpath )
      or die "Unable to read the TWiki data directory '$dtpath': $!\n";
  my @entries = readdir($datadir);
  closedir($datadir);
  # readdir returns each entry once, so no de-duplication is needed.
  my @webList = sort grep { $_ !~ /^[._]/ && -d "$dtpath/$_" } @entries;

  my $analyser = Plucene::Analysis::SimpleAnalyzer->new();
  # NOTE(review): the third argument (1 here, 0 at the end of this sub) looks
  # like the "create a new index" flag - confirm against Plucene::Index::Writer.
  my $writer = Plucene::Index::Writer->new($idxpath, $analyser, 1);

  # get attachments extension list
  #my $extensions = &TWiki::Prefs::getPreferencesValue( "PLUCENEINDEXEXTENSIONS" ) ||
  my $extensions = ".pdf,.xls,.ppt,.html,.txt,.xls,";
  $debug && print "Attachment extensions to be indexed: $extensions\n";

  # process each web
  WEB: foreach my $web (@webList) {

    ### Skip a few webs...
    if ( grep { $web eq $_ } @skipwebs ) {
      $debug && print "Skiping the web : $web\n";
      print LOGFILE "Indexing skipped web : $web\n";
      next WEB;
    }

    $debug && print "Indexing the web: $web\n";
    print LOGFILE "Indexing web : $web\n";

    # Record the time of this indexing run inside the web's data directory.
    &TWiki::Store::saveFile( "$dtpath/$web/.plucupdate", time() );

    my @topics = &TWiki::Store::getTopicNames( $web );

    foreach my $topic (@topics) {
      $debug && print "Indexing the topic: $topic\n";
      print LOGFILE "Indexing web : $web topic: $topic\n";
      my ($meta, $text) = &TWiki::Store::readTopic($web, $topic, 1);

      # Index the raw name together with its space-separated form.
      my $topicname = $topic ." ". SplitTheTopicName($topic);

      my $doc = Plucene::Document->new;
      $doc->add(Plucene::Document::Field->Text("web",$web));
      $doc->add(Plucene::Document::Field->Text("topic",$topicname));

      my %topicinfo = (); my %form; my @fields; my @attachments;
      if( $meta ) {
        %topicinfo = $meta->findOne( "TOPICINFO" );
        %form = $meta->findOne( "FORM" );
        if ( %form ) {
          @fields = $meta->find( "FIELD" );
        }
        @attachments = $meta->find( "FILEATTACHMENT" );
      }

      if( %topicinfo ) {
        # Stored as meta data in topic for faster access
        my $date = TWiki::formatTime( $topicinfo{"date"} ); # FIXME deal with changeToIsoDate
        my $author = $topicinfo{"author"};
        my $version = $topicinfo{"version"};
        $version = "" unless defined $version;
        # Keep the part after "1."; when the version has another shape use it
        # unchanged instead of whatever an earlier regex match left in $1.
        my $rev = ( $version =~ /1\.(.*)/ ) ? $1 : $version;
        $doc->add(Plucene::Document::Field->Text("author","$author"));
        $doc->add(Plucene::Document::Field->UnIndexed("version","$rev"));
        $doc->add(Plucene::Document::Field->UnIndexed("date","$date"));
      }

      if ( %form ) {
        $doc->add(Plucene::Document::Field->UnStored("form","$form{'name'}"));
        foreach my $field (@fields) {
          my $name = $field->{"name"};
          my $value = $field->{"value"};
          $doc->add(Plucene::Document::Field->UnStored("$name","$value"));
        }
      }

      # append any attachments to a list, which is processed later
      foreach my $attachment (@attachments) {
        my $name = $attachment->{'name'};
        next unless defined $name && length $name;
        # The last four characters of the name are looked up as a literal
        # substring of the extension list. A regex match on them would treat
        # characters from the file name as metacharacters and could die on
        # names ending in something like "(bar".
        my $suffix = length($name) > 4 ? substr( $name, -4 ) : $name;
        next unless index( $extensions, $suffix ) >= 0;

        my $author = $attachment->{'user'};
        my $rev = $attachment->{'version'};
        my $date = TWiki::formatTime( $attachment->{'date'} ); # FIXME deal with changeToIsoDate
        push( @attachmentList, [ $web, $topic, $name, $author, $rev, $date ] );
      }

      $doc->add(Plucene::Document::Field->UnStored("text",$text));
      $writer->add_document($doc);
    }
  }

  # Drop the topic writer before the attachment indexer opens the same index.
  undef $writer;

  # process all the attachments
  $debug && print "Indexing attachments ...\n";

  my $pubpath = &TWiki::getPubDir();

  my $indexer = Plucene::SearchEngine::Index->new(
        dir => "$idxpath"
    );

  ATTACHMENT: foreach my $attachDefP (@attachmentList) {
    my( $web, $topic, $name, $author, $rev, $date ) = @$attachDefP;

    # Skip entries matching an "attachment:" line (Web:Topic:name) of the config.
    my $attachment_string = $web.":".$topic.":".$name;
    if ( grep { $attachment_string eq $_ } @skipattachments ) {
      print LOGFILE "Skiping the attachment $attachment_string\n";
      next ATTACHMENT;
    }

    $debug && print "Indexing attachment :$web:$topic:$name\n";
    print LOGFILE "Indexing attachment : $web:$topic:$name\n";

    my @documents = map { $_->document }
      Plucene::SearchEngine::Index::File->examine("$pubpath/$web/$topic/$name");
    foreach my $doc (@documents) {
      $doc->add(Plucene::Document::Field->Text("web",$web));
      $doc->add(Plucene::Document::Field->Text("topic",$topic));
      $doc->add(Plucene::Document::Field->Keyword("attachment","yes"));
      $doc->add(Plucene::Document::Field->Text("name",$name));
      $doc->add(Plucene::Document::Field->Text("author",$author));
      $doc->add(Plucene::Document::Field->UnIndexed("version",$rev));
      $doc->add(Plucene::Document::Field->UnIndexed("date","$date"));
    }
    $indexer->index($_) for @documents;
  }
  undef $indexer;

  # Reopen the finished index and optimize it.
  $writer = Plucene::Index::Writer->new($idxpath, $analyser, 0);
  $writer->optimize;
  undef $writer;

  $debug && print "Indexing complete.\n";
}

# Runs after main() returns: stamp the end of the run and release the log handle.
print LOGFILE "Indexing finished at " . get_timestamp() . "\n";
close LOGFILE;

## Collapse a run of the same leading character into a single one,
## e.g. "TTTTheTopic" becomes "TheTopic". The comparison is case-sensitive.
## An undefined or empty argument is returned unchanged.
sub TripFirstchar {
    my $string = shift;
    return $string unless defined $string && length $string;

    my $firstchar = substr( $string, 0, 1 );
    # \Q...\E makes the character literal, so ".", "(" and friends are
    # compared as themselves and cannot break or widen the pattern.
    $string =~ s/^\Q$firstchar\E+/$firstchar/;
    return $string;
}


## Insert a blank in front of every capital letter (A-Z) that directly follows
## a character which is not a capital, e.g. "TheTopic" gives "The Topic".
## Runs of capitals such as "ABC" are left untouched.
sub SplitTopicName {
    my ($name) = @_;

    my $spaced = "";
    my $after_non_capital = 0;    # true once the previous character was not A-Z

    for my $char ( split //, $name ) {
        if ( $char =~ /[A-Z]/ ) {
            $spaced .= " " if $after_non_capital;
            $after_non_capital = 0;
        }
        else {
            $after_non_capital = 1;
        }
        $spaced .= $char;
    }

    return $spaced;
}

## Turn a raw topic name into space-separated words, e.g. "TheTopic--NNNName"
## gives "The Topic Name ". Every word in the result, including the last one,
## is followed by a single blank.
sub SplitTheTopicName {
    my ($topic) = @_;

    # Every character that is not an ASCII letter or digit becomes a blank.
    my $blanked = join '',
        map { /[A-Z]|[a-z]|[0-9]/ ? $_ : " " } split //, $topic;

    # Squeeze runs of whitespace down to one blank.
    $blanked =~ s/\s+/ /g;

    # First pass: break each chunk at its capital-letter word boundaries.
    my $spaced = join '',
        map { SplitTopicName($_) . " " } split / /, $blanked;

    # Second pass: collapse a repeated leading character of every word.
    my $result = join '',
        map { TripFirstchar($_) . " " } split / /, $spaced;

    return $result;
}

### Current local time formatted as "DD-MM-YYYY HH:MM:SS"; used to stamp
### the entries written to the log file.
sub get_timestamp {
    my @now = localtime(time);

    # localtime order: sec, min, hour, mday, mon (0-based), year (since 1900).
    return sprintf("%02s-%02s-%04s %02s:%02s:%02s",
                   $now[3], $now[4] + 1, $now[5] + 1900, @now[ 2, 1, 0 ]);
}


# EOF
