#!/usr/bin/env perl
#
# Scan a directory of TWiki topic files (*.txt), extract the wiki links each
# topic contains (via My_TWiki_Renderer), and print a TWiki-formatted report
# listing:
#   * links whose target topic file does not exist,
#   * topics that no scanned topic links to,
#   * links whose target topic file exists.
#
# Usage:
#   <script> [-directory DIR] [-r | -recurse]
#
#   -directory DIR   directory holding the topic files to scan
#   -r, -recurse     also scan subdirectories (CVS directories are skipped)
#
# NOTE(review): the copy of this file that was reviewed had all newlines
# collapsed and two spans eaten by what looks like an HTML tag stripper
# (the <IN> read in scan_wiki_file and the opening of the report heredoc in
# print_report).  Both are reconstructed below and marked where they occur.

use strict;
use warnings;

# The renderer module is looked up in the current directory as a last resort.
BEGIN { push @INC, "."; }
use My_TWiki_Renderer;

# TWiki bullet syntax: a multiple of three spaces, an asterisk, one space.
# NOTE(review): the reviewed copy had a single leading space, presumably an
# artifact of the same whitespace collapse -- confirm against a known-good copy.
use constant BULLET => "   * ";

# ---------------------------------------------------------------------------
# Command line
# ---------------------------------------------------------------------------

my $webDataDirectory = "/nfs/pdx/disks/nhm.arch.futwiki.1/twiki/data";
my $recursive_option = 0;

# Index loop (rather than foreach) because -directory consumes the next word.
for ( my $i = 0 ; $i <= $#ARGV ; $i++ ) {
    my $arg = $ARGV[$i];
    if ( $arg eq "-directory" ) {
        $i++;
        die "-directory requires a directory argument\n"
          unless defined $ARGV[$i];
        $webDataDirectory = $ARGV[$i];
    }
    elsif ( $arg eq "-r" || $arg eq "-recurse" ) {
        $recursive_option = 1;
    }
    else {
        die "unrecognized argument $arg";
    }
}

# ---------------------------------------------------------------------------
# Shared state filled in by the scan / check passes and read by print_report
# ---------------------------------------------------------------------------

my %files_seen;    # topic name (file name minus .txt) => times scanned
my %seen;          # link target => number of references found
my %seenIn;        # link target => { referencing file name => count }

my @files_that_are_not_referenced;
my $FILES_THAT_ARE_NOT_REFERENCED = '';
my $EXISTS                        = '';
my $NOTEXISTS                     = '';

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

scan_files_in_directory($webDataDirectory);
check_for_wiki_file_existence($webDataDirectory);
check_for_files_that_are_not_referenced();
print_report();
exit(0);

# ---------------------------------------------------------------------------
# Subroutines
# ---------------------------------------------------------------------------

# is_subdirectory($dir, $file)
# Returns 1 if $dir/$file is a directory other than "." or "..", else 0.
sub is_subdirectory {
    my ( $dir, $file ) = @_;
    return 0 if $file eq "." || $file eq "..";

    # TBD symlinks
    return -d "$dir/$file" ? 1 : 0;
}

# ignore_subdirectory($path)
# Returns 1 for a CVS directory (bare name or a path ending in /CVS), else 0.
sub ignore_subdirectory {
    my $file = shift;
    return 1 if $file eq "CVS";
    return 1 if $file =~ m{/CVS$};
    return 0;
}

# scan_files_in_directory($dir)
# Hands every non-directory entry of $dir (and every ignored directory entry,
# which scan_wiki_file then rejects by name) to scan_wiki_file.  When
# $recursive_option is set, non-ignored subdirectories are scanned afterwards.
# A directory that cannot be opened is reported with a warning and skipped.
sub scan_files_in_directory {
    my $webDataLocation = shift;

    my $dh;
    if ( !opendir( $dh, $webDataLocation ) ) {
        warn "Can't open directory $webDataLocation: $!\n";
        return;
    }

    my @subdirs;

    # Explicit defined() so that an entry named "0" does not end the loop.
    while ( defined( my $file = readdir $dh ) ) {
        if (   is_subdirectory( $webDataLocation, $file )
            && !ignore_subdirectory("$webDataLocation/$file") )
        {
            # TBD: maybe just use Perl/CPAN's Find package
            push @subdirs, "$webDataLocation/$file" if $recursive_option;
            next;
        }
        scan_wiki_file( $webDataLocation, $file );
    }

    # Close before recursing so nested scans do not pile up open handles.
    closedir $dh;

    # Handle subdirs
    # TBD: maybe better to just use Perl/CPAN Find
    foreach my $subdir (@subdirs) {
        scan_files_in_directory($subdir);
    }
    return;
}

# scan_wiki_file($dir, $file)
# Records the topic in %files_seen and every link the renderer extracts from
# it in %seen / %seenIn.  Entries whose name does not end in .txt are ignored.
# Dies if $dir is not a directory, or if $dir/$file is unreadable or cannot
# be opened.
sub scan_wiki_file {
    my ( $webDataLocation, $file ) = @_;
    my $path = "$webDataLocation/$file";

    die "Can't find directory $webDataLocation" if !-d $webDataLocation;
    die "Can't read $path"                      if !-r $path;

    return unless $file =~ /\.txt$/;

    # TBD: set ignore list on command line

    ( my $simplefile = $file ) =~ s/\.txt$//;
    $files_seen{$simplefile}++;

    # Slurp the whole topic.  (Reconstructed: the reviewed copy had lost the
    # filehandle read from its line-by-line loop.)
    open( my $in, '<', $path ) or die "Can't open $path: $!";
    my $slurp = do { local $/; <$in> };
    close $in;
    $slurp = '' unless defined $slurp;

    # TBD: I need to know where the local wiki is.
    # Currently hardwiring twiki's path;
    # this will break if I have a directory of twiki-like files
    # that does not use twiki's filestructure.
    # TBD: parameterize somehow?
    ( my $webInWiki = $webDataLocation ) =~ s{.*/data/}{};

    my $renderer = My_TWiki_Renderer->new;
    $renderer->render_text_extracting_links( $slurp, $webInWiki, $simplefile );

    foreach my $link ( $renderer->{wiki_link_list}
        ->sorted_list_of_web_paths_eliding_current_web_path() )
    {
        $seen{$link}++;
        $seenIn{$link}{$file}++;
    }
    return;
}

# wiki_file_exists($webDir, $topicName)
# True if the topic's .txt file exists.  A name without a dot is looked up in
# $webDir; a dotted name (Web.Subweb.Topic) is looked up relative to the
# .../data/ prefix of $webDir, with dots turned into path separators.
sub wiki_file_exists {
    my ( $webDir, $topicName ) = @_;

    # simple case - topicName has no web component, i.e. is in local web
    return -e "$webDir/$topicName.txt" if $topicName !~ /\./;

    # fancier case - topicName has web component.
    # infer root of twiki data tree
    # since all TWiki paths are strictly local or absolute
    ( my $twikiDataDir = $webDir ) =~ s{(.*/data/).*}{$1};

    # convert topicName to path relative to twikiDataDir
    ( my $relativePath = $topicName ) =~ s{\.}{/}g;

    return -e "$twikiDataDir/$relativePath.txt";
}

# _format_link_lines(@links)
# Private helper: turns link names into report bullet lines, most-referenced
# first; ties are broken by name so the report is reproducible.
sub _format_link_lines {
    my @links = @_;

    my @ordered =
      sort { $seen{$b} <=> $seen{$a} or $a cmp $b } @links;

    return join "", map {
            BULLET
          . "$seen{$_} : [[$_]] is referenced by : "
          . join( " ", sort keys %{ $seenIn{$_} } ) . "\n"
    } @ordered;
}

# check_for_wiki_file_existence($webDataLocation)
# Splits every link in %seen by whether its target topic exists and stores the
# formatted lists in $EXISTS and $NOTEXISTS.
sub check_for_wiki_file_existence {
    my $webDataLocation = shift;

    my ( @exists, @notexists );
    foreach my $word ( keys %seen ) {
        if ( wiki_file_exists( $webDataLocation, $word ) ) {
            push @exists, $word;
        }
        else {
            push @notexists, $word;
        }
    }

    $EXISTS    = _format_link_lines(@exists);
    $NOTEXISTS = _format_link_lines(@notexists);
    return;
}

# check_for_files_that_are_not_referenced()
# Collects the scanned topics that never appear as a link target, both as a
# list (@files_that_are_not_referenced) and as report text
# ($FILES_THAT_ARE_NOT_REFERENCED).
sub check_for_files_that_are_not_referenced {
    foreach my $fn ( sort keys %files_seen ) {

        # Only unreferenced topics belong in the report; the earlier version
        # appended every scanned topic regardless of this test.
        next if $seen{$fn};

        push @files_that_are_not_referenced, $fn;
        $FILES_THAT_ARE_NOT_REFERENCED .= BULLET . "[[$fn]]\n";
    }
    return;
}

# print_report()
# Prints the TWiki-formatted report to STDOUT.
sub print_report {
    my $DATE = `date`;
    chomp $DATE;

    # NOTE(review): the heading, the "Generated on" line and the words
    # "*Known limitations:* this script does not fully handle <nop>" are a
    # reconstruction -- the reviewed copy had lost everything between the
    # heredoc opener and "[[ double bracket ]]".  Replace with the original
    # wording if a clean copy is available.
    print <<"REPORT";
---+ Wiki link report

Generated on $DATE

*Known limitations:* this script does not fully handle <nop>[[ double bracket ]] and Web.Subweb.Topic
and other !NonSimplyWikiWord links.
It gets confused by cross web links.
(Main.AndyGlew is fixing some of these problems,
but even with these problems this script is useful.)

*NB:* Do not add pages that are names of people.
Those pages will be created when he or she registers in TWiki (in TWiki.TWikiRegistration).
This also does not properly detect automatic plurals or some TWiki or Main web pages.

Use this page to discover:
   * which important pages are missing (top of the list)
   * spelling errors in page links (bottom of the list)
   * i.e. the list is sorted by frequency of reference

---++ Pages that are referenced but which which do not exist

$NOTEXISTS

---++ Pages that are not referenced by other pages.

$FILES_THAT_ARE_NOT_REFERENCED

---++ Pages that are referenced and which exist.

$EXISTS
REPORT
    return;
}