#!/usr/bin/env perl
#
# Scan the topic files (*.txt) of a TWiki data directory for simple WikiWord
# links and print a TWiki-formatted report to STDOUT listing:
#   * words that look like links but have no matching topic file,
#   * topics whose name was never seen as a link in any scanned file,
#   * words that look like links and do have a matching topic file.
#
# Usage:
#   script [-directory DIR] [-r | -recurse]
#
#   -directory DIR   directory to scan (defaults to $webDataDirectory below)
#   -r, -recurse     also scan subdirectories (directories named CVS are
#                    never descended into)

use strict;
use warnings;

my $webDataDirectory = "/nfs/pdx/disks/nhm.arch.futwiki.1/twiki/data";
my $recursive_option = 0;

# Prefix for every generated bullet line.  TWiki only recognises a bullet
# when the asterisk is preceded by a multiple of three spaces.
# NOTE(review): the copy of this script that was reviewed had its whitespace
# collapsed and showed " * "; three spaces is assumed to be the original.
my $BULLET = "   * ";

# A word counts as a wiki link when it has exactly two "humps":
# capitals, non-capitals, capitals, non-capitals (e.g. WikiWord).
my $WIKI_WORD_RE = qr/^[A-Z]+[^A-Z]+[A-Z]+[^A-Z]+$/;

# --- command line ---------------------------------------------------------

my @args = @ARGV;
while (@args) {
    my $arg = shift @args;
    if ( $arg eq "-directory" ) {
        # Without this check a trailing -directory silently set the
        # directory to undef.
        die "-directory requires a directory argument\n" unless @args;
        $webDataDirectory = shift @args;
    }
    elsif ( $arg eq "-r" || $arg eq "-recurse" ) {
        $recursive_option = 1;
    }
    else {
        die "unrecognized argument $arg";
    }
}

# --- shared state (filled in by the subs below) ---------------------------

my %files_seen;    # topic name (file name without .txt) => times encountered
my %seen;          # wiki word => total number of occurrences
my %seenIn;        # wiki word => { topic name containing it => occurrences }

my @files_that_are_not_referenced;
my $FILES_THAT_ARE_NOT_REFERENCED = '';
my $EXISTS                        = '';
my $NOTEXISTS                     = '';

# --- main -----------------------------------------------------------------

# An unreadable top-level directory used to produce an empty report and a
# zero exit status; fail loudly instead.
-d $webDataDirectory
    or die "web data directory '$webDataDirectory' is not a directory\n";

scan_files_in_directory($webDataDirectory);
check_for_wiki_file_existence($webDataDirectory);
check_for_files_that_are_not_referenced();
print_report();
exit(0);

# --- subs -----------------------------------------------------------------

# Return 1 when "$dir/$file" is a directory other than "." or "..",
# otherwise 0.
# TBD symlinks: -d follows them, so a symlinked directory counts as a
# subdirectory here.
sub is_subdirectory {
    my ( $dir, $file ) = @_;
    return 0 if $file eq "." || $file eq "..";
    return -d "$dir/$file" ? 1 : 0;
}

# Return 1 for directories that must not be descended into: anything that
# is named CVS or whose path ends in /CVS.  Otherwise return 0.
sub ignore_subdirectory {
    my $file = shift;
    return 1 if $file eq "CVS";
    return 1 if $file =~ m|.*/CVS$|;
    return 0;
}

# Scan every entry of $webDataLocation with scan_wiki_file().  When
# $recursive_option is set, non-ignored subdirectories are scanned too,
# after the files of the current directory.
# TBD: maybe better to just use Perl/CPAN Find.
sub scan_files_in_directory {
    my $webDataLocation = shift;

    my $dh;
    if ( !opendir( $dh, $webDataLocation ) ) {
        warn "cannot read directory $webDataLocation: $!\n";
        return;
    }
    # Read the whole listing up front so the handle is closed before
    # recursing.
    my @entries = readdir($dh);
    closedir($dh);

    my @subdirs;
    for my $file (@entries) {
        if ( is_subdirectory( $webDataLocation, $file )
            && !ignore_subdirectory("$webDataLocation/$file") )
        {
            push @subdirs, "$webDataLocation/$file" if $recursive_option;
            next;
        }
        scan_wiki_file( $webDataLocation, $file );
    }

    for my $subdir (@subdirs) {
        scan_files_in_directory($subdir);
    }
    return;
}

# Record the topic "$webDataLocation/$file" in %files_seen and count every
# wiki word it contains in %seen and %seenIn.  Files whose name does not
# end in .txt are ignored.
# TBD: set ignore list on command line.
sub scan_wiki_file {
    my ( $webDataLocation, $file ) = @_;
    return unless $file =~ /\.txt$/;

    ( my $simplefile = $file ) =~ s/\.txt$//;
    $files_seen{$simplefile}++;

    my $path = "$webDataLocation/$file";
    my $in;
    if ( !open( $in, '<', $path ) ) {
        warn "cannot open $path: $!\n";
        return;
    }
    my $slurp = do { local $/; <$in> };
    close $in;
    $slurp = '' unless defined $slurp;

    # Flatten to one line first so the patterns below can match across
    # what used to be line breaks.
    $slurp =~ tr/\t\n\r/ /;
    $slurp =~ s/\bMain\.[A-Z][a-zA-Z0-9]+\b/ /g;     # Try to get rid of UserNames
    $slurp =~ s/\bTWiki\.[A-Z][a-zA-Z0-9]+\b/ /g;    # Try to get rid of TWiki stuff
    $slurp =~ s/%[A-Z][^%]+%/ /g;                    # Try to get rid of %VARIABLES%
    $slurp =~ s/[^a-zA-Z0-9 ]/ /g;

    # split ' ' skips leading whitespace and splits on whitespace runs.
    for my $word ( split ' ', $slurp ) {
        next unless $word =~ $WIKI_WORD_RE;
        $seen{$word}++;
        $seenIn{$word}{$simplefile}++;
    }
    return;
}

# Format one report line for an entry { wiki_link => ..., ref_cnt => ... },
# listing the topics that mention the link in sorted order.
sub format_reference_line {
    my $entry     = shift;
    my $referrers = join( " ", sort keys %{ $seenIn{ $entry->{wiki_link} } } );
    return "${BULLET}$entry->{ref_cnt} : $entry->{wiki_link}"
        . " is referenced by : $referrers\n";
}

# Split the words in %seen into those that have a topic file and those that
# do not, and store the two formatted lists in $EXISTS and $NOTEXISTS.
# Both lists are ordered by descending reference count.
sub check_for_wiki_file_existence {
    my $webDataLocation = shift;

    my ( @exists, @notexists );
    for my $word ( keys %seen ) {
        my $entry = { wiki_link => $word, ref_cnt => $seen{$word} };

        # %files_seen also covers topics found in subdirectories when
        # recursing; the -e test alone only looked in the top directory.
        if ( $files_seen{$word} || -e "$webDataLocation/$word.txt" ) {
            push @exists, $entry;
        }
        else {
            push @notexists, $entry;
        }
    }

    # Name is the tie-breaker so equal counts do not come out in hash order.
    my $by_count = sub {
        $b->{ref_cnt} <=> $a->{ref_cnt}
            or $a->{wiki_link} cmp $b->{wiki_link};
    };

    $EXISTS    = join( "", map { format_reference_line($_) } sort $by_count @exists );
    $NOTEXISTS = join( "", map { format_reference_line($_) } sort $by_count @notexists );
    return;
}

# Collect every scanned topic whose name never occurred as a wiki word into
# @files_that_are_not_referenced and $FILES_THAT_ARE_NOT_REFERENCED.
sub check_for_files_that_are_not_referenced {
    for my $fn ( sort keys %files_seen ) {
        next if $seen{$fn};
        push @files_that_are_not_referenced, $fn;

        # This append used to sit outside the test, which listed every
        # topic as unreferenced.
        $FILES_THAT_ARE_NOT_REFERENCED .= "${BULLET}$fn\n";
    }
    return;
}

# Print the TWiki-formatted report to STDOUT.
# NOTE(review): the heading and the words before "[[ double bracket ]]" were
# missing from the reviewed copy and are reconstructed here; restore the
# original wording if it is available.
sub print_report {
    my $DATE = localtime;

    print <<"REPORT";
---+ Wiki link report

Generated $DATE.

This script does not handle [[ double bracket ]] and Web.Subweb.Topic and other !NonSimplyWikiWord links.
It gets confused by cross web links.
(Main.AndyGlew is fixing some of these problems,
but even with these problems this script is useful.)

*NB:* Do not add pages that are names of people.
Those pages will be created when he or she registers in TWiki (in TWiki.TWikiRegistration).
This also does not properly detect automatic plurals or some TWiki or Main web pages.

Use this page to discover:
   * which important pages are missing (top of the list)
   * spelling errors in page links (bottom of the list)
      * i.e. the list is sorted by frequency of reference

---++ Pages that are referenced but which which do not exist

$NOTEXISTS

---++ Pages that are not referenced by other pages.

$FILES_THAT_ARE_NOT_REFERENCED

---++ Pages that are referenced and which exist.

$EXISTS
REPORT
    return;
}