#!/usr/bin/perl -w
#
# Migrate from phpwiki to twiki
#
# Use phpwiki dump tool, dump as raw text
#
# Juan Grigera
# ver 0.1 - 20050502
# ver 0.2 - 20050809
# Kevin Seghetti added:
#   Added reading of source and destination directories from the command line
#   Extraction of only the latest revision from a full backup (reads first mime chunk, discards the rest).
#   Fixed bug where HEADERS weren't being cleared before each file
#   Added filtering out of DOS line endings (don't know if all phpwiki backups have them, but mine did)
#   Conversion of %%% to %BR%
#   removal of %## in file names (usually used for spaces), uppercasing of first letter after each space
#   insertion of a space after * in lists
#   changed from removing forced links to converting them to twiki forced links
#   fixed a bug in readHeader where it was expected that a failed match would undefine $1 (it doesn't)
# ver 0.3 - 20050907
# George Stojanoff (GCS) added:
# - The fix filename code (used in convertFile()) has been pulled into its own subroutine FixFilename
# - Added the FixLink() subroutine which is an attempt at a smarter conversion of link names. This
#   needs more work especially with respect to handling all of the "special" characters. Currently we
#   only handle "space letter", "dot letter", "single-quote, letter" and "ampersand letter". FixLink()
#   is called from filterWiki() when converting [link] to [[link]] and [description|link] to
#   [[link][description]].
# - Improved the way lists (bullet and numbered) are converted. List conversion used to lose all
#   structure e.g. every phpwiki list was converted into a flat twiki list. We now convert by guessing
#   that every 2 spaces is an indent. At worst this will create too many indents but at least the
#   general structure is mostly preserved.
# - Added wiki word escaping syntax conversion of phpwiki ~ to twiki !
#
use strict;
use warnings;
use Data::Dumper;

my $VERSION = '0.3';
use constant DEBUG => 0;

# Headers of the dump file currently being converted (filled by readHeader()).
my %HEADERS;

# Command line: source directory holding the phpwiki raw-text dump, then the
# directory that receives the generated twiki topic files.
my $PHPWIKI_DUMP = shift;
my $DEST_DIR     = shift;

if (!defined $DEST_DIR) {
    print STDERR "usage: php2twiki <phpwiki dump dir> <twiki dest dir>\n";
    exit(1);
}

# MIME boundary line of the dump file currently being converted, if any.
my $boundary;

# Iterate dump: every plain file in the dump directory is one wiki page.
# A lexical directory handle is used, and the explicit defined() keeps an
# entry literally named "0" from ending the loop early.
opendir(my $dump_dh, $PHPWIKI_DUMP) || die "can't opendir $PHPWIKI_DUMP: $!";
while (defined(my $file = readdir($dump_dh))) {
    convertFile($file) if -f "$PHPWIKI_DUMP/$file";
}
closedir($dump_dh);

#
# Return a copy of the given string with the ASCII letters a-z upper-cased.
# All other characters are left untouched.
sub UpperCaseString {
    my $value = shift;
    $value =~ tr/a-z/A-Z/;
    return $value;
}

#
# fix filename, used in convertFile()
#
# Takes the name of a phpwiki dump file and returns the twiki topic name
# derived from it (without any extension).
sub FixFilename {
    my ($file) = @_;

    # convert phpwiki HomePage to twiki WebHome
    $file = "WebHome" if ($file eq "HomePage");

    # make sure first character is upper case
    $file =~ s/^(\w)/UpperCaseString($1)/e;

    # convert any "dot letter" to "upper case letter" in the filename;
    # repeated until a pass makes no replacement
    1 while $file =~ s/\.(.)/UpperCaseString($1)/eg;

    # convert any "% digit digit letter" to "upper case letter" in the filename
    # since the substitution character may be another %, we have to re-run
    # this until there are 0 matches
    1 while $file =~ s/%..(.)/UpperCaseString($1)/eg;

    # catch any %dd at the end of the string
    $file =~ s/%..$//;

    return $file;
}

#
# fix linkname, used in filterWiki()
# GCS: attempt at a smarter conversion of link names. This needs more work
#      especially with respect to handling all of the "special" characters
#      Currently we only handle "space letter", "dot letter"
#      "single-quote, letter" and "ampersand letter". FixLink()
#      is called from filterWiki() when converting [link] to [[link]] and
#      [description|link] to [[link][description]].
# Takes the target of a phpwiki link and returns the twiki link target.
# http:// and https:// URLs are returned unchanged; anything else is turned
# into a run-together name by upper-casing its first character and replacing
# every "separator + character" pair with the upper-cased character.
sub FixLink {
    my ($file) = @_;

    # external URLs must not be rewritten
    return $file if $file =~ m{^https?://};

    # convert phpwiki HomePage to twiki WebHome
    $file = "WebHome" if ($file eq "HomePage");

    # make sure first character is upper case
    $file =~ s/^(\w)/uc($1)/e;

    # convert any "space letter", "dot letter", "single-quote letter" and
    # "ampersand letter" to "upper case letter". The replacement may itself
    # be followed by another separator, so repeat until nothing matches.
    1 while $file =~ s/[ .'&](.)/uc($1)/eg;

    # to do: deal with other special characters ...

    return $file;
}

# True while filterWiki() is inside a <pre> ... </pre> section. This
# assignment only runs after the main loop at the top of the file has
# finished, so convertFile() resets the flag itself for every file.
my $verbatim = 0;

#
# convert a File
#
# Reads the dump file $file from $PHPWIKI_DUMP, and writes the converted topic
# to "$DEST_DIR/<fixed name>.txt". Only the first revision found in the dump
# is converted: reading stops at the first MIME boundary line after the text.
# Dies when either file cannot be opened or the output cannot be closed.
sub convertFile {
    my ($file) = @_;
    my $flag_header = 1;
    my ($HANDLE, $OUTPUT);

    # Three-argument open: the name comes straight from the directory listing
    # and must never be interpreted as an open mode.
    open($HANDLE, '<', "$PHPWIKI_DUMP/$file")
        or die "can't open read file $file: $!";

    # convert to an acceptable twiki filename
    $file = FixFilename($file);
    open($OUTPUT, '>', "$DEST_DIR/$file.txt")
        or die "can't open dest file $file $!";

    # clear out any state left over from the last file
    %HEADERS  = ();
    $boundary = undef;
    $verbatim = 0;

    while (my $line = <$HANDLE>) {

        # Read headers
        if ($flag_header) {
            if ($line !~ /^\r?$/) {
                readHeader($line);
                next;
            }

            # Blank line: end of a header section. If the header is mixed,
            # then the latest revision header follows, so keep reading headers.
            my $type = $HEADERS{'Content-Type'};
            next if defined $type && $type eq 'multipart/mixed;';

            $flag_header = 0;

            # Print header; attributes that are unknown are simply left out
            my @attrs;
            push @attrs, 'author="' . $HEADERS{'author'} . '"'
                if defined $HEADERS{'author'};
            push @attrs, 'date="' . $HEADERS{'lastmodified'} . '"'
                if defined $HEADERS{'lastmodified'};
            push @attrs, 'format="1.0"', 'version="1.1"';
            print $OUTPUT '%META:TOPICINFO{' . join(' ', @attrs) . "}%\n";

            # A part delimiter is the boundary parameter (quotes removed)
            # prefixed with "--".
            if (defined $HEADERS{'boundary'}) {
                my $marker = $HEADERS{'boundary'};
                $marker =~ s/^"//;
                $marker =~ s/"$//;
                $boundary = "--" . $marker;
            }
            #print "done reading header, values are ", Dumper(\%HEADERS), "\n";
            next;
        }

        # Body: drop the DOS carriage return, stop at the end of this revision
        $line =~ s/\r$//;
        my $bare = $line;
        chomp $bare;
        last if defined $boundary && $bare eq $boundary;

        filterWiki($line, $OUTPUT);
    }

    close($OUTPUT) or die "can't close dest file $file $!";
    close($HANDLE);
    return;
}

#
# parse phpwiki Header
#
# Stores one header line in %HEADERS. Two shapes are recognised:
# "Key: value" and (optionally indented) "key=value". Lines of any other
# shape are ignored.
sub readHeader {
    my ($line) = @_;
    my ($key, $value);

    if ($line =~ m/^([^:]+): (.*)\r?$/) {
        ($key, $value) = ($1, $2);
    } elsif ($line =~ m/^\s*([^=]+)=(.*)\r?$/) {
        ($key, $value) = ($1, $2);
    } else {
        # neither shape: nothing to record
        return;
    }

    $value =~ s/\r|\n//g;
    $HEADERS{$key} = $value;
    return;
}

# Builds a twiki forced link: [[target]] or, when a label is given,
# [[target][label]]. The target is normalised with FixLink().
sub _wikiLink {
    my ($target, $label) = @_;
    my $link = "[[" . FixLink($target) . "]";
    $link .= "[" . $label . "]" if defined $label;
    return $link . "]";
}

#
# Filter php-Wiki format
#
# Converts one line of phpwiki markup to twiki markup and prints the result
# to the filehandle $OUTPUT. Lines inside <pre> ... </pre> are printed
# unconverted (the tags themselves become <verbatim> ... </verbatim>).
sub filterWiki {
    my ($line, $OUTPUT) = @_;

    # check verbatim
    if ($line =~ m/<pre>/) {
        $line =~ s/<pre>/<verbatim>/;
        $verbatim = 1;
    }
    if ($verbatim && $line !~ m/<\/pre>/) {
        print $OUTPUT $line;
        return;
    }

    if ($line =~ m/<\/pre>/) {
        $line =~ s/<\/pre>/<\/verbatim>/;
        $verbatim = 0;
    }

    # Lists: * for bullet lists, # for numbered lists
    # GCS: deal with indents by guessing 1 tab every 2 spaces
    #      at worst, this will create too many indents but
    #      the general structure should be preserved
    # The match is made against $line itself, not the caller's $_.
    if ($line =~ m/^(\s*)([*#])(.*)/) {
        # one tab for the list itself plus one per (started) pair of spaces
        my $numTabs = int((length($1) + 1) / 2) + 1;
        my $marker  = ($2 eq '*') ? "*" : "1.";
        $line = ("\t" x $numTabs) . "$marker $3\n";
    }

    # Headings: phpwiki !!! is the largest heading
    $line =~ s/^!!!/---+/;
    $line =~ s/^!!/---++/;
    $line =~ s/^!/---+++/;

    # line breaks
    $line =~ s/%%%/%BR%/g;

    # Tables: a row is recognised by its trailing "|" and gets a leading one
    if ($line =~ m/\|\s*$/) {
        $line =~ s/\r\n//;
        $line = "|" . $line;
    }

    # Forced links
    # GCS: Convert [link] to [[link]] and [description|link] to [[link][description]]
    # Plain [link] conversion is only attempted when the line holds no
    # [description|link] form.
    if ($line =~ s/\[\s*([^|\[]+)\|\s*([^\]]+)\]/_wikiLink($2, $1)/eg) {
    } else {
        $line =~ s/\[\s*([^\[]*)\s*]/_wikiLink($1)/eg;
    }

    # GCS: WikiWord escaping (convert ~ to !)
    $line =~ tr/\~/!/;

    print $OUTPUT $line;
    return;
}