#!/usr/bin/perl
# $Id: mw2twiki.pl,v 1.2 2006/08/28 14:37:51 john Exp $

my $VERSION = '0.1';

use strict;
use utf8;

use Getopt::Long;
use Pod::Usage;

# Turn on autoflush for the currently selected output handle (STDOUT).
$| = 1;

# Database connection defaults; each one can be overridden on the
# command line (--dsn, --user, --password).
my $dsn      = "DBI:Pg:dbname=mediawiki;host=localhost";
my $user     = 'john';
my $password = 'reformed';

# Boolean switches; all are unset unless given on the command line.
my $title_test;
my $verbose;
my $version;
my $replace;

# --man and --help both print the full manual and exit.
my $show_manual = sub { pod2usage( -verbose => 2 ) };

my $options_ok = GetOptions(
	'dsn=s'       => \$dsn,
	'user=s'      => \$user,
	'password=s'  => \$password,
	'verbose!'    => \$verbose,
	'version!'    => \$version,
	'title-test!' => \$title_test,
	'replace!'    => \$replace,
	'man!'        => $show_manual,
	'help!'       => $show_manual,
);

# GetOptions returns false when it met an option it could not parse.
if (!$options_ok) {
	pod2usage( -verbose => 1, -message => 'invalid option' );
}

# --version: report the version and stop before touching the database.
if ($version) {
	print "mw2twiki.pl version $VERSION\n";
	exit;
}

# The following are the values for page_namespace as I understand them.
# The query further down selects namespaces 0 and 14 (main pages and
# categories); 4 (the local wiki) may also be of interest.
#
# 0 (Main)
# 1 Talk
# 2 User
# 3 User talk
# 4 Local Wiki
# 5 Local Wiki talk
# 6 Image
# 7 Image talk
# 8 MediaWiki
# 9 MediaWiki talk
# 10 Template
# 11 Template talk
# 12 Help
# 13 Help talk
# 14 Category
# 15 Category talk

use DBI;

# The first non-option argument names the directory that receives the
# converted pages.
my $output_dir = shift @ARGV;

# An output directory is only needed when pages are actually written;
# --title-test merely prints the title mapping.
unless ($title_test) {
	if (!$output_dir) {
		pod2usage(
			-verbose => 1,
			-message => "no output directory specified",
		);
	}

	if (! -d $output_dir) {
		pod2usage(
			-verbose => 1,
			-message => "output directory does not exist",
		);
	}
}

# RaiseError makes every later DBI failure die; ShowErrorStatement adds
# the offending SQL to the error text.
my %connect_attr = (
	RaiseError         => 1,
	ShowErrorStatement => 1,
	pg_enable_utf8     => 1,
);

my $db = DBI->connect( $dsn, $user, $password, \%connect_attr )
	or die 'could not connect to database';

# The following SQL query is for a MediaWiki PostgreSQL database.
# Undoubtedly the syntax will change for a MySQL installation.
# You will want to check the WHERE clause below to see if it
# will do what you want.  It worked for me.

# The following are the values for page_namespace as I understand them.
# The query selects namespaces 0 and 14 (the main wiki and categories).
# It might be that you are also interested in 4 (the local wiki) as
# well.  Take a look at your data.

# 0 (Main)
# 1 Talk
# 2 User
# 3 User talk
# 4 Local Wiki
# 5 Local Wiki talk
# 6 Image
# 7 Image talk
# 8 MediaWiki
# 9 MediaWiki talk
# 10 Template
# 11 Template talk
# 12 Help
# 13 Help talk
# 14 Category
# 15 Category talk

# Make the MediaWiki tables resolvable without a schema prefix.
$db->do( "SET search_path TO mediawiki, public" );

# One row per stored revision of every selected page.  Pages whose
# page_restrictions mention 'sysop' are left out, and only namespaces
# 0 and 14 are selected.  Rows come back grouped by page and, within
# a page, ordered by text id.
my $page_sql = "
	SELECT
		*

	FROM
		page
		LEFT OUTER JOIN revision ON (page_id=rev_page)
		LEFT OUTER JOIN text ON (rev_text_id=old_id)

	WHERE
		page_restrictions NOT LIKE '%sysop%'
		AND page_namespace IN (0, 14)

	ORDER BY
		page_id, old_id";

my $sth = $db->prepare( $page_sql );

$sth->execute();

# Quote a string so that /bin/sh treats it as exactly one literal word.
# The string is wrapped in single quotes and every embedded single quote
# is written as '\'' (close quote, escaped quote, reopen quote).
sub shell_quote {
	my ($word) = @_;

	$word = '' if !defined $word;
	$word =~ s/'/'\\''/g;

	return "'$word'";
}

# Walk the result set.  Rows arrive grouped by page, so a change of title
# marks the start of a new page; every row is checked in as one RCS
# revision of "<output_dir>/<Title>.txt".
my $last_page;
while (my $row = $sth->fetchrow_hashref()) {
	my $is_cat = $row->{page_namespace} == 14 ? 1 : 0;

	if ($verbose) {
		use Data::Dumper;
		#print STDERR Dumper( $row );
	}

	my $title = camelize_title( $row->{page_title} );

	$title = 'Category' . $title if $is_cat;

	my $new_page = !defined $last_page || $title ne $last_page;

	# --title-test: only report the title mapping, once per page.
	if ($title_test) {
		if ($new_page) {
			print "$row->{page_title} --> $title\n";
			$last_page = $title;
		}
		next;
	}

	my $file_name = "$output_dir/$title.txt";
	# The RCS archive sits next to the working file.  $file_name already
	# contains the output directory, so it must not be prefixed again.
	my $rcs_file = "$file_name,v";

	if ($new_page) {
		# The previous page is complete: check out its latest revision so
		# the working file exists again (ci removes it after check-in).
		# co is given the working file name so the file is created in the
		# output directory rather than in the current directory.
		if ($last_page) {
			exec_command( 'co ' . shell_quote( "$output_dir/$last_page.txt" ) );
		}

		# Refuse to clobber existing files unless --replace was given.
		for my $stale ($file_name, $rcs_file) {
			next if !-f $stale;
			die "$stale already exists." if !$replace;
			print STDERR "removing $stale\n" if $verbose;
			unlink $stale or die "Unable to remove $stale: $!";
		}
	}

	my $author = $row->{rev_user_text};
	# could be an IP address so convert any dots to dashes
	$author =~ s/\./-/g;
	# RCS doesn't seem to like spaces in usernames
	$author =~ s/ /_/g;

	my $msg = $row->{rev_comment} ? $row->{rev_comment} : 'none';

	my $text = translate_text( $row->{old_text} );
	if ($is_cat) {
		# Category pages list their members through a TWiki search.
		$text .= qq[\n%SEARCH{"$title" nosearch="on" nosummary="on"}%];
	}

	open my $page, '>', $file_name or die "Unable to open $file_name: $!";
	binmode $page, ':utf8';
	print {$page} $text;
	close $page or die "Unable to write $file_name: $!";

	# Every revision after the first needs the archive locked before ci
	# will accept a new check-in.
	if (!$new_page) {
		exec_command( 'rcs -l ' . shell_quote( $rcs_file ) );
	}

	# 'none' on stdin answers ci's prompt for a file description.  All
	# database-supplied values are shell-quoted so that quotes, dollars,
	# backticks or backslashes in them cannot alter the command.
	exec_command(
		"echo 'none' | ci"
		. ' -w' . shell_quote( $author )
		. ' -d' . shell_quote( $row->{rev_timestamp} )
		. ' -m' . shell_quote( $msg )
		. ' ' . shell_quote( $file_name )
	);

	#exec_command( "USER=$user; rcs -u '$rcs_file'" );

	$last_page = $title;
}

# Check out the working file of the very last page as well.
if ($last_page && !$title_test) {
	exec_command( 'co ' . shell_quote( "$output_dir/$last_page.txt" ) );
}


# Turn a MediaWiki page title into a TWiki-style WikiWord.
#
# Quotes are removed, the title is cut into words at every character
# that is not an ASCII letter or a digit, each word is re-capitalised
# and the words are glued back together.
#
# Takes the MediaWiki title and returns the camelized title.  When the
# file-level $verbose flag is set the mapping is also reported on STDERR.
sub camelize_title {
	my ($mw_title) = @_;

	my $upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
	my $lower = 'abcdefghijklmnopqrstuvwxyz';

	# Quotes are dropped outright instead of acting as word separators.
	(my $unquoted = $mw_title) =~ s/['"]//g;

	my $title = join '', map {
		my $word = $_;

		# A run of capitals directly in front of a capitalised word
		# ("HTTPServer") keeps only its leading capital ("HttpServer").
		1 while $word =~ s/([$upper]{2,})([$upper][$lower])/ucfirst( lc( $1 ) ) . $2/e;

		# Any run of capitals that is left ("CMS") becomes "Cms".
		1 while $word =~ s/([$upper]{2,})/ucfirst( lc( $1 ) )/e;

		# Whatever precedes the last digit run is lower-cased and then
		# capitalised again ("MyPage2" ends up as "Mypage2").
		$word =~ s/(\w+)(\d+)/ucfirst( lc( $1 ) ) . $2/e;

		# Whatever follows the first digit run gets the same treatment.
		$word =~ s/(\d+)(\w+)/$1 . ucfirst( lc( $2 ) )/e;

		ucfirst( $word );
	} split /[^abcdefghijklmnopqrstuvwxyz\d]/i, $unquoted;

	print STDERR "Page Title: $mw_title --> $title\n" if $verbose;

	return $title;
}


# Print the camelized form of a handful of sample titles to STDOUT.
# A manual self-check for camelize_title(); nothing in this file calls
# it (the --title-test option is handled in the main loop instead).
sub title_test {
	my @tests = qw( INB1 SimpleCMS MBHS-CCAT This_is_A_test John's_Books );

	# A plain loop: the output is the point, not a result list.
	for my $sample (@tests) {
		print "$sample --> " . camelize_title( $sample ) . "\n";
	}

	return;
}


# Run a command line through the shell and hand back what it wrote to
# STDOUT.
#
# Takes the complete command line as one string; the caller is
# responsible for quoting.  Returns the captured output: a list of lines
# in list context, one string otherwise.
#
# The command is echoed to STDERR first when the file-level $verbose
# flag is set.  A command that cannot be started or that exits with a
# non-zero status is reported with a warning instead of being ignored;
# the run continues either way.
sub exec_command {
	my ($cmd) = @_;

	print STDERR "$cmd\n" if $verbose;

	my @output = `$cmd`;

	# $? is -1 when the command could not be run at all; otherwise the
	# exit code sits in its high byte.
	if ($? == -1) {
		warn "could not run command ($!): $cmd\n";
	} elsif ($? != 0) {
		warn "command failed (exit status " . ($? >> 8) . "): $cmd\n";
	}

	return wantarray ? @output : join( '', @output );
}


# Convert MediaWiki markup to TWiki markup.
#
# Takes the MediaWiki page text (undef is treated as empty) and returns
# the converted text with trailing newlines removed.
#
# The conversion runs in four steps:
#   1. lines that start with a space are wrapped in <verbatim> blocks,
#   2. the substitution rules listed in $re_list are applied in order,
#   3. trailing newlines are stripped,
#   4. [[Category:...]] links are rewritten to [[Category<CamelTitle>]].
sub translate_text {
	my ($mw_text) = @_;

	# This list was simply pasted from Plugins.MediawikiEditSyntaxRegex page at twiki.org
	# It is incomplete and does not support:
	# * Tables
	# * definition lists
	# * compound lists like *#* or *#
	#
    # * X2T: /(^|[\n\r])\; *([^ ]*) ?\:/$1   \$ $2\:/ # ; definition : term
	my $re_list = q~;
   * X2T: /(^|[\n\r])\; *([^:]*):/$1   \$ $2\:/ # ; definition : term
   * X2T: /(^|[\n\r])[\#\*]{9}\# /$1                              1. / # level 10 bullet
   * X2T: /(^|[\n\r])[\#\*]{8}\# /$1                           1. / # level 9 bullet
   * X2T: /(^|[\n\r])[\#\*]{7}\# /$1                        1. / # level 8 bullet
   * X2T: /(^|[\n\r])[\#\*]{6}\# /$1                     1. / # level 7 bullet
   * X2T: /(^|[\n\r])[\#\*]{5}\# /$1                  1. / # level 6 bullet
   * X2T: /(^|[\n\r])[\#\*]{4}\# /$1               1. / # level 5 bullet
   * X2T: /(^|[\n\r])[\#\*]{3}\# /$1            1. / # level 4 bullet
   * X2T: /(^|[\n\r])[\#\*]{2}\# /$1         1. / # level 3 bullet
   * X2T: /(^|[\n\r])[\#\*]{1}\# /$1      1. / # level 2 bullet
   * X2T: /(^|[\n\r])\# /$1   1. / # level 1 bullet
   * X2T: /(^|[\n\r])[\#\*]{9}\* /$1                              \* / # level 10 bullet
   * X2T: /(^|[\n\r])[\#\*]{8}\* /$1                           \* / # level 9 bullet
   * X2T: /(^|[\n\r])[\#\*]{7}\* /$1                        \* / # level 8 bullet
   * X2T: /(^|[\n\r])[\#\*]{6}\* /$1                     \* / # level 7 bullet
   * X2T: /(^|[\n\r])[\#\*]{5}\* /$1                  \* / # level 6 bullet
   * X2T: /(^|[\n\r])[\#\*]{4}\* /$1               \* / # level 5 bullet
   * X2T: /(^|[\n\r])[\#\*]{3}\* /$1            \* / # level 4 bullet
   * X2T: /(^|[\n\r])[\#\*]{2}\* /$1         \* / # level 3 bullet
   * X2T: /(^|[\n\r])[\#\*]{1}\* /$1      \* / # level 2 bullet
   * X2T: /(^|[\n\r])\* /$1   \* / # level 1 bullet
   * X2T: /\[\[Image\:([^\]]*)\]\]/\%ATTACHURL\%\/$1/ # [[Image:file.png]]
   * X2T: /\[\[([^\|]*)\|(\1)\]\]/\[\[$1\]\]/ # internal link [[WikiWord|WikiWord]]
   * X2T: /\[\[([^\|]*)\|([^\]]*?)\]\]/\[\[$1\]\[$2\]\]/ # internal link [[WikiWord|label]]
   * X2T: /(?=[^\]])\[(https?\:.*?) (.*?)\](?=[^\]])/\[\[$1\]\[$2\]\]/ # external link [http:... label]
   * X2T: /(^|[\n\r])======(.*?) ?======(?=[\n\r]|$)/$1---\+\+\+\+\+\+$2/ # H6
   * X2T: /(^|[\n\r])=====(.*?) ?=====(?=[\n\r]|$)/$1---\+\+\+\+\+$2/ # H5
   * X2T: /(^|[\n\r])====(.*?) ?====(?=[\n\r]|$)/$1---\+\+\+\+$2/ # H4
   * X2T: /(^|[\n\r])===(.*?) ?===(?=[\n\r]|$)/$1---\+\+\+$2/ # H3
   * X2T: /(^|[\n\r])==(.*?) ?==(?=[\n\r]|$)/$1---\+\+$2/ # H2
   * X2T: /(^|[\n\r])=(.*?) ?=(?=[\n\r]|$)/$1---\+$2/ # H1
   * X2T: /(^|[\s\(])<tt>([^ ].*?[^ ])<\/tt>([\s\)\.\,\:\;\!\?]|$)/$1=$2=$3/ # monospaced
   * X2T: /(^|[\s\(])'''<tt>([^ ].*?[^ ])<\/tt>'''([\s\)\.\,\:\;\!\?]|$)/$1==$2==$3/ # monospaced bold
   * X2T: /(^|[\s\(])'''([^ ].*?[^ ])'''([\s\)\.\,\:\;\!\?]|$)/$1\*$2\*$3/ # bold
   * X2T: /(^|[\s\(])''<b>([^ ].*?[^ ])<\/b>''([\s\)\.\,\:\;\!\?]|$)/$1\_\_$2\_\_$3/ # italic bold
   * X2T: /(^|[\s\(])''([^ ].*?[^ ])''([\s\)\.\,\:\;\!\?]|$)/$1\_$2\_$3/ # italic
~;

	# A page without stored text is converted as an empty page.
	$mw_text = '' if !defined $mw_text;

	# The appended newline lets the line-oriented rules see a terminated
	# last line; it is stripped again further down.
	my $text = "$mw_text\n";

	# Wrap every line that starts with a space in its own <verbatim>
	# block, then merge blocks that are separated only by whitespace.
	$text =~ s/^( +.+)$/<verbatim>\n$1\n<\/verbatim>/mg;
	$text =~ s/\n<\/verbatim>\n([\n\s]*)<verbatim>/$1/g;

	# Pull the "/pattern/replacement/" part out of every "* X2T:" line
	# and turn it into the source text of a global substitution.
	my $regex = '(\/.*?\/.*?\/)( +\#| *$)';
	my @rules =
	  map{ s/.*?\* X2T\: *$regex.*/s${1}g/; $_ }
	  grep{ /\* X2T\: *$regex/ }
	  split( /[\n\r]+/, $re_list );

	# The rules are the static text above, not external input, so a
	# string eval is acceptable here.  A rule that fails to compile or
	# run is reported instead of being skipped silently.
	foreach my $re (@rules) {
		eval( "\$text =~ $re;" );
		warn "X2T rule failed ($re): $@" if $@;
	}

	$text =~ s/[\n\r]+$//;

	# Rewrite category links one at a time; each replacement no longer
	# contains "Category:", so the loop ends when all are converted.
	while ($text =~ /\[{2}Category:\s*([^\]+]+)\]{2}/i) {
		my $cat = 'Category' . camelize_title( $1 );

		$text =~ s/\[{2}Category:\s*([^\]+]+)\]{2}/[[$cat]]/i;
	}

	return $text;
}


__END__


=head1 NAME

mw2twiki.pl - convert MediaWiki pages to TWiki with history


=head1 SYNOPSIS

mw2twiki.pl [options] output_directory


=head1 DESCRIPTION

This program converts MediaWiki pages including their history to TWiki formatted
pages.  It is not intended for general use.  In order to use this program effectively
you should be familiar with RCS archives, databases, and Perl programming.

=head2 Note:

=over

=item *

The TWiki RCS archives are created in the output directory.  One RCS repository
file (a file ending in ',v') will be created for each page in your MediaWiki wiki.

=item *

RCS files (*.txt,v) and TWiki page files (*.txt) will need to have their
permissions and owners set to appropriate values before accessing this script.

=item *

Page history is created using RCS commands L<rcs>, L<ci>, and L<co> so these
need to be installed.

=item *

The conversion of MediaWiki formatted text to TWiki format is B<very> rough at this
point.  See B<TODO> section below.

=item *

Page title camelization assumes Mediawiki page titles are ASCII characters.  If
you have page titles with other encodings use the B<--title-test> option to see
what will happen to your titles.

=item *

It does not convert talk pages.

=item *

Mediawiki page redirects are probably not handled properly

=item *

Mediawiki page namespaces (the part before the colon) are currently ignored.  So
the page name Hardware:Routers is converted to a TWiki page named HardwareRouters.

=back

=head1 OPTIONS

=over

=item B<--dsn>=dsn

a DSN descriptor as defined in the L<DBI> man page.  A default value can be set by
modifying the program code

=item B<--user>=username

user name to use to log into the database server

=item B<--password>=password

password used to log into the database server

=item B<--replace>

Danger Will Robinson! Using this switch will overwrite existing RCS archives and
text with the converted version from the MediaWiki database.

=item B<--title-test>

this option is for testing the MediaWiki page titles camelization routine.  It prints
the camelized page titles of all MediaWiki pages.  It is probably a good idea to run
this first to make sure nothing freaky is being done to your page titles.

=item B<--verbose>

outputs verbose comments on STDERR

=item B<--version>

output program version number and exit

=back

=head1 TODO

=over

=item *

add more error checking.  There isn't much at this point.

=item *

improve the MediaWiki to TWiki format conversion.  It is currently known not to support
tables and nested ordered/unordered lists, verbatim sections (ouch).

=item *

implement conversion of MediaWiki categories

=back


=head1 AUTHOR

John Supplee


=head1 BUGS

Report any bugs to L<mailto:john@supplee.com>