#!/usr/bin/perl
#
# Export a MediaWiki page, plus every page reachable from it through internal
# links, as TWiki-syntax "<name>.txt" files in the current directory, and then
# check those files into RCS.
#
# Usage: <script> host [start_page] [user] [database] [password]
#
# The page data is read from the mw_page, mw_revision, mw_text and
# mw_pagelinks tables of the given MySQL database.

use strict;
use warnings;
use DBI;

my ($host, $start_page, $user, $dbase, $pass) = @ARGV;
die "Usage: $0 host [start_page] [user] [database] [password]\n"
    unless ($host);
$start_page ||= 'Main_Page';
$dbase      ||= 'wikidb';
$user       ||= 'root';

# RaiseError makes a failed connect (and any failed query) die with the
# driver's message instead of continuing with an undefined handle.
my $db = DBI->connect("DBI:mysql:database=$dbase;host=$host", $user, $pass,
                      { RaiseError => 1, PrintError => 0 });

my $page = get_page($db, $start_page)
    or die "Couldn't find page $start_page\n";
transform_page($page, translate_page($page));
output_page($page);
$db->disconnect;

# Check in every .txt file of the current directory.  The list form of
# system() keeps the description and the file names away from the shell.
my @files = glob("*.txt");
if (@files) {
    system('ci', '-u', '-t-Initial revision imported from MediaWiki', @files) == 0
        or warn "ci failed (status $?)\n";
    # "ci -u" leaves the working files read-only; make them writable again.
    chmod 0644, @files;
}

# get_page($db, $title [, $namespace [, $seen]])
#
# Load the latest revision of a page and, recursively, of every page it
# links to.
#
#   $db        - connected DBI handle
#   $title     - page_title to look up
#   $namespace - optional page_namespace; when undefined the lookup is by
#                title only
#   $seen      - optional hash ref of page_ids already loaded; callers
#                normally omit it and the recursion shares one internally
#
# Returns a hash ref { title, text, namespace, links => [ { title, page } ] },
# or nothing when the page does not exist or has already been loaded.
# Dies when the page row exists but its revision or text row is missing.
sub get_page {
    my ($db, $title, $namespace, $seen) = @_;
    $seen ||= {};

    # First find the page row.  Namespace 0 is a real namespace, so test
    # for definedness rather than truth.
    my $sql  = 'SELECT * FROM mw_page WHERE page_title = ?';
    my @bind = ($title);
    if (defined $namespace && length $namespace) {
        $sql .= ' AND page_namespace = ?';
        push @bind, $namespace;
    }
    my $pg = $db->selectrow_hashref($sql, undef, @bind)
        or return;

    # Pages can link to each other in cycles; load each page only once.
    return if $seen->{ $pg->{page_id} }++;

    my $rev = $db->selectrow_hashref(
        'SELECT * FROM mw_revision WHERE rev_id = ?',
        undef, $pg->{page_latest})
        or die "Couldn't find last revision $pg->{page_latest} for page $title";

    my $text = $db->selectrow_hashref(
        'SELECT * FROM mw_text WHERE old_id = ?',
        undef, $rev->{rev_text_id})
        or die "Couldn't find text $rev->{rev_text_id} for page $title";

    my $ret = {
        title     => $title,
        text      => $text->{old_text},
        namespace => $pg->{page_namespace},
        links     => [],
    };

    # Fetch all link rows before recursing so that no statement is left
    # half-read while further queries run on the same connection.
    my $links = $db->selectall_arrayref(
        'SELECT * FROM mw_pagelinks WHERE pl_from = ?',
        { Slice => {} }, $pg->{page_id});
    foreach my $link (@$links) {
        my $sub_page = get_page($db, $link->{pl_title},
                                $link->{pl_namespace}, $seen);
        push @{ $ret->{links} },
            { title => $link->{pl_title}, page => $sub_page }
            if $sub_page;
    }

    return $ret;
}

# translate_page($page [, $file_transforms])
#
# Rewrite the MediaWiki markup in $page->{text} (and in every linked page)
# into TWiki markup, in place.  When a title contains '.', '_' or "'", the
# page also gets a {link_name} (the title without '.' and "'") and its
# {title} has underscores replaced by spaces.
#
# Returns the array ref of [ title, link_name ] pairs collected for such
# pages; transform_page() uses it to rewrite links pointing at them.
sub translate_page {
    my ($page, $file_transforms) = @_;
    $file_transforms ||= [];

    if ($page->{title} =~ /[\._']/) {
        $page->{link_name} = $page->{title};
        $page->{title}     =~ s/_/ /g;
        $page->{link_name} =~ s/[\.']//g;
        push @$file_transforms, [ $page->{title}, $page->{link_name} ];
    }

    my $text = defined $page->{text} ? $page->{text} : '';

    # These regular expressions were snarfed from:
    # http://twiki.org/cgi-bin/view/Plugins/MediawikiEditSyntaxRegex
    #
    # "for" aliases $_ to $text for the substitutions without clobbering
    # the caller's $_.
    for ($text) {
        s/(^|[\n\r])\; *([^:]*):/$1   \$ $2\:/g; # ; definition : term

        # Lists, up to 10 levels deep.  The number of markers gives the
        # depth, the last marker the list type; TWiki wants three spaces
        # of indentation per level.
        s/(^|[\n\r])([\#\*]{0,9})\# /$1 . ('   ' x (length($2) + 1)) . '1. '/ge; # numbered
        s/(^|[\n\r])([\#\*]{0,9})\* /$1 . ('   ' x (length($2) + 1)) . '* '/ge;  # bullet

        s/\[\[Image\:([^\]]*)\]\]/\%ATTACHURL\%\/$1/g; # [[Image:file.png]]
        s/\[\[([^\|]*)\|(\1)\]\]/\[\[$1\]\]/g; # internal link [[WikiWord|WikiWord]]
        s/\[\[([^\|]*)\|([^\]]*?)\]\]/\[\[$1\]\[$2\]\]/g; # internal link [[WikiWord|label]]
        s/(?=[^\]]|^)\[(https?\:.*?) (.*?)\](?=[^\]]|$)/\[\[$1\]\[$2\]\]/g; # external link [http:... label]

        s/(^|[\n\r])======(.*?) ?======(?=[\n\r]|$)/$1---\+\+\+\+\+\+$2/g; # H6
        s/(^|[\n\r])=====(.*?) ?=====(?=[\n\r]|$)/$1---\+\+\+\+\+$2/g; # H5
        s/(^|[\n\r])====(.*?) ?====(?=[\n\r]|$)/$1---\+\+\+\+$2/g; # H4
        s/(^|[\n\r])===(.*?) ?===(?=[\n\r]|$)/$1---\+\+\+$2/g; # H3
        s/(^|[\n\r])==(.*?) ?==(?=[\n\r]|$)/$1---\+\+$2/g; # H2
        s/(^|[\n\r])=(.*?) ?=(?=[\n\r]|$)/$1---\+$2/g; # H1

        # The tag-based rules must consume the opening tag as well as the
        # closing one, otherwise the opening tag is left in the output.
        s/(^|[\s\(])<tt>([^ ].*?[^ ])<\/tt>([\s\)\.\,\:\;\!\?]|$)/$1=$2=$3/g; # monospaced
        s/(^|[\s\(])'''<tt>([^ ].*?[^ ])<\/tt>'''([\s\)\.\,\:\;\!\?]|$)/$1==$2==$3/g; # monospaced bold
        s/(^|[\s\(])'''([^ ].*?[^ ])'''([\s\)\.\,\:\;\!\?]|$)/$1\*$2\*$3/g; # bold
        s/(^|[\s\(])''<b>([^ ].*?[^ ])<\/b>''([\s\)\.\,\:\;\!\?]|$)/$1\_\_$2\_\_$3/g; # italic bold
        s/(^|[\s\(])''([^ ].*?[^ ])''([\s\)\.\,\:\;\!\?]|$)/$1\_$2\_$3/g; # italic
    }

    # Replace the page content
    $page->{text} = $text;

    # ... and recurse
    foreach my $link (@{ $page->{links} }) {
        translate_page($link->{page}, $file_transforms);
    }

    return $file_transforms;
}

# transform_page($page, $transform_list)
#
# For every [ from, to ] pair in $transform_list, rewrite "[[from]]" links
# in $page->{text} to "[[to][from]]", then prepend a "---+!!" title line.
# The same is done for every linked page.  Modifies the pages in place.
sub transform_page {
    my ($page, $transform_list) = @_;

    foreach my $xform (@$transform_list) {
        my ($from, $to) = @$xform;

        # \Q...\E escapes every regex metacharacter in the title, not
        # just dots and parentheses.
        $page->{text} =~ s/\[\[\Q$from\E\]\]/\[\[$to\]\[$from\]\]/g;
    }

    # Place a title at the beginning of the page, like MediaWiki would
    # have automatically done
    $page->{text} = "---+!! $page->{title}\n$page->{text}";

    # ... and recurse
    foreach my $link (@{ $page->{links} }) {
        transform_page($link->{page}, $transform_list);
    }

    return;
}

# output_page($page)
#
# Write $page->{text} to "<link_name or title>.txt" in the current
# directory, then do the same for every linked page.  A file that cannot
# be written is reported with a warning and skipped, so the rest of the
# export still runs.
sub output_page {
    my $page = shift;
    my $file = ($page->{link_name} || $page->{title}) . '.txt';

    if (open(my $out, '>', $file)) {
        print {$out} $page->{text};
        close($out) or warn "Couldn't finish writing $file: $!\n";
    }
    else {
        warn "Couldn't open $file for writing: $!\n";
    }

    # ... and recurse
    foreach my $link (@{ $page->{links} }) {
        output_page($link->{page});
    }

    return;
}