#!/usr/bin/perl -w

# DESCRIPTION: xml2sql_helper - line filter for a Wiki XML dump.
#
# Copies the input file to the output file line by line, leaving out:
#   * every line that contains one of the markers listed in @DROP_MARKERS;
#   * every line from the line matching $SKIP_BLOCK_OPEN through the line
#     matching $SKIP_BLOCK_CLOSE (both inclusive).
#
# NOTE(review): in the copy of this script that was reviewed, every XML tag
#   literal (in the comments and in the code) had been stripped out, leaving
#   empty strings.  The markers below were reconstructed from the example
#   values that survived in the comments ("8", "wikitext", "text/x-wiki",
#   "5322", "0u1...") and from the MediaWiki export format.  The name of the
#   skipped multi-line block in particular is a best guess.
#   TODO: confirm all of them against a real dump before relying on the output.
#
# AUTHOR: Andrew Krizhanovsky (http://code.google.com/p/wikokit)
# START DATE: 2009
# FINISH DATE: 2013
# SEE:
#   Talk:Xml2sql    http://meta.wikimedia.org/wiki/Talk:Xml2sql
#   MySQL 2 SQLite  http://www.perlmonks.org/index.pl?node_id=150476
#   My project wiwordik:
#                   http://code.google.com/p/wikokit/wiki/MySQL_import

use strict;
use warnings;

# Neither module is used by the code below; both are kept only because the
# original script loaded them.
use DBI;
use Getopt::Std;

my $headline = "xml2sql_helper V0.04 (GNU-GPL) 2009-2013 AKA MBG \n";

# Substrings that cause a line to be dropped when found anywhere in it.
# NOTE(review): reconstructed values - see the note in the file header.
my @DROP_MARKERS = (
    '<redirect />',
    '<redirect title=',
    '<ns>',          # e.g. "8"
    '<model>',       # e.g. "wikitext"
    '<format>',      # e.g. "text/x-wiki"
    '<parentid>',    # e.g. "5322"
    '<sha1>',        # e.g. "0u1..."
    '<sha1 />',
);

# First and last line of the multi-line block that is removed as a whole.
# Surrounding whitespace is tolerated, so the indentation used in the dump
# (and a trailing CR) does not matter.
# NOTE(review): the element name is a best guess - see the file header.
my $SKIP_BLOCK_OPEN  = qr{\A\s*<DiscussionThreading>\s*\z};
my $SKIP_BLOCK_CLOSE = qr{\A\s*</DiscussionThreading>\s*\z};

# --------------------------------------------------------------
# subroutine help_exit
#
# Prints the usage text and terminates the program (exit status 0, as
# before) unless exactly two command line arguments were given.
# Returns normally, without printing anything, otherwise.
# --------------------------------------------------------------
sub help_exit {
    return if @ARGV == 2;

    print "\n".$headline."\n".
          "Usage:\n xml2sql_helper in_dump.sql out_dump.sql\n".
          " in_dump.sql - dump of Wiki database (wiki-pages-articles.xml)\n".
          "Examples: xml2sql_helper wiki-pages-articles.xml out.xml";
    close(STDOUT);    # baffle banners ;)
    exit(0);
}

# --------------------------------------------------------------
# subroutine has_drop_marker
#
# Returns 1 if $line contains any entry of @DROP_MARKERS, 0 otherwise.
# --------------------------------------------------------------
sub has_drop_marker {
    my ($line) = @_;

    for my $marker (@DROP_MARKERS) {
        return 1 if index($line, $marker) != -1;
    }
    return 0;
}

help_exit();

print "\n".$headline."\n";
print "Processing files ...\n";

# READ COMMAND LINE
# --------------------------------------------------------------
# help_exit() guarantees that there are exactly two arguments
my ($fn_in, $fn_out) = @ARGV;

# Opening the output for writing truncates it, which would destroy the
# input if both names referred to the same path.
die "Input and output file must differ: $fn_in\n" if $fn_in eq $fn_out;

# Three-argument open with lexical handles: the file names are taken
# literally, so characters such as '>' or '|' in them cannot alter the mode.
open(my $in_fh, '<', $fn_in)
    or die "Cannot open input file $fn_in: $!";
open(my $out_fh, '>', $fn_out)
    or die "Cannot open output file $fn_out: $!";

my $in_skipped_block = 0;    # 1 == current line belongs to the skipped block

LINE:
while (my $line = <$in_fh>) {
    chomp($line);

    # Inside the block: drop everything up to and including the closing line.
    if ($in_skipped_block) {
        $in_skipped_block = 0 if $line =~ $SKIP_BLOCK_CLOSE;
        next LINE;
    }

    # The opening line starts the block and is dropped as well.
    if ($line =~ $SKIP_BLOCK_OPEN) {
        $in_skipped_block = 1;
        next LINE;
    }

    next LINE if has_drop_marker($line);

    # Print the line to the result file and add a newline
    print {$out_fh} $line, "\n"
        or die "Cannot write to output file $fn_out: $!";
}

close($in_fh);

# Buffered write errors only surface when the handle is closed.
close($out_fh)
    or die "Cannot close output file $fn_out: $!";

print "\n";
close(STDOUT);    # baffle banners ;)