User:XLinkBot/Code/LinkParser.pl
Appearance
< User:XLinkBot | Code
#!/usr/bin/perl
# Detach from the caller: fork() returns the child's PID (a true value) in the
# parent, which exits immediately; the child (fork returned 0) carries on and
# runs the rest of the script. If fork() fails it returns undef, so the script
# simply continues in the original process.
fork and exit;
# POE event loop plus its TCP client component (used for the server connection).
use POE qw (Component::Client::TCP);
# Provides decode_entities(), applied to the diff/page text in server_input().
use HTML::Entities;
# HTTP client used to download rendered diffs ($diffFetcher).
use LWP::UserAgent;
# Provides the Perlwikipedia class used to build $editor.
use perlwikipedia;
# Strictures apply from this point on (the fork line above is not covered).
use strict;
# ---------------------------------------------------------------------------
# Main script body: build the wiki/HTTP clients, load the LinkWatcher config,
# connect to the local LinkWatcher server and run the POE event loop.
#
# Usage: the only command-line argument is the TCP port of the server on
# 127.0.0.1.
#
# The file-level lexicals declared here ($editor, $diffFetcher, %settings,
# $heap, $kernel, $number_of_edits) are closed over by the handler subs that
# follow, so their names must stay as they are.
# ---------------------------------------------------------------------------

# Wiki client; server_input() calls get_text() on it to fetch full page text.
# NOTE(review): both constructor arguments are the literal "LinkParser"; their
# meaning is defined by the Perlwikipedia module -- confirm against its docs.
my $editor = Perlwikipedia->new( "LinkParser", "LinkParser" );

# HTTP client used by server_input() to download rendered diffs.
my $diffFetcher = LWP::UserAgent->new;
$diffFetcher->agent("LinkParser/2.0");

# Settings are "key=value" lines read from ./linkwatcher-config; lines that
# start with '#' are comments. 'debug' defaults to off and may be overridden
# by the config file.
my %settings;
$settings{'debug'} = 0;

# 'debug' is still 0 at this point, so this message is never printed; it is
# kept for parity with the "done" message below.
print "Reading config file...\n" if $settings{'debug'};

open( my $config_fh, '<', 'linkwatcher-config' )
    or die "Can't open LinkWatcher config: $!";
while ( my $line = <$config_fh> ) {

    # Drop the line terminator, including the CR of a CRLF-terminated file,
    # so that values never carry a stray "\r".
    $line =~ s/\r?\n\z//;
    next if $line =~ /^#/;

    # Key is everything before the first '=', value is the rest of the line.
    if ( $line =~ /(.+?)=(.+)/ ) {
        $settings{$1} = $2;
    }
}
close($config_fh);

# 'prefixes' is a '|'-separated list; in the visible code it is only echoed
# in debug mode.
my @prefixes = split( /\|/, $settings{'prefixes'} );
print( "Prefixes: " . join( " - ", @prefixes ) . "\n" ) if $settings{'debug'};
print "done\n" if $settings{'debug'};

# Port of the LinkWatcher server, taken from the command line. Fail early
# with a clear message instead of handing an undefined port to POE.
my $server_port = shift;
die "Usage: $0 <server-port>\n"
    unless defined $server_port && length $server_port;

# NOTE(review): presumably gives the server time to start listening before
# we connect -- confirm with whatever launches this script.
sleep 4;

POE::Component::Client::TCP->new(
    RemoteAddress => '127.0.0.1',
    RemotePort    => $server_port,
    ServerInput   => \&server_input,
    Connected     => \&connected,
);

# Shared state for the handlers: connected()/request_edit() store the POE
# kernel and heap here; server_input() counts processed EDIT lines.
my ( $heap, $kernel );
my $number_of_edits = 0;

POE::Kernel->run();
exit 0;
# ServerInput handler of the TCP client: processes one line received from
# the LinkWatcher server.
#
# Recognised input:
#   EDIT [[page]] [[lang:User:name]] http://diff-url size
#       Work out which http:// links the edit added and, if there are any,
#       answer with
#         PARSED [[page]] diff-url size [[lang:User:name]] |link link ...|
#       then ask for more work with "REQUEST".
#   NOEDIT
#       Nothing to do right now: wait one second, then send "REQUEST".
# Any other input is ignored (no reply is sent).
#
# After more than 50 EDIT lines the client stops requesting work and shuts
# the connection down; the script then ends through the "exit 0" that
# follows POE::Kernel->run().
#
# Reads the file-level $diffFetcher, $editor and %settings, and updates
# $number_of_edits.
sub server_input {
    my ( $heap, $kernel, $input ) = @_[ HEAP, KERNEL, ARG0 ];

    my $edit_limit = 50;

    # A shutdown is already pending once the limit has been passed; ignore
    # anything that still arrives instead of writing to a closing connection.
    return if $number_of_edits > $edit_limit;

    if ( $input =~ m{EDIT \[\[(.+)\]\] \[\[(.+):User:(.+?)\]\] (http:\/\/.+) (.+)} ) {
        my ( $pagename, $lang, $username, $diffurl, $size ) = ( $1, $2, $3, $4, $5 );
        $number_of_edits++;

        my $added_text   = "";
        my $removed_text = "";

        if ( $diffurl =~ m/index\.php/ ) {

            # Ordinary diff: fetch the rendered diff table and collect the
            # text of the "added" and "deleted" cells separately.
            my $render_url   = "$diffurl&diffonly=1&action=render";
            my $diff_content = $diffFetcher->get($render_url)->content;
            print("$diff_content\n") if $settings{'debug'};

            my @added_cells =
                $diff_content =~ m/<td class=.diff-addedline.><div>(.*?)<\/div><\/td>/sg;
            my @removed_cells =
                $diff_content =~ m/<td class=.diff-deletedline.><div>(.*?)<\/div><\/td>/sg;

            $added_text   = lc( _strip_diff_markup( join( ' ', @added_cells ) ) );
            $removed_text = lc( _strip_diff_markup( join( ' ', @removed_cells ) ) );
            print("Added data: $added_text\n") if $settings{'debug'};
        }
        else {

            # No index.php in the URL: there is no diff to fetch, so the
            # whole current page text counts as "added".
            my $page_text = $editor->get_text($pagename);
            $added_text = lc($page_text);
        }

        # In void context decode_entities() decodes its arguments in place.
        decode_entities($added_text);
        decode_entities($removed_text);

        my @added_links   = _extract_http_links($added_text);
        my @removed_links = _extract_http_links($removed_text);

        if (@added_links) {
            if (@removed_links) {
                print("----\nDIFF $diffurl ".join(" ",@added_links)." - ".join(" ",@removed_links)."\n----\n") if $settings{'debug'};
            }

            # A link that also occurs on the removed side was only moved or
            # re-flowed by the edit, not newly added.
            my @really_added_links = _links_not_in( \@added_links, \@removed_links );
            print("DIFF $diffurl ".join(" ",@really_added_links)."\n----\n") if $settings{'debug'};

            if (@really_added_links) {
                my $message = "PARSED [[$pagename]] $diffurl $size [[$lang:User:$username]] |" . join( " ", @really_added_links ) . "|";
                $heap->{server}->put($message);
            }
        }

        if ( $number_of_edits > $edit_limit ) {

            # put() only queues output, so exiting right here could drop the
            # PARSED line above. Ask the TCP client component to shut down
            # instead: it disconnects once pending output has been flushed,
            # POE::Kernel->run() then returns and the script exits. No
            # further REQUEST is sent, so the server does not hand an edit
            # to a client that is going away.
            $kernel->yield("shutdown");
            return;
        }

        $heap->{server}->put("REQUEST");
    }
    elsif ( $input =~ m{NOEDIT} ) {
        sleep 1;
        $heap->{server}->put("REQUEST");
    }
    return;
}

# Remove the inline change markers (<span>/<ins>/<del> with the diffchange
# classes, and their closing tags) that the diff renderer wraps around
# changed words, so URLs interrupted by a marker are seen in one piece.
sub _strip_diff_markup {
    my ($text) = @_;
    for my $tag (qw(span ins del)) {
        $text =~ s/<$tag class=.diffchange diffchange-inline.>//g;
        $text =~ s/<$tag class=.diffchange.>//g;
        $text =~ s/<\/$tag>//gi;
    }
    return $text;
}

# Return every http:// URL found in $text, in order of appearance
# (duplicates are kept). Must be called in list context.
sub _extract_http_links {
    my ($text) = @_;
    return $text =~ m{(http://[^\s\]\[\{\}\\\|^~`<>]+)}sgi;
}

# Return the elements of @$candidates that do not occur in @$known,
# preserving order and duplicates.
sub _links_not_in {
    my ( $candidates, $known ) = @_;
    my %is_known = map { $_ => 1 } @{$known};
    return grep { !$is_known{$_} } @{$candidates};
}
# Connected handler of the TCP client: remembers the POE kernel and heap in
# the file-level $kernel/$heap variables and sends the first "REQUEST" line
# to the server.
sub connected {
    $kernel = $_[KERNEL];
    $heap   = $_[HEAP];
    $heap->{server}->put("REQUEST");
}
# POE-style handler that stores the kernel and heap in the file-level
# $kernel/$heap variables and sends a "REQUEST" line to the server.
# It is not registered with the TCP client in the visible code.
sub request_edit {
    $kernel = $_[KERNEL];
    $heap   = $_[HEAP];
    $heap->{server}->put("REQUEST");
}