User:HBC Searchbot/Source
Appearance
See also templates.pl
use strict;
use Data::Dumper;
use HTTP::Request::Common;
use LWP::UserAgent;
use URI::Escape;
use XML::Simple;
# Main script: collect every revision of the talk page, then look for each
# template's search text in each revision and report which revisions carry
# which matched line.
my %revisions = get_complete_history('User talk:HighInBC');

# templates.pl is Perl code that is eval'd below with %templates in scope; it
# is expected to fill that hash with "template name => text to search for".
# NOTE(review): this string-evals a local file, so templates.pl must be trusted.
open(my $pl, '<', 'templates.pl') or die "Cannot open templates.pl: $!\n";
my $templates = do { local $/; <$pl> };
close($pl);
my(%templates);
eval($templates);
die "templates.pl failed to load: $@" if ($@);
die "templates.pl did not define any templates\n" unless (%templates);

my $searches = 0;
warn ("Checking ".scalar(keys(%revisions))." revisions.\n");

# %warnings is keyed by the matched line of text; each entry records the
# template that matched, its search text and every revision id containing it.
my(%warnings);
foreach my $revision (sort {$a <=> $b} keys(%revisions))
{
    my $rh_rev = $revisions{$revision};

    # The revision text normally sits under {'text'}{'content'}; fall back to
    # an empty string when the node is missing or has an unexpected shape.
    my $text_node = (ref($rh_rev) eq 'HASH') ? ${$rh_rev}{'text'} : undef;
    my $text = (ref($text_node) eq 'HASH') ? ${$text_node}{'content'} : $text_node;
    $text = '' unless (defined($text) && !ref($text));

    # Lower-case the revision once instead of once per template.
    my $lc_text = lc($text);

    foreach my $template (sort keys(%templates))
    {
        $searches++;
        my $needle = $templates{$template};

        # An empty search text would "match" at offset 0 of every revision.
        next unless (defined($needle) && length($needle));

        # index() returns -1 when nothing is found; 0 is a genuine match at
        # the very start of the text and must not be discarded.
        my $start = index($lc_text, lc($needle));
        next if ($start < 0);

        # Keep the text from the match up to (not including) the end of its
        # line, or to the end of the revision when no newline follows.
        my $eol = index($text, "\n", $start);
        my $string = ($eol < 0)
            ? substr($text, $start)
            : substr($text, $start, $eol - $start);

        $warnings{$string}{'template'} = $template;
        $warnings{$string}{'regex'} = $needle;
        push(@{$warnings{$string}{'revisions'}}, $revision);
    }
}
warn "$searches searches performed.\n";
warn Dumper(\%warnings);
# get_complete_history($page)
#
# Returns, as a flat key/value list suitable for assigning to a hash, every
# revision of the wiki page titled $page that could be collected, keyed by
# revision id. Each value is the revision structure produced by XMLin().
#
# Revisions already stored in cache/<uri-escaped page title> are loaded first;
# the remainder is downloaded from Special:Export in batches, starting after
# the timestamp of the newest cached revision. When new revisions were
# downloaded the cache file is rewritten.
#
# Dies if the export keeps failing or returns unparsable XML; whatever was
# downloaded up to that point is written to the cache first so the next run
# can resume from there.
sub get_complete_history
{
    my $page = shift;

    my $batch_limit  = 100;   # revisions requested per export hit
    my $pause        = 5;     # seconds to wait between hits and retries
    my $max_attempts = 5;     # consecutive failed hits tolerated per batch

    unless (-d('cache'))
    {
        mkdir('cache') or warn "Cannot create cache directory: $!\n";
    }
    my $fname = 'cache/'.uri_escape($page);

    my(%revisions);
    my $offset = '0';
    if (-f($fname))
    {
        warn "found $fname in cache, loading\n";
        my $cached = _load_revision_cache($fname);
        if ($cached)
        {
            %revisions = %{$cached};

            # Resume from the timestamp of the highest revision id, if any.
            my(@ids) = sort {$a <=> $b} keys(%revisions);
            my $newest = @ids ? $revisions{$ids[-1]} : undef;
            if (ref($newest) eq 'HASH' && defined(${$newest}{'timestamp'}))
            {
                $offset = ${$newest}{'timestamp'};
            }
            warn (scalar(keys(%revisions))." loaded from cache.\n");
        }
    }
    else
    {
        warn "No cache, starting fresh.\n";
    }

    my $ua = LWP::UserAgent->new('agent' => 'HighInBC warning checker .01b');
    my $total = 0;
    my $failure;

    while (1)
    {
        warn "Downloading $batch_limit revisions.\n";
        my $current = _fetch_export_batch($ua, $page, $offset, $batch_limit, $max_attempts, $pause);
        unless (defined($current))
        {
            $failure = "Giving up on '$page' after $max_attempts failed export attempts\n";
            last;
        }

        my $xml_data = eval { XMLin($current) };
        unless (ref($xml_data) eq 'HASH')
        {
            $failure = "Could not parse export XML for '$page': ".($@ || "unexpected structure\n");
            last;
        }

        # Normalise the revisions of this batch into an id => revision map.
        # XMLin leaves a lone <revision> as a plain hash of its fields, while
        # several revisions arrive already folded into a hash keyed by id.
        my $page_node = ${$xml_data}{'page'};
        my $rev_node = (ref($page_node) eq 'HASH') ? ${$page_node}{'revision'} : undef;
        my(%batch);
        if (ref($rev_node) eq 'HASH')
        {
            if (exists(${$rev_node}{'timestamp'}))
            {
                my $id = ${$rev_node}{'id'};
                $batch{$id} = $rev_node if (defined($id) && !ref($id));
            }
            else
            {
                %batch = %{$rev_node};
            }
        }
        elsif (ref($rev_node) eq 'ARRAY')
        {
            # Unfolded list: index it by id ourselves.
            foreach my $rev (@{$rev_node})
            {
                next unless (ref($rev) eq 'HASH');
                my $id = ${$rev}{'id'};
                $batch{$id} = $rev if (defined($id) && !ref($id));
            }
        }

        # Count only revisions not seen before; the two counters are bumped
        # separately so that the first new revision is counted in both.
        my $count = 0;
        foreach my $id (sort {$a <=> $b} keys(%batch))
        {
            unless (exists($revisions{$id}))
            {
                $count++;
                $total++;
            }
            $revisions{$id} = $batch{$id};
        }
        warn "Got $count revisions\n";

        # The next batch starts after the last timestamp in this response.
        # Leave the offset untouched when the response contains none.
        my $pos = rindex($current, '<timestamp>');
        if ($pos >= 0 && substr($current, $pos, 64) =~ m|^<timestamp>(.+?)</timestamp>|)
        {
            $offset = $1;
        }

        # A full batch that brought something new means there may be more.
        # Requiring progress prevents looping forever on a repeated batch.
        last unless (scalar(keys(%batch)) >= $batch_limit && $count > 0);

        warn "Still more past $offset to get, waiting $pause seconds between hits\n";
        sleep($pause);
    }

    if ($total > 0)
    {
        warn "Saving cache...\n";
        if (open(my $out, '>', $fname))
        {
            print {$out} Dumper(\%revisions);
            close($out) or warn "Error while writing $fname: $!\n";
            warn "done.\n";
        }
        else
        {
            # The revisions are still returned; only the cache is lost.
            warn "Cannot write $fname: $!\n";
        }
    }

    die $failure if (defined($failure));
    return %revisions;
}

# Reads a cache file written by get_complete_history and returns the stored
# hash reference, or nothing when the file is unreadable or not a valid cache.
sub _load_revision_cache
{
    my ($fname) = @_;

    my $in;
    unless (open($in, '<', $fname))
    {
        warn "Cannot read $fname: $!\n";
        return;
    }
    my $code = do { local $/; <$in> };
    close($in);

    # NOTE(review): the cache is Data::Dumper output that is string-eval'd,
    # so anything able to write to the cache directory can run code here.
    my $VAR1;
    eval($code) if (defined($code));
    if ($@ || ref($VAR1) ne 'HASH')
    {
        warn "Ignoring unusable cache $fname\n";
        return;
    }
    return $VAR1;
}

# Posts one Special:Export request for up to $limit revisions of $page after
# $offset. Returns the response body when it starts with "<mediawiki",
# otherwise retries up to $max_attempts times, sleeping $pause seconds between
# attempts, and returns nothing when every attempt failed.
sub _fetch_export_batch
{
    my ($ua, $page, $offset, $limit, $max_attempts, $pause) = @_;

    # NOTE(review): Wikimedia sites are believed to redirect plain HTTP to
    # HTTPS nowadays - confirm this endpoint still accepts an HTTP POST.
    my $url = 'http://en.wikipedia.org/w/index.php'.'?title=Special:Export';

    foreach my $attempt (1 .. $max_attempts)
    {
        my $res = $ua->request(
            POST($url,
                Content_Type => 'application/x-www-form-urlencoded',
                Content => [(
                    'pages' => $page,
                    'action' => 'submit',
                    'submit' => 'Export',
                    'limit' => $limit,
                    'offset' => $offset
                )]
            )
        );
        my $body = $res->content();
        return $body if (defined($body) && $body =~ m|^<mediawiki|);

        warn "Export request failed (".$res->status_line()."), attempt $attempt of $max_attempts\n";
        sleep($pause) if ($attempt < $max_attempts);
    }
    return;
}