User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm: Difference between revisions
Appearance
Content deleted Content added
Updating published sources: General: * Have the bot script watch for changes and automatically re-exec itself. SourceUploader: * Change things around so task metadata is stored with the task. |
Updating published sources: General: * Wikipedia:Creating a bot#General guidelines for running a bot has been updated to state that the "10 reads per minute" limit only applies if maxlag is not used. Since AnomieBOT uses maxlag implicitly on all |
||
Line 70: | Line 70: | ||
$api->task('OrphanReferenceFixer'); |
$api->task('OrphanReferenceFixer'); |
||
$api->read_throttle( |
$api->read_throttle(); |
||
$api->edit_throttle(10); |
$api->edit_throttle(10); |
||
Line 91: | Line 91: | ||
gcmtitle => 'Category:Pages with incorrect ref formatting', |
gcmtitle => 'Category:Pages with incorrect ref formatting', |
||
gcmnamespace => '0', |
gcmnamespace => '0', |
||
# Using 10 instead of max until approved and initial run through the |
|||
# cat is completed, so we can process the most likely to be fixed |
|||
# sooner (remember, perl randomizes the order of hash values). |
|||
gcmlimit => '10', |
gcmlimit => '10', |
||
gcmsort => 'timestamp', |
gcmsort => 'timestamp', |
||
Line 222: | Line 225: | ||
prop => 'revisions', |
prop => 'revisions', |
||
rvprop => 'ids|timestamp|content', |
rvprop => 'ids|timestamp|content', |
||
# Using 1 instead of max because we're downloading the content |
|||
# of each revision |
|||
rvlimit => 1 |
rvlimit => 1 |
||
); |
); |
Revision as of 02:03, 25 August 2008
Pending approval Wikipedia:Bots/Requests for approval/AnomieBOT |
package tasks::OrphanReferenceFixer;
=pod
=begin metadata
Task: OrphanReferenceFixer
BRFA: Wikipedia:Bots/Requests for approval/AnomieBOT
Status: BRFA
Rate: Max 6 edits/minute
Applies the following corrections to pages in [[:Category:Pages with incorrect
ref formatting]]. This is often enough to get them removed from the category.
<div style="font-size:90%">
* <nowiki><ref …></ref> → <ref …/></nowiki>
* <nowiki><references …></references> → <references …/></nowiki>
* Remove <nowiki><ref …/></nowiki> without <code>name</code>
* Strip parameters other than <code>name</code> and <code>group</code> from <nowiki><ref> and <references></nowiki>
* Rename refs with numeric names
* Copy content for orphaned named refs from past page revisions
</div>
=end metadata
=cut
use strict;
use AnomieBOT::Task;
use POSIX qw/strftime/;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;
use Storable qw/freeze thaw dclone/;
use Data::Dumper;
# Constructor. Delegates to the parent class constructor and then sets up the
# two pieces of in-memory state this task keeps between calls to run().
sub new {
    my ($class) = @_;
    my $self = $class->SUPER::new();

    # The "skip" list keeps one long page from monopolizing the bot's time:
    # a page that could not be finished in one pass is recorded here
    # (pageid => revid) and passed over on subsequent runs until all other
    # pages in the category have been processed.
    $self->{'skip'} = {};

    # Time of the last scan of the datastore for obsolete entries; 0 means
    # "never", so the first run always performs a cleanup.
    $self->{'lastcleanup'} = 0;

    return bless $self, $class;
}
=pod
=for notice
Pending approval<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT]]
=cut
# Approval flag for this task. Hard-coded to false (0); the accompanying POD
# notice describes the task as pending approval.
sub approved {
    my $is_approved = 0;
    return $is_approved;
}
# Main entry point of the task.
#
# Walks the members of "Category:Pages with incorrect ref formatting"
# (namespace 0, newest first), and for each page:
#   * applies the simple textual fixes (empty <ref></ref> pairs, unnamed
#     self-closed refs, <references>...</references> pairs, stray parameters);
#   * collects named refs that have no content ("orphans") via _get_refs();
#   * walks back through the page history one revision at a time looking for
#     a revision that still has content for each orphan;
#   * saves the page if the text changed, and records progress in the
#     persistent datastore so an unfinished history scan can be resumed.
#
# Parameters:
#   $self - this task object.
#   $api  - the bot API object (provides query/edittoken/edit and the
#           fetch/store/delete/nextkey datastore calls used below).
#
# Returns a number; judging by the comments at the return sites this is the
# number of seconds to wait before run() is called again (NOTE(review):
# confirm against the caller):
#   60  - the category listing could not be retrieved;
#   300 - the edit token request reported the task as shut off;
#   0   - the 10 minute time budget was used up;
#   600 - the whole category was processed.
sub run {
    my ($self, $api)=@_;

    $api->task('OrphanReferenceFixer');
    $api->read_throttle(0);
    $api->edit_throttle(10);

    if($self->{'lastcleanup'}+86400<time()){
        # Cleanup obsolete entries in the data store: anything not touched
        # in the last 30 days. Keys are collected first and deleted only
        # after the iteration has finished, so the deletions cannot disturb
        # the nextkey iterator.
        my @to_delete=();
        my $exp=time()-86400*30;
        while(my $k=$api->nextkey){
            my $x=$api->fetch($k);
            push @to_delete, $k if $x->{'touched'}<$exp;
        }
        foreach my $k (@to_delete){
            $api->delete($k);
        }

        # Remember when we cleaned up, so this really runs at most once a
        # day instead of on every call.
        $self->{'lastcleanup'}=time();
    }

    # Spend a max of 10 minutes on this task before restarting
    my $endtime=time()+600;

    # First, get the list of pages to check
    my %q=(
        generator    => 'categorymembers',
        gcmtitle     => 'Category:Pages with incorrect ref formatting',
        gcmnamespace => '0',
        # Using 10 instead of max until approved and initial run through the
        # cat is completed, so we can process the most likely to be fixed
        # sooner (remember, perl randomizes the order of hash values).
        gcmlimit     => '10',
        gcmsort      => 'timestamp',
        gcmdir       => 'desc',
        prop         => 'info'
    );
    my $more=1;
    while($more){
        # Work from a snapshot of the skip list: the live list may be reset
        # or extended while this batch is being processed.
        my %skip=%{$self->{'skip'}};

        my $res=$api->query(%q);
        if($res->{'code'} ne 'success'){
            $self->warn("Failed to retrieve category list: ".$res->{'error'}."\n");
            return 60;
        }
        if(exists($res->{'query-continue'})){
            $q{'gcmstart'}=$res->{'query-continue'}{'categorymembers'}{'gcmstart'};
        } elsif(keys %skip){
            # We hit the end of the cat, empty the skip list and start over
            $self->{'skip'}={};
            delete $q{'gcmstart'};
        } else {
            # End of cat and nothing skipped, we must be done once we finish
            # the pages in this last response.
            $more=0;
        }

        # Process found pages
        PAGE: foreach my $page (values %{$res->{'query'}{'pages'}}){
            my $title=$page->{'title'};
            $self->warn("Checking references in $title\n");

            # WTF?
            if(exists($page->{'missing'})){
                $self->warn("$title is missing? WTF?\n");
                next PAGE;
            }

            # Don't try fixing any page touched less than 5 minutes ago, to
            # give the real editor a chance to fix it.
            my $lastmod=$self->ISO2timestamp($page->{'touched'});
            if(time()-$lastmod<300){
                $self->warn("$title touched too recently, leave it for later\n");
                next PAGE;
            }

            # In the skip list?
            if(exists($skip{$page->{'pageid'}}) && $skip{$page->{'pageid'}} eq $page->{'lastrevid'}){
                $self->warn("Skipping $title for now to let other pages get a chance\n");
                next PAGE;
            }

            # Did we check this revision already? The datastore record holds
            #   revid    - the revision the record describes
            #   continue - revision id to resume the history scan from
            #              ('' once the scan is complete)
            #   unfound  - frozen [group,name] keys known not to be in history
            my $checked=$api->fetch($page->{'pageid'});
            if(!defined($checked)){
                # No, never saw it before
                $checked={
                    revid    => $page->{'lastrevid'},
                    continue => $page->{'lastrevid'},
                    unfound  => []
                };
            } elsif($checked->{'revid'} ne $page->{'lastrevid'}){
                # Saw an old revision, rescan this new one
                $checked->{'revid'}=$page->{'lastrevid'};
                $checked->{'continue'}=$page->{'lastrevid'};
            } elsif($checked->{'continue'} ne ''){
                # In the middle of checking this revision
            } else {
                # Yes, we (supposedly) completed this one
                $self->warn("Revision ".$page->{'lastrevid'}." of $title was already checked\n");
                $checked->{'touched'}=time();
                $api->store($page->{'pageid'}, $checked);
                next PAGE;
            }

            # Ok, check the page
            my $tok=$api->edittoken($title);
            if($tok->{'code'} eq 'shutoff'){
                $self->warn("Task disabled: ".$tok->{'content'}."\n");
                return 300;
            }
            if($tok->{'code'} ne 'success'){
                $self->warn("Failed to get edit token for $title: ".$tok->{'error'}."\n");
                next PAGE;
            }
            next PAGE if exists($tok->{'missing'});
            if($tok->{'lastrevid'} ne $checked->{'revid'}){
                # Someone edited in between loading the cat and getting the
                # token. We'll catch the new revision next time around.
                $self->warn("$title was edited since cat list was loaded, abort\n");
                next PAGE;
            }

            # Get page text, and strip out <nowiki>s
            my $intxt=$tok->{'revisions'}[0]{'*'};
            my ($outtxt,$nowiki)=$self->strip_nowiki($intxt);

            # First, fix obvious errors.
            $outtxt=~s!<ref((?:\s*[^>]*)?)></ref>!<ref$1/>!oig;
            $outtxt=~s!<ref\s*/>!!oig;
            $outtxt=~s!<references((?:\s+[^>]*)?)>.*?</references>!<references$1/>!oigs;
            $outtxt=~s!(<references)((?:\s+[^>]+)?)/>! $1._filter_params($2,'group').'/>' !oige;

            # Find references currently in the article, and build list of
            # replacements to be applied.
            my @replacements=();
            my %refs=$self->_get_refs($outtxt, \@replacements);

            # Any orphaned refs? Keys of %needed are frozen [group,name]
            # pairs, values are the wikitext of the orphan to be replaced.
            my @unfound=@{$checked->{'unfound'} || []};
            my %needed=();
            while(my ($g,$group_refs)=each(%refs)){
                while(my ($n,$v)=each(%$group_refs)){
                    my $x=freeze([$g,$n]);
                    if(exists($refs{$g}{$n}{'broken'})){
                        # Broken ref (contains "<ref"), just completely ignore
                        # it.
                    } elsif($v->{'type'} eq ''){
                        # Orphan found, mark as needed unless known to be
                        # unfindable.
                        $needed{$x}=$v->{'orig'}[0] unless grep { $_ eq $x } @unfound;
                    } else {
                        # Check if someone added a previously unfound ref
                        @unfound=grep { $_ ne $x } @unfound;
                    }
                }
            }

            # Setup for checking unfound refs
            my %rq=(
                pageids => $page->{'pageid'},
                prop    => 'revisions',
                rvprop  => 'ids|timestamp|content',
                # Using 1 instead of max because we're downloading the content
                # of each revision
                rvlimit => 1
            );
            if($checked->{'continue'} eq $checked->{'revid'}){
                # Don't bother getting content for the latest revision
                $rq{'rvprop'}='ids|timestamp';
            }
            my @found=();
            my $needed=scalar keys %needed;
            REVISION: while($needed>0 && $checked->{'continue'} ne ''){
                # We found some orphaned refs. Now we have to start going back
                # through the history to try to find the original text...
                $rq{'rvstartid'}=$checked->{'continue'};
                my $rres=$api->query(%rq);
                if($rres->{'code'} ne 'success'){
                    $self->warn("Failed to retrieve revision for $title: ".$rres->{'error'}."\n");
                    last REVISION;
                }
                if(exists($rres->{'query-continue'})){
                    $checked->{'continue'}=$rres->{'query-continue'}{'revisions'}{'rvstartid'};
                    $rq{'rvprop'}='ids|timestamp|content';
                } else {
                    # Reached the oldest revision
                    $checked->{'continue'}='';
                }
                my $r=$rres->{'query'}{'pages'}{$page->{'pageid'}}{'revisions'}[0];

                # The current revision is the one we are fixing; nothing to
                # learn from it.
                next REVISION if($r->{'revid'} eq $checked->{'revid'});

                # Get refs from this past revision, and see if any of them are
                # the ones we need.
                my %rrefs=$self->_get_refs($r->{'*'});
                foreach my $key (keys %needed){
                    my ($g,$n)=@{thaw($key)};
                    next if !exists($rrefs{$g}{$n});
                    next if $rrefs{$g}{$n}{'type'} eq '';
                    push @replacements, {
                        'orig' => $needed{$key},
                        'repl' => $rrefs{$g}{$n}{'repl'}
                    };
                    push @found, "\"$n\" from rev ".$r->{'revid'};
                    delete $needed{$key};
                    $needed--;
                }

                # If we've been at it long enough, exit the loop to give
                # another page a chance.
                last REVISION if time()>=$endtime;
            }

            # If we found all orphans, no need to continue next time.
            $checked->{'continue'}='' if $needed==0;

            # Process the list of replacements now. They are applied in
            # order, each to the first occurrence of its 'orig' text; later
            # entries may refer to text produced by earlier ones.
            foreach my $rep (@replacements){
                next unless defined($rep->{'orig'}) && length($rep->{'orig'});
                my $i=index($outtxt, $rep->{'orig'});
                # index() returns -1 when not found; offset 0 is a valid hit
                # at the very start of the text.
                substr($outtxt, $i, length($rep->{'orig'}))=$rep->{'repl'} if $i>=0;
            }

            # Done processing, put back the <nowiki>s now
            $outtxt=$self->replace_nowiki($outtxt, $nowiki);

            # Need to edit?
            if($outtxt ne $intxt){
                my $summary='Fixing reference errors';
                $summary.=' and rescuing orphaned refs ('.join('; ', @found).')' if @found;
                $self->warn("$summary in $title\n");
                $summary='Fixing reference errors and rescuing orphaned refs (too many to list)' if length($summary)>255;
                my $eres=$api->edit($tok, $outtxt, $summary, 0, 1);
                if($eres->{'code'} ne 'success'){
                    $self->warn("Write failed on $title: ".$eres->{'error'}."\n");
                    next PAGE;
                }
            } else {
                $self->warn("Nothing I can fix in $title\n");
            }

            # If we're not continuing next time, any refs that are still needed
            # are not in the article history at all. Record them so we don't
            # bother searching the whole history again next time someone edits
            # the page.
            if($checked->{'continue'} eq ''){
                push @unfound, keys %needed;
                $checked->{'unfound'}=\@unfound;
                $self->warn("Completed scanning $title revision ".$page->{'lastrevid'}."\n");
            } else {
                # If we are continuing, add the page to the "skip" list to let
                # other pages have a chance to be scanned.
                $self->warn("$title will be continued later\n");
                $self->{'skip'}{$page->{'pageid'}}=$page->{'lastrevid'};
            }

            # Ok, we successfully processed the page. Save the persistent data
            # now.
            $checked->{'title'}=$title; # for manual db editing
            $checked->{'touched'}=time();
            $api->store($page->{'pageid'}, $checked);

            # If we've been at it long enough, let another task have a go.
            return 0 if time()>=$endtime;
        }
    }

    # No more pages to check, try again in 10 minutes or so.
    return 600;
}
# Subroutine to filter a parameter list to contain only specified parameters.
#
# Parameters:
#   $in  - the raw attribute text of a tag, including its leading whitespace
#          (e.g. ' group="x" foo="y"').
#   ...  - the parameter names to keep (matched case-insensitively).
#
# Returns the kept parameters, each with its leading whitespace, in the order
# the names were given, followed by any trailing whitespace of $in. If that
# result is as long as $in (i.e. nothing was removed), $in itself is returned
# so the original text and ordering are left untouched.
sub _filter_params {
    my $in=shift;
    my $out='';
    foreach my $p (@_){
        # No /o here: the pattern interpolates $p and must be rebuilt for
        # every name, otherwise only the first name ever passed would be
        # honoured. \Q..\E keeps the name from being read as regex syntax.
        $out.=$1 if($in=~/(\s+\Q$p\E=(?:"[^\x22]*"|'[^\x27]*'|\S*))/i);
    }
    # Preserve trailing whitespace so e.g. '<references group="x" />' keeps
    # its spacing before the '/>'.
    $out.=$1 if($in=~/(\s+$)/);
    $out=$in if length($in)==length($out);
    return $out;
}
# Subroutine to get all the references in an article.
#
# Parameters:
#   $self         - this task object (supplies strip_nowiki, replace_nowiki
#                   and process_templates).
#   $text         - wikitext to scan.
#   $replacements - optional arrayref; fix-ups found while scanning are
#                   pushed onto it as { orig => old wikitext, repl => new
#                   wikitext }. They are meant to be applied in order: a
#                   later entry's 'orig' may be an earlier entry's 'repl'.
#
# Returns a hash (as a list) keyed by group name ('' when no group), then by
# ref name. Each entry is a hashref with:
#   orig    - arrayref of the wikitext of every occurrence of that ref
#   type    - '' if no occurrence with usable content was seen, 'ref' if the
#             content came from a <ref> tag, 'tag' if from {{#tag:ref}}
#   repl    - wikitext of the first occurrence with content (only set when
#             type is not '')
#   broken  - set to 1 if an occurrence's content itself contains "<ref"
#   content - always undef here
# Unnamed refs are not recorded.
sub _get_refs {
    my $self=shift;
    my ($text,$nowiki)=$self->strip_nowiki(shift);
    my $replacements=shift; $replacements=[] unless defined($replacements);
    my %refs=();

    # One timestamp for the whole scan, so every occurrence of the same
    # numeric name is renamed to the same new name even if the scan crosses
    # a second boundary.
    my $stamp=strftime('%Y%m%d%H%M%S', gmtime);

    # Find all ref tags. Each match contributes three list elements: the
    # whole tag, its attribute text, and its content (undef for <ref .../>).
    my @matches=($text=~m!(<ref((?:\s+[^\s=]+\s*=\s*(?:"[^\x22]*"|'[^\x27]*'|[^\x22\x27\s]*?))*)\s*(?:/>|>(.*?)</ref>))!igs);
    for(my $i=0; $i<@matches; $i+=3){
        # Group?
        my ($gg,$g);
        if($matches[$i+1]=~/(\s+group\s*=\s*"([^\x22]+)")/i ||
           $matches[$i+1]=~/(\s+group\s*=\s*'([^\x27]+)')/i ||
           $matches[$i+1]=~/(\s+group\s*=\s*([^\x22\x27\s]+))/i){
            $gg=$1; $g=$2;
        } else {
            $gg=''; $g='';
        }

        # Name?
        my ($nn, $n);
        if($matches[$i+1]=~/(\s+name\s*=\s*"([^\x22]+)")/i ||
           $matches[$i+1]=~/(\s+name\s*=\s*'([^\x27]+)')/i ||
           $matches[$i+1]=~/(\s+name\s*=\s*([^\x22\x27\s]+))/i){
            $nn=$1; $n=$2;
        } else {
            # We're not interested if it's unnamed. But strip it out if it's
            # unnamed and empty, because that's an error.
            if(!defined($matches[$i+2])){
                push @$replacements, {
                    'orig' => $self->replace_nowiki($matches[$i],$nowiki),
                    'repl' => ''
                };
            }
            next;
        }

        # Any parameters besides "name" and "group" cause errors, so replace
        # them if found.
        if($matches[$i+1] ne $nn.$gg && $matches[$i+1] ne $gg.$nn){
            my $old=$matches[$i];
            $matches[$i+1]=$gg.$nn;
            $matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
            push @$replacements, {
                'orig' => $self->replace_nowiki($old,$nowiki),
                'repl' => $self->replace_nowiki($matches[$i],$nowiki)
            };
        }

        # Integer names cause errors, so replace them if found.
        if($n=~/^\d+$/){
            my $x="renamed_from_".$n."_on_".$stamp;
            # Don't reuse a name that is already present in the text.
            next if index($text, $x)>=0;
            my $old=$matches[$i];
            $matches[$i+1]=~s/name\s*=\s*([\x22\x27]?)$n\1/name=$1$x$1/i;
            $matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
            push @$replacements, {
                'orig' => $self->replace_nowiki($old,$nowiki),
                'repl' => $self->replace_nowiki($matches[$i],$nowiki)
            };
            $n=$x;
        }

        # Save detected reference
        $refs{$g}={} unless exists($refs{$g});
        if(!exists($refs{$g}{$n})){
            $refs{$g}{$n}={
                orig    => [],
                type    => '',
                content => undef
            };
        }
        push @{$refs{$g}{$n}{'orig'}}, $self->replace_nowiki($matches[$i+0],$nowiki);
        if(defined($matches[$i+2]) && $matches[$i+2]=~/<ref[\s>]/){
            # Reference contains "<ref", so probably someone forgot a </ref>
            # somewhere (and then that's probably how it got "orphaned"). To be
            # safe, don't use it.
            $matches[$i+2]=undef;
            $refs{$g}{$n}{'broken'}=1;
        }
        if(defined($matches[$i+2]) && $matches[$i+2]=~/^\s*$/){
            # Apparently, some people really do this. Don't use empty refs.
            $matches[$i+2]=undef;
        }
        if($refs{$g}{$n}{'type'} eq '' && defined($matches[$i+2])){
            $refs{$g}{$n}{'type'}='ref';
            $refs{$g}{$n}{'repl'}=$self->replace_nowiki($matches[$i+0],$nowiki);
        }
    }

    # Any #tag references? Return now if not.
    return %refs unless $text=~/\x7b\x7b\s*#tag:\s*ref\s*[|\x7d]/;

    # Darn. Now we have to parse through the page and find all the #tag:refs
    # too.
    # NOTE(review): the callback is assumed to receive (template name,
    # arrayref of parameters, template wikitext), and an undef return is
    # assumed to mean "leave the template as it is" - confirm against
    # process_templates.
    $self->process_templates($text, sub {
        my $name=shift;
        my @params=@{shift()};
        my $orig=$self->replace_nowiki(shift,$nowiki);

        # Only {{#tag:ref|...}} is of interest; ignore every other template.
        return undef unless $name=~/^\s*#tag:\s*ref\s*$/is;

        my $g='';
        my $n=undef;
        my $bad=0;

        # First parameter is the ref content; treat a missing one as empty.
        my $c=shift(@params);
        $c=defined($c) ? $self->replace_nowiki($c,$nowiki) : '';
        foreach (@params){
            if(/^\s*group\s*=\s*([\x22\x27]?)([^\x22\x27]+?)\1\s*$/i){
                $g=$self->replace_nowiki($2,$nowiki);
            } elsif(/^\s*name\s*=\s*([\x22\x27]?)([^\x22\x27]+?)\1\s*$/i){
                $n=$self->replace_nowiki($2,$nowiki);
            } else {
                $bad=1;
            }
        }

        if(!defined($n)){
            # We're not interested if it's unnamed. But strip it out if
            # it's unnamed and empty, because that's an error.
            if($c eq ''){
                push @$replacements, {
                    'orig' => $orig,
                    'repl' => ''
                };
            }
            return undef;
        }

        # If it had unrecognized parameters to the tag, strip them
        if($bad){
            my $old=$orig;
            $orig="\x7b\x7b#tag:ref|$c";
            $orig.="|name=$n" if defined($n);
            $orig.="|group=$g" if $g ne '';
            $orig.="\x7d\x7d";
            push @$replacements, {
                'orig' => $old,
                'repl' => $orig
            };
        }

        # Integer names cause errors, so replace them if found.
        if($n=~/^\d+$/){
            my $x="renamed_from_".$n."_on_".$stamp;
            # We are inside a callback, not a loop: leave with return, not
            # next (which would jump into the caller's loop).
            return undef if index($text, $x)>=0;
            my $old=$orig;
            $orig="\x7b\x7b#tag:ref|$c|name=$x";
            $orig.="|group=$g" if $g ne '';
            $orig.="\x7d\x7d";
            $n=$x;
            push @$replacements, {
                'orig' => $old,
                'repl' => $orig
            };
        }

        # Save detected reference
        $refs{$g}={} unless exists($refs{$g});
        if(!exists($refs{$g}{$n})){
            $refs{$g}{$n}={
                orig    => [],
                type    => '',
                content => undef
            };
        }
        if($c=~/^\s*$/){
            # Apparently, some people really do this. Don't use empty refs.
            $c='';
        }
        push @{$refs{$g}{$n}{'orig'}}, $orig;
        if($refs{$g}{$n}{'type'} eq '' && $c ne ''){
            $refs{$g}{$n}{'type'}='tag';
            $refs{$g}{$n}{'repl'}=$orig;
        }
        return undef;
    });

    return %refs;
}
1;