User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm: Difference between revisions

Content deleted Content added
Inline
Revision as of 02:03, 25 August 2008

Pending approval
Wikipedia:Bots/Requests for approval/AnomieBOT
package tasks::OrphanReferenceFixer;

=pod

=begin metadata

Task:   OrphanReferenceFixer
BRFA:   Wikipedia:Bots/Requests for approval/AnomieBOT
Status: BRFA
Rate:   Max 6 edits/minute

Applies the following corrections to pages in [[:Category:Pages with incorrect
ref formatting]]. This is often enough to get them removed from the category.
<div style="font-size:90%">
* <nowiki><ref&nbsp;…></ref> → <ref&nbsp;…/></nowiki>
* <nowiki><references&nbsp;…></references> → <references&nbsp;…/></nowiki>
* Remove <nowiki><ref&nbsp;…/></nowiki> without <code>name</code>
* Strip parameters other than <code>name</code> and <code>group</code> from <nowiki><ref> and <references></nowiki>
* Rename refs with numeric names
* Copy content for orphaned named refs from past page revisions
</div>

=end metadata

=cut

use strict;

# Base class for bot tasks; this package inherits from it via @ISA below.
use AnomieBOT::Task;
# strftime is used to build timestamped replacement names for numeric refs.
use POSIX qw/strftime/;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;

# freeze/thaw serialise [group, name] pairs so they can be used as hash keys
# and stored in the persistent "unfound" list.
use Storable qw/freeze thaw dclone/;
# NOTE(review): Data::Dumper is not referenced in the visible code; presumably
# kept for debugging.
use Data::Dumper;

# Constructor. Builds the object through the parent class constructor, then
# initialises the two pieces of per-process state used by run():
#
#   skip        - hash of pageid => revid for pages whose history scan was
#                 interrupted. It keeps one long page from monopolizing the
#                 bot's time: such a page is skipped on subsequent passes
#                 until the rest of the category has been processed.
#   lastcleanup - time of the last scan of the datastore for obsolete
#                 entries (0 means "never").
#
# Returns the new object, blessed into the requested class.
sub new {
    my ($class)=@_;
    my $self=$class->SUPER::new();

    @{$self}{'skip','lastcleanup'}=({}, 0);

    return bless($self, $class);
}


=pod

=for notice 
Pending approval<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT]]

=cut

# Approval flag for this task. Always returns 0; the accompanying notice
# says the task is still pending approval.
sub approved {
    my $is_approved=0;
    return $is_approved;
}

# Main entry point of the task.
#
# Walks the members of "Category:Pages with incorrect ref formatting" (main
# namespace only), and for each page:
#   1. applies the simple textual fixes (empty <ref>, <references> with
#      content, stray parameters),
#   2. collects the named refs with _get_refs() and, for refs that are used
#      but never defined, walks back through the page history one revision
#      at a time looking for a definition to copy,
#   3. saves the page if anything changed, and
#   4. records its progress in the persistent store (keyed by page id) so an
#      unchanged revision is not rescanned and an interrupted history scan
#      can be resumed.
#
# Parameters:
#   $self - the task object ('skip' and 'lastcleanup' are read and updated)
#   $api  - the bot API object used for queries, edits and the datastore
#
# Returns a number: 60 if the category query failed, 300 if the task was shut
# off, 0 once the 10 minute time budget is used up, and 600 when the whole
# category has been processed. NOTE(review): presumably the caller treats
# this as the number of seconds to wait before calling run() again - confirm
# against the task runner.
sub run {
    my ($self, $api)=@_;

    $api->task('OrphanReferenceFixer');
    $api->read_throttle(0);
    $api->edit_throttle(10);

    if($self->{'lastcleanup'}+86400<time()){
        # Cleanup obsolete entries in the data store: anything not touched
        # for 30 days. The keys are collected first and deleted afterwards so
        # the store is never modified in the middle of the key iteration.
        my @to_delete=();
        my $exp=time()-86400*30;
        while(my $k=$api->nextkey){
            my $x=$api->fetch($k);
            push @to_delete, $k if $x->{'touched'}<$exp;
        }
        foreach my $k (@to_delete){
            $api->delete($k);
        }

        # Remember when we did this, otherwise the cleanup would run on every
        # single call instead of once a day.
        $self->{'lastcleanup'}=time();
    }

    # Spend a max of 10 minutes on this task before restarting
    my $endtime=time()+600;

    # First, get the list of pages to check
    my %q=(
        generator    => 'categorymembers',
        gcmtitle     => 'Category:Pages with incorrect ref formatting',
        gcmnamespace => '0',
        # Using 10 instead of max until approved and initial run through the
        # cat is completed, so we can process the most likely to be fixed
        # sooner (remember, perl randomizes the order of hash values).
        gcmlimit     => '10',
        gcmsort      => 'timestamp',
        gcmdir       => 'desc',
        prop         => 'info'
    );
    my $more=1;
    while($more){
        my %skip=%{$self->{'skip'}};
        my $res=$api->query(%q);
        if($res->{'code'} ne 'success'){
            $self->warn("Failed to retrieve category list: ".$res->{'error'}."\n");
            return 60;
        }
        if(exists($res->{'query-continue'})){
            $q{'gcmstart'}=$res->{'query-continue'}{'categorymembers'}{'gcmstart'};
        } elsif(keys %skip){
            # We hit the end of the cat, empty the skip list and start over
            $self->{'skip'}={};
            delete $q{'gcmstart'};
        } else {
            # End of cat and nothing skipped, we must be done once we finish
            # the pages in this last response.
            $more=0;
        }

        # Process found pages. A named lexical is used for the page record
        # (rather than $_) because it has to survive many API calls and
        # several nested loops.
        foreach my $page (values %{$res->{'query'}{'pages'}}){
            my $title=$page->{'title'};
            $self->warn("Checking references in $title\n");

            # WTF?
            if(exists($page->{'missing'})){
                $self->warn("$title is missing? WTF?\n");
                next;
            }

            my $pageid=$page->{'pageid'};
            my $lastrevid=$page->{'lastrevid'};

            # Don't try fixing any page touched less than 5 minutes ago, to
            # give the real editor a chance to fix it.
            my $lastmod=$self->ISO2timestamp($page->{'touched'});
            if(time()-$lastmod<300){
                $self->warn("$title touched too recently, leave it for later\n");
                next;
            }

            # In the skip list?
            if(exists($skip{$pageid}) && $skip{$pageid} eq $lastrevid){
                $self->warn("Skipping $title for now to let other pages get a chance\n");
                next;
            }

            # Did we check this revision already?
            my $checked=$api->fetch($pageid);
            if(!defined($checked)){
                # No, never saw it before
                $checked={
                    revid=>$lastrevid,
                    continue=>$lastrevid,
                    unfound=>[]
                };
            } elsif($checked->{'revid'} ne $lastrevid){
                # Saw an old revision, rescan this new one
                $checked->{'revid'}=$lastrevid;
                $checked->{'continue'}=$lastrevid;
            } elsif($checked->{'continue'} ne ''){
                # In the middle of checking this revision
            } else {
                # Yes, we (supposedly) completed this one. Refresh the
                # timestamp so the record is not expired by the cleanup.
                $self->warn("Revision ".$lastrevid." of $title was already checked\n");
                $checked->{'touched'}=time();
                $api->store($pageid, $checked);
                next;
            }

            # Ok, check the page
            my $tok=$api->edittoken($title);
            if($tok->{'code'} eq 'shutoff'){
                $self->warn("Task disabled: ".$tok->{'content'}."\n");
                return 300;
            }
            if($tok->{'code'} ne 'success'){
                $self->warn("Failed to get edit token for $title: ".$tok->{'error'}."\n");
                next;
            }
            next if exists($tok->{'missing'});
            if($tok->{'lastrevid'} ne $checked->{'revid'}){
                # Someone edited in between loading the cat and getting the
                # token. We'll catch the new revision next time around.
                $self->warn("$title was edited since cat list was loaded, abort\n");
                next;
            }

            # Get page text, and strip out <nowiki>s
            my $intxt=$tok->{'revisions'}[0]{'*'};
            my ($outtxt,$nowiki)=$self->strip_nowiki($intxt);

            # First, fix obvious errors.
            $outtxt=~s!<ref((?:\s*[^>]*)?)></ref>!<ref$1/>!ig;
            $outtxt=~s!<ref\s*/>!!ig;
            $outtxt=~s!<references((?:\s+[^>]*)?)>.*?</references>!<references$1/>!igs;
            $outtxt=~s!(<references)((?:\s+[^>]+)?)/>! $1._filter_params($2,'group').'/>' !ige;

            # Find references currently in the article, and build list of
            # replacements to be applied.
            my @replacements=();
            my %refs=$self->_get_refs($outtxt, \@replacements);

            # Any orphaned refs? Each ref is identified by the frozen
            # [group, name] pair, which is also what is kept in "unfound".
            my @unfound=@{$checked->{'unfound'}};
            my %needed=();
            while(my ($g,$refs)=each(%refs)){
                while(my ($n,$v)=each(%$refs)){
                    my $x=freeze([$g,$n]);
                    if(exists($refs{$g}{$n}{'broken'})){
                        # Broken ref (contains "<ref"), just completely ignore
                        # it.
                    } elsif($v->{'type'} eq ''){
                        # Orphan found, mark as needed unless known to be
                        # unfindable.
                        $needed{$x}=$v->{'orig'}[0] unless grep { $_ eq $x } @unfound;
                    } else {
                        # Check if someone added a previously unfound ref
                        @unfound=grep { $_ ne $x } @unfound;
                    }
                }
            }

            # Setup for checking unfound refs
            my %rq=(
                pageids => $pageid,
                prop      => 'revisions',
                rvprop    => 'ids|timestamp|content',
                # Using 1 instead of max because we're downloading the content
                # of each revision
                rvlimit   => 1
            );
            if($checked->{'continue'} eq $checked->{'revid'}){
                # Don't bother getting content for the latest revision
                $rq{'rvprop'}='ids|timestamp';
            }
            my @found=();
            my $needed=scalar keys %needed;
            while($needed>0 && $checked->{'continue'} ne ''){
                # We found some orphaned refs. Now we have to start going back
                # through the history to try to find the original text...
                $rq{'rvstartid'}=$checked->{'continue'};
                my $rres=$api->query(%rq);
                if($rres->{'code'} ne 'success'){
                    $self->warn("Failed to retrieve revision for $title: ".$rres->{'error'}."\n");
                    last;
                }
                if(exists($rres->{'query-continue'})){
                    $checked->{'continue'}=$rres->{'query-continue'}{'revisions'}{'rvstartid'};
                    $rq{'rvprop'}='ids|timestamp|content';
                } else {
                    # No older revisions left
                    $checked->{'continue'}='';
                }
                my $r=$rres->{'query'}{'pages'}{$pageid}{'revisions'}[0];
                next if($r->{'revid'} eq $checked->{'revid'});

                # Get refs from this past revision, and see if any of them are
                # the ones we need.
                my %rrefs=$self->_get_refs($r->{'*'});
                foreach my $key (keys %needed){
                    my ($g,$n)=@{thaw($key)};
                    next if !exists($rrefs{$g}{$n});
                    next if $rrefs{$g}{$n}{'type'} eq '';
                    push @replacements, {
                        'orig' => $needed{$key},
                        'repl' => $rrefs{$g}{$n}{'repl'}
                    };
                    push @found, "\"$n\" from rev ".$r->{'revid'};
                    delete $needed{$key};
                    $needed--;
                }

                # If we've been at it long enough, exit the loop to give
                # another page a chance.
                last if time()>=$endtime;
            }
            # If we found all orphans, no need to continue next time.
            $checked->{'continue'}='' if $needed==0;

            # Process the list of replacements now. They are applied in order,
            # each one to the first occurrence of its 'orig' text.
            foreach my $rep (@replacements){
                my $orig=$rep->{'orig'};
                next unless defined($orig) && length($orig);
                my $i=index($outtxt, $orig);
                # index() returns -1 for "not found"; 0 is a valid match at
                # the very start of the page and must be replaced too.
                substr($outtxt, $i, length($orig))=$rep->{'repl'} if $i>=0;
            }

            # Done processing, put back the <nowiki>s now
            $outtxt=$self->replace_nowiki($outtxt, $nowiki);

            # Need to edit?
            if($outtxt ne $intxt){
                my $summary='Fixing reference errors';
                $summary.=' and rescuing orphaned refs ('.join('; ', @found).')' if @found;
                $self->warn("$summary in $title\n");
                $summary='Fixing reference errors and rescuing orphaned refs (too many to list)' if length($summary)>255;
                my $r=$api->edit($tok, $outtxt, $summary, 0, 1);
                if($r->{'code'} ne 'success'){
                    # Nothing is stored in this case, so the page is simply
                    # processed again from the same point next time.
                    $self->warn("Write failed on $title: ".$r->{'error'}."\n");
                    next;
                }
            } else {
                $self->warn("Nothing I can fix in $title\n");
            }

            # If we're not continuing next time, any refs that are still needed
            # are not in the article history at all. Record them so we don't
            # bother searching the whole history again next time someone edits
            # the page.
            if($checked->{'continue'} eq ''){
                push @unfound, keys %needed;
                $checked->{'unfound'}=\@unfound;
                $self->warn("Completed scanning $title revision ".$lastrevid."\n");
            } else {
                # If we are continuing, add the page to the "skip" list to let
                # other pages have a chance to be scanned.
                $self->warn("$title will be continued later\n");
                $self->{'skip'}{$pageid}=$lastrevid;
            }

            # Ok, we successfully processed the page. Save the persistent data
            # now.
            $checked->{'title'}=$title; # for manual db editing
            $checked->{'touched'}=time();
            $api->store($pageid, $checked);

            # If we've been at it long enough, let another task have a go.
            return 0 if time()>=$endtime;
        }
    }

    # No more pages to check, try again in 10 minutes or so.
    return 600;
}

# Subroutine to filter a parameter list to contain only specified parameters.
# Filter an attribute string so that it contains only the specified
# parameters.
#
# Arguments:
#   $in   - the raw attribute text of a tag, including its leading
#           whitespace, e.g. ' group="x" foo="bar"'
#   @keep - the names of the parameters to keep (matched case-insensitively)
#
# Returns the kept parameters (each with its leading whitespace), in the
# order the names were given, followed by any trailing whitespace of $in. If
# nothing had to be removed, $in itself is returned so that the original
# order and spacing are left alone.
#
# A value may be double-quoted, single-quoted or unquoted, and whitespace is
# allowed around the "=", as in the patterns used by _get_refs.
sub _filter_params {
    my ($in, @keep)=@_;
    my $out='';

    foreach my $p (@keep){
        # No /o here: the pattern interpolates $p and so must be rebuilt for
        # every name, otherwise the first name ever seen would be used for
        # all later calls. \Q..\E keeps the name from being read as a regex.
        $out.=$1 if($in=~/(\s+\Q$p\E\s*=(?:\s*(?:"[^\x22]*"|'[^\x27]*'|\S+))?)/i);
    }
    $out.=$1 if($in=~/(\s+$)/);

    # Same length means nothing was dropped; keep the input untouched.
    $out=$in if length($in)==length($out);
    return $out;
}

# Subroutine to get all the references in an article.
# Get all the named references in a piece of wikitext.
#
# Arguments:
#   $self         - the task object (strip_nowiki, replace_nowiki and
#                   process_templates are called on it)
#   $text         - the wikitext to scan
#   $replacements - optional array ref. Hashes of the form
#                   { orig => TEXT, repl => TEXT } are pushed onto it for
#                   every fix found while scanning: unnamed empty refs
#                   (repl is ''), refs with parameters other than "name" and
#                   "group", and refs with purely numeric names. When
#                   omitted the fixes are collected and thrown away.
#
# Returns a hash (as a list) of group => { name => info }, the group being ''
# for refs without one. Each info hash contains:
#   orig    - array ref of the text of every occurrence of the ref (after
#             the fixes described above)
#   type    - '' if no occurrence with usable content was seen, 'ref' if the
#             content comes from a <ref> tag, 'tag' if it comes from a
#             {{#tag:ref}}
#   repl    - text of the first occurrence with usable content (only set
#             when type is not '')
#   broken  - set to 1 if the content of some occurrence contains another
#             "<ref", i.e. a closing tag is probably missing
#   content - always undef
sub _get_refs {
    my $self=shift;
    my ($text,$nowiki)=$self->strip_nowiki(shift);
    my $replacements=shift; $replacements=[] unless defined($replacements);
    my %refs=();

    # Find all ref tags. Every match contributes three elements to @matches:
    # the whole tag, its attribute text, and its content (undef for the
    # self-closing form).
    my @matches=($text=~m!(<ref((?:\s+[^\s=]+\s*=\s*(?:"[^\x22]*"|'[^\x27]*'|[^\x22\x27\s]*?))*)\s*(?:/>|>(.*?)</ref>))!igs);
    for(my $i=0; $i<@matches; $i+=3){
        # Group?
        my ($gg,$g);
        if($matches[$i+1]=~/(\s+group\s*=\s*"([^\x22]+)")/i ||
           $matches[$i+1]=~/(\s+group\s*=\s*'([^\x27]+)')/i ||
           $matches[$i+1]=~/(\s+group\s*=\s*([^\x22\x27\s]+))/i){
            $gg=$1; $g=$2;
        } else {
            $gg=''; $g='';
        }

        # Name?
        my ($nn, $n);
        if($matches[$i+1]=~/(\s+name\s*=\s*"([^\x22]+)")/i ||
           $matches[$i+1]=~/(\s+name\s*=\s*'([^\x27]+)')/i ||
           $matches[$i+1]=~/(\s+name\s*=\s*([^\x22\x27\s]+))/i){
            $nn=$1; $n=$2;
        } else {
            # We're not interested if it's unnamed. But strip it out if it's
            # unnamed and empty, because that's an error.
            if(!defined($matches[$i+2])){
                push @$replacements, {
                    'orig' => $self->replace_nowiki($matches[$i],$nowiki),
                    'repl' => ''
                };
            }
            next;
        }

        # Any parameters besides "name" and "group" cause errors, so replace
        # them if found.
        if($matches[$i+1] ne $nn.$gg && $matches[$i+1] ne $gg.$nn){
            my $old=$matches[$i];
            $matches[$i+1]=$gg.$nn;
            $matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
            push @$replacements, {
                'orig' => $self->replace_nowiki($old,$nowiki),
                'repl' => $self->replace_nowiki($matches[$i],$nowiki)
            };
        }

        # Integer names cause errors, so replace them if found.
        if($n=~/^\d+$/){
            my $x="renamed_from_".$n."_on_".strftime('%Y%m%d%H%M%S', gmtime);
            # Leave the ref alone if the new name is already in use.
            next if index($text, $x)>=0;
            my $old=$matches[$i];
            $matches[$i+1]=~s/name\s*=\s*([\x22\x27]?)$n\1/name=$1$x$1/i;
            $matches[$i]='<ref'.$matches[$i+1].(defined($matches[$i+2])?'>'.$matches[$i+2].'</ref>':'/>');
            push @$replacements, {
                'orig' => $self->replace_nowiki($old,$nowiki),
                'repl' => $self->replace_nowiki($matches[$i],$nowiki)
            };
            $n=$x;
        }

        # Save detected reference
        $refs{$g}={} unless exists($refs{$g});
        if(!exists($refs{$g}{$n})){
            $refs{$g}{$n}={
                orig => [],
                type => '',
                content => undef
            };
        }
        push @{$refs{$g}{$n}{'orig'}}, $self->replace_nowiki($matches[$i+0],$nowiki);
        if(defined($matches[$i+2]) && $matches[$i+2]=~/<ref[\s>]/){
            # Reference contains "<ref", so probably someone forgot a </ref>
            # somewhere (and then that's probably how it got "orphaned"). To be
            # safe, don't use it.
            $matches[$i+2]=undef;
            $refs{$g}{$n}{'broken'}=1;
        }
        if(defined($matches[$i+2]) && $matches[$i+2]=~/^\s*$/){
            # Apparently, some people really do this. Don't use empty refs.
            $matches[$i+2]=undef;
        }
        if($refs{$g}{$n}{'type'} eq '' && defined($matches[$i+2])){
            $refs{$g}{$n}{'type'}='ref';
            $refs{$g}{$n}{'repl'}=$self->replace_nowiki($matches[$i+0],$nowiki);
        }
    }

    # Any #tag references? Return now if not.
    return %refs unless $text=~/\x7b\x7b\s*#tag:\s*ref\s*[|\x7d]/;

    # Darn. Now we have to parse through the page and find all the #tag:refs
    # too. The callback receives the template name, an array ref of its
    # parameters and the original text of the template; it always returns
    # undef.
    $self->process_templates($text, sub {
        my $name=shift;
        my @params=@{shift()};
        my $orig=$self->replace_nowiki(shift,$nowiki);

        # Only {{#tag:ref|...}} is of interest, ignore every other template.
        return undef unless $name=~/^\s*#tag:\s*ref\s*$/is;
        my $g='';
        my $n=undef;
        my $bad=0;

        # The first parameter is the content of the reference; it is missing
        # altogether for a bare {{#tag:ref}}.
        my $c=shift(@params);
        $c=defined($c) ? $self->replace_nowiki($c,$nowiki) : '';
        $c='' unless defined($c);
        foreach (@params){
            if(/^\s*group\s*=\s*([\x22\x27]?)([^\x22\x27]+?)\1\s*$/i){
                $g=$self->replace_nowiki($2,$nowiki);
            } elsif(/^\s*name\s*=\s*([\x22\x27]?)([^\x22\x27]+?)\1\s*$/i){
                $n=$self->replace_nowiki($2,$nowiki);
            } else {
                $bad=1;
            }
        }

        if(!defined($n)){
            # We're not interested if it's unnamed. But strip it out if
            # it's unnamed and empty, because that's an error.
            if($c eq ''){
                push @$replacements, {
                    'orig' => $orig,
                    'repl' => ''
                };
            }
            return undef;
        }

        # If it had unrecognized parameters to the tag, strip them
        if($bad){
            my $old=$orig;
            $orig="\x7b\x7b#tag:ref|$c";
            $orig.="|name=$n" if defined($n);
            $orig.="|group=$g" if $g ne '';
            $orig.="\x7d\x7d";
            push @$replacements, {
                'orig' => $old,
                'repl' => $orig
            };
        }

        # Integer names cause errors, so replace them if found.
        if($n=~/^\d+$/){
            my $x="renamed_from_".$n."_on_".strftime('%Y%m%d%H%M%S', gmtime);
            # Leave the ref alone if the new name is already in use. This is
            # a callback and not a loop body, so return rather than "next".
            return undef if index($text, $x)>=0;
            my $old=$orig;
            $orig="\x7b\x7b#tag:ref|$c|name=$x";
            $orig.="|group=$g" if $g ne '';
            $orig.="\x7d\x7d";
            $n=$x;
            push @$replacements, {
                'orig' => $old,
                'repl' => $orig
            };
        }

        # Save detected reference
        $refs{$g}={} unless exists($refs{$g});
        if(!exists($refs{$g}{$n})){
            $refs{$g}{$n}={
                orig => [],
                type => '',
                content => undef
            };
        }
        if($c=~/^\s*$/){
            # Apparently, some people really do this. Don't use empty refs.
            $c='';
        }
        push @{$refs{$g}{$n}{'orig'}}, $orig;
        if($refs{$g}{$n}{'type'} eq '' && $c ne ''){
            $refs{$g}{$n}{'type'}='tag';
            $refs{$g}{$n}{'repl'}=$orig;
        }

        return undef;
    });

    return %refs;
}

1;