Benutzer:Svebert/HA Bash
Dieses Skript ist die erste Version eines Bash-Skriptes, um Hauptautoren zu ermitteln. Bislang ist es sehr rudimentär, aber funktionstüchtig.
Hier kommt der Code des Skriptes. Diesen Code einfach in eine leere Datei (z. B. mit dem Namen author.sh) kopieren. Bedienungsanweisungen finden sich als Kommentar im Skript.
Der Vorteil dieses Skripts gegenüber anderen (s. Hauptautoren) ist, dass nichts extra installiert oder eingestellt werden muss. Dieses Skript ist auf einem normalen Linuxrechner lauffähig.
Achtung: Das Skript ist vom 17.06.2012, also sehr neu, und hat daher sicherlich viele Macken und Fehler. Ich werde weiter daran arbeiten. Im Endeffekt soll das Skript eine grafische Wortwolke erzeugen. Aber bis dahin ist es noch ein weiter Weg. Bitte meldet mir etwaige Fehler. Mir bekannte Fehler habe ich als Kommentar im Skript aufgeführt und werde diese bald beheben. Es ist ausdrücklich erwünscht, den hier gezeigten Code weiterzuentwickeln.
Viel Spaß
#!/bin/bash
#####################################################################################
# author.sh -- count the edits and the edited bytes per user for one Wikipedia article
# Script by Svebert (06/2012), version 0.6
#####################################################################################
#
# Usage (run it in a Linux terminal):
#   source author.sh                 # the result is printed to the terminal
#   source author.sh > result.txt    # ... or redirected into a file
#
# ---------------------------------- Settings --------------------------------------
# Lemma (title) of the article to inspect.
LEMMA="Trägheitskraft"
# Columns of the result table used as primary and secondary sort key:
#   2 = EDIT_COUNT, 3 = ABS BYTES EDITED, 4 = BYTES EDITED, 5 = SCORE
SORT_BY_COLUMN_PRIMARY="5"
SORT_BY_COLUMN_SECONDARY="2"
# Aggregate IPs: "y" or "n". With "y" all IPs are handled as one user named @.
# The script is much faster with AGG_IP set to "y".
AGG_IP='y'
#
# ------------------------------- Idea of this script ------------------------------
# All other main-author programs listed at WP:Hauptautor need something installed
# first. This script tries to run without any extra programs on a "normal" Linux
# machine:
#   1. The version history of the lemma is downloaded with wget through the Wiki
#      API and stored in temporary xml files.
#   2. The xml files are parsed with xpath; edits and bytes are counted per user.
#   3. The result is printed to stdout.
#
# ------------------------------ Known bugs and TODOs ------------------------------
#  * xpath prints 'Value: Query didn't return a nodeset.' to stderr for no(?) reason
#  * The script is still slow, especially for articles with a long version history
#  * Exclude IPs and/or bots
#
############################ Nothing to edit below this line ########################
# Drop the result arrays a previous run may have left behind in this shell
# (the script is meant to be sourced, so an aborted run leaves them set).
unset USER_NAMES CUM_BYTES ABS_CUM_BYTES EDIT_COUNT
REV_LIMIT=75       # revisions per request; the API maximum is 500
TMP_FILE='tmp.xml' # name prefix of the temporary files: tmp.xml0, tmp.xml1, ...
# Replace whitespace in LEMMA with underscores, otherwise the URL does not work.
LEMMA=${LEMMA//[[:space:]]/_}
# rawcontinue=1 asks the API for the legacy <query-continue> element that the
# loop below parses. Without it current MediaWiki versions report the
# continuation elsewhere and the download would stop after the first file.
WIKI_URL="http://de.wikipedia.org/w/api.php?action=query&prop=revisions&format=xml&rvprop=timestamp|user|size|comment&rvlimit=$REV_LIMIT&rvdir=older&rawcontinue=1&titles=$LEMMA"
# Download the version history
echo "Starting download..." >&2
AGAIN=1
CONTINUE_TAG=""
RVSTARTID=""
file_no=0
USER_STR=""
USER_STR_T=""
# Fetch one file per request until the API reports no further continuation.
while [[ "$AGAIN" == 1 ]]; do
  if ! wget -O "$TMP_FILE$file_no" "$WIKI_URL$CONTINUE_TAG"; then
    echo "Download failed: $WIKI_URL$CONTINUE_TAG" >&2
    # Remove what was fetched so far (wget -O creates the file even on failure).
    for (( i = 0; i <= file_no; i++ )); do
      rm -f -- "$TMP_FILE$i"
    done
    # 'return' ends a sourced script without closing the user's shell;
    # it fails when the script is executed directly, then 'exit' takes over.
    return 1 2>/dev/null || exit 1
  fi
  # A rvcontinue attribute means there are more revisions to fetch. The xpath
  # output looks like ' rvcontinue="..."'; strip quotes and outer whitespace so
  # that it can be appended to the URL as a parameter.
  RVSTARTID=$(xpath -q -e "/api/query-continue/revisions/@rvcontinue" "$TMP_FILE$file_no" \
    | sed -e 's/"//g' -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
  if [[ -z "$RVSTARTID" ]]; then
    AGAIN=0
    CONTINUE_TAG=""
  else
    CONTINUE_TAG="&$RVSTARTID"
    AGAIN=1
  fi
  file_no=$(( file_no + 1 ))
done
echo "Finished download ($file_no files)." >&2
# Result arrays, filled by the counting loop. Entries that share an index
# belong to the same user.
declare -a USER_NAMES CUM_BYTES ABS_CUM_BYTES EDIT_COUNT
# Running totals: number of distinct users and number of revisions seen.
user_counter=0; edit_counter=0
#######################################
# Absolute value of an integer.
# Globals:   value (written) - receives the result
# Arguments: $1 - integer
# Returns:   always 0 (the former 'let' made the status 1 for a result of 0)
#######################################
abs() {
  local n=$1
  if (( n < 0 )); then
    value=$(( -n ))   # negative: change sign
  else
    value=$(( n ))    # otherwise: leave it alone
  fi
  return 0
}
#######################################
# Test whether a string is a dotted-quad IPv4 address (IPv4 only).
# Globals:   none (IFS is neither read nor modified)
# Arguments: $1 - string to test
# Returns:   0 if $1 consists of four groups of 1-3 digits, each <= 255,
#            1 otherwise
#######################################
valid_ip() {
  local ip=$1
  local octet
  local -r quad='^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$'
  [[ $ip =~ $quad ]] || return 1
  # BASH_REMATCH[1..4] hold the four octets.
  for octet in "${BASH_REMATCH[@]:1}"; do
    # 10# forces base 10: a leading zero would otherwise mean octal, and
    # octets such as 08 or 009 would raise an arithmetic error.
    (( 10#$octet <= 255 )) || return 1
  done
  return 0
}
#######################################
# Print one attribute of one <rev> element without the 'name=' prefix and
# without the quotes that xpath prints around the value.
# Arguments: $1 - xml file
#            $2 - 1-based position of the <rev> element
#            $3 - attribute name (user, size, ...)
# Outputs:   the attribute value on stdout
#######################################
rev_attr() {
  xpath -q -e "//rev[$2]/@$3" "$1" \
    | sed -e "s/$3=//g" -e 's/"//g' -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}

loop_counter=0
# Start the byte total at zero: the script is sourced, so the total of a
# previous run in the same shell would otherwise be added to.
TOTAL_EDITED_BYTES=0
# Only an upper bound for the progress messages (files times batch size).
total_revs=$(( file_no * REV_LIMIT ))
# Loop over all downloaded files
for (( i = 0; i < file_no; i++ )); do
  # Number of revisions in this file
  NO_REVS=$(xpath -q -e "count(//rev)" "$TMP_FILE$i")
  edit_counter=$(( edit_counter + NO_REVS ))
  # Loop over all revisions in this file
  for (( l = 1; l <= NO_REVS; l++ )); do
    exists="n"
    # User name of this revision; quoted throughout so that names containing
    # blanks or glob characters reach the comparisons unchanged.
    CURRENT_USER=$(rev_attr "$TMP_FILE$i" "$l" user)
    if [[ "$AGG_IP" == "y" ]] && valid_ip "$CURRENT_USER"; then
      # Replace the IP by @ because it is the only symbol not allowed in user names
      CURRENT_USER="@"
    fi
    CURRENT_BYTES=$(rev_attr "$TMP_FILE$i" "$l" size)
    # Size of the revision before this one (the files were requested with
    # rvdir=older, so that is the next entry); needed for the contributed bytes.
    if (( l < NO_REVS )); then
      ll=$(( l + 1 ))
      CURRENT_BYTES2=$(rev_attr "$TMP_FILE$i" "$ll" size)
    else
      # Last entry of this file: the predecessor is the first entry of the
      # next file, or nothing at all (size 0) for the very last revision.
      ii=$(( i + 1 ))
      if (( ii < file_no )); then
        CURRENT_BYTES2=$(rev_attr "$TMP_FILE$ii" 1 size)
      else
        CURRENT_BYTES2=0
      fi
    fi
    # Contributed bytes and their absolute value (abs stores it in $value)
    CURRENT_CUM_BYTES=$(( CURRENT_BYTES - CURRENT_BYTES2 ))
    abs "$CURRENT_CUM_BYTES"
    ABS_CURRENT_CUM_BYTES=$value
    # Check whether this user is already in the USER_NAMES array; the search
    # runs from the most recently added user backwards.
    for (( k = user_counter - 1; k >= 0; k-- )); do
      if [[ "$CURRENT_USER" == "${USER_NAMES[k]}" ]]; then
        exists="y"
        current_user_index=$k
        break
      fi
    done
    # Either add CURRENT_USER as a new user or add the values to the existing entry
    if [[ "$exists" == "n" ]]; then
      current_user_index=$user_counter
      USER_NAMES[current_user_index]=$CURRENT_USER
      EDIT_COUNT[current_user_index]=1
      CUM_BYTES[current_user_index]=$CURRENT_CUM_BYTES
      ABS_CUM_BYTES[current_user_index]=$ABS_CURRENT_CUM_BYTES
      printf '%s\n' "USER($loop_counter/$total_revs)=${USER_NAMES[current_user_index]}" >&2
      user_counter=$(( user_counter + 1 ))
    else
      printf '%s\n' "USER($loop_counter/$total_revs)+=${USER_NAMES[current_user_index]}" >&2
      EDIT_COUNT[current_user_index]=$(( EDIT_COUNT[current_user_index] + 1 ))
      CUM_BYTES[current_user_index]=$(( CUM_BYTES[current_user_index] + CURRENT_CUM_BYTES ))
      ABS_CUM_BYTES[current_user_index]=$(( ABS_CUM_BYTES[current_user_index] + ABS_CURRENT_CUM_BYTES ))
    fi
    TOTAL_EDITED_BYTES=$(( TOTAL_EDITED_BYTES + ABS_CURRENT_CUM_BYTES ))
    loop_counter=$(( loop_counter + 1 ))
  done
done
# Write the data as a text table to stdout, sorted by the configured columns.
# Progress messages go to stderr so that stdout carries only the table.
echo "Printing results..." >&2
TOTAL_EDITS=$edit_counter
UNIQUE_USERS=$user_counter
printf '%s\n' "#LEMMA=$LEMMA"
printf '%s\n' "#TOTAL_EDITS=$TOTAL_EDITS, #TOTAL_EDITED_BYTES=$TOTAL_EDITED_BYTES, #UNIQUE_USERS=$UNIQUE_USERS"
printf '#USER\t#EDITS\t#ABS_BYTES_EDITED\t#BYTES_EDITED\t#SCORE\n'
for (( counter = 0; counter < ${#USER_NAMES[@]}; counter++ )); do
  # Blanks in the user name would split the row into extra columns.
  tmp_usr_nm=${USER_NAMES[counter]//[[:space:]]/_}
  bytes=${ABS_CUM_BYTES[counter]}
  edit_count=${EDIT_COUNT[counter]}
  # SCORE = share of all edits + (edit_count-1)/edit_count * share of all edited bytes
  if (( TOTAL_EDITED_BYTES > 0 )); then
    SCORE=$(echo "scale=4;$edit_count/$TOTAL_EDITS + ($edit_count-1)/$edit_count*$bytes/$TOTAL_EDITED_BYTES" | bc)
  else
    # No bytes were edited at all: leave the byte term out instead of
    # letting bc divide by zero.
    SCORE=$(echo "scale=4;$edit_count/$TOTAL_EDITS" | bc)
  fi
  # printf '%s' prints the name literally; 'echo -e' would interpret backslashes in it.
  printf '%s\t%s\t%s\t%s\t%s\n' \
    "$tmp_usr_nm" "$edit_count" "$bytes" "${CUM_BYTES[counter]}" "$SCORE"
done \
  | LC_ALL=C sort -rn \
      -k "$SORT_BY_COLUMN_PRIMARY,$SORT_BY_COLUMN_PRIMARY" \
      -k "$SORT_BY_COLUMN_SECONDARY,$SORT_BY_COLUMN_SECONDARY" \
  | column -t
# LC_ALL=C above: bc always prints '.' as decimal point, while 'sort -n' would
# follow the locale (e.g. ',' in a German locale). Each key is limited to its
# own column (N,N) instead of running to the end of the line.
echo "done." >&2
# Delete the arrays and the byte total so that nothing is carried over into the
# next run in this shell. USER_ARR is not set anywhere in the visible script;
# unsetting it is harmless.
unset USER_ARR USER_NAMES CUM_BYTES ABS_CUM_BYTES EDIT_COUNT TOTAL_EDITED_BYTES
# Remove the temporary files
for (( i = 0; i < file_no; i++ )); do
  rm -- "$TMP_FILE$i"
done