User:Sz-iwbot/tineyecode
Jump to navigation
Jump to search
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This script checks the copyright status of newly uploaded images on Commons using TinEye.
"""
#
# (C) Shizhao, 2009
#
# Distributed under the terms of the MIT license.
#
#
import re,sys,wikipedia,urllib,urllib2, datetime,time,query,string
# Site handle shared by every wikipedia.Page(...) lookup in this script.
# getSite() is called without arguments, so presumably this is the default
# site from the bot's configuration -- confirm.
mysite = wikipedia.getSite()
def FormatPage(html):
    """Cut a result page down to the fragment that starts with its title.

    The text from the first ``<title>`` up to (not including) the first
    ``<link`` is kept.  Inside that fragment the ``<title>...</title>``
    element is collapsed onto a single line, and blanks/tabs that directly
    follow a newline are removed.

    Parameters:
        html: raw page source as a string.

    Returns:
        The trimmed, normalised fragment (it begins with ``<title>``).

    Raises:
        ValueError: if ``<title>`` or ``<link`` does not occur in ``html``.
    """
    RAW_BEGIN = '<title>'
    RAW_END = '<link'
    iStart = html.find(RAW_BEGIN)
    iEnd = html.find(RAW_END)
    if iStart < 0 or iEnd < 0:
        # A bare string is not a valid exception object (raising one is a
        # TypeError on Python 2.6+ and 3), so raise a real exception that
        # carries the same message.
        raise ValueError("not found CONTENTS")
    html = html[iStart:iEnd]

    # one line: join a title that is spread over several lines
    def oneline(matchobj):
        return matchobj.group(0).replace('\n', '')

    html = re.sub(r'(?ms)<title>.*?</title>', oneline, html)
    # remove spaces: strip indentation at the start of each following line
    html = re.sub(r'\n[ \t]+', r'\n', html)
    return html
#Check at tineye
def bot(tineyetext, imagepage):
    """Evaluate a TinEye result page for one image and log any matches.

    The number of matches is read from a leading ``<title>N ...</title>``
    in ``tineyetext``.  With zero matches only a message is printed.  With
    one or more matches, and unless the image page already carries the
    ``Similarimages`` template, a ``{{User:Sz-iwbot/sd}}`` entry is appended
    to the page ``User:Sz-iwbot/tineye`` and saved.

    Parameters:
        tineyetext: result page text; must start with the title element.
        imagepage: page object of the image being checked (its ``title()``
            and ``templates()`` methods are used).

    Note:
        The search URL written into the log entry is taken from the
        module-level name ``tineyeurl``, not from a parameter; the caller
        has to set it before calling.  Every call ends with a 3 second
        pause.
    """
    count_pattern = re.compile(r'^<title>(\d+)\D+</title>')
    try:
        # search() yields None when the title does not start with a number;
        # the AttributeError that follows is reported by the handler below.
        hits = int(count_pattern.search(tineyetext).groups()[0])
        if hits == 0:
            wikipedia.output(u'[[%s]] seems ok, similar images not found in tineye.' % imagepage.title())
        elif hits > 0:
            if "Similarimages" in imagepage.templates():
                wikipedia.output(u'[[%s]] have tag on image. pass...' % imagepage.title())
            else:
                logpage = wikipedia.Page(mysite, u'User:Sz-iwbot/tineye')
                current = logpage.get()
                entry = (u"\n{{User:Sz-iwbot/sd|image= " + imagepage.title()
                         + u" |n= " + str(hits)
                         + u" |url= " + tineyeurl + " }}")
                updated = current + entry
                wikipedia.output(u'[[%s]] found %s similar images in tineye.' % (imagepage.title(), hits))
                # Only the log page is edited; the image page itself is
                # not tagged.
                logpage.put(updated, u"Bot: Found similar images in tineye.")
    except AttributeError:
        wikipedia.output(u'ERROR: can\'t fetching images [[%s]]' % imagepage.title())
    # Pause after every lookup, whatever its outcome.
    wikipedia.output(u'...Sleep, waiting 3 s...\n')
    time.sleep(3)
# ---------------------------------------------------------------------------
# Main polling loop.
#
# Each cycle:
#   1. cleans the log page User:Sz-iwbot/tineye (drops entries whose image
#      no longer exists and entries with an empty image name);
#   2. walks the 50 newest uploads and, for every image not handled in an
#      earlier cycle, runs a TinEye search and hands the result to bot().
#
# NOTE(review): there is no delay between cycles (a 60 s sleep was disabled
# in the original); only bot() pauses, 3 s per lookup.
# ---------------------------------------------------------------------------

# only check these images (file name has to end in one of these extensions)
IMAGE_EXT_RE = re.compile(r'.*?(?:gif|png|jpg|jpeg)$', re.IGNORECASE)

# Images carrying one of these templates are reported as "pass".
PASS_TEMPLATES = [
    u'Flickrreview', u'PD-old', u'RetouchedPicture', u'PD-1923', u'PD-URAA',
    u'Not-PD-US-URAA', u'PD-old-100', u'PD-old-80', u'PD-old-75',
    u'PD-old-70', u'PD-old-50', u'PD-user-en', u'PD-user-w', u'patent',
    u'anonymous work', 'anonymous-EU', u'PD-anon-1923', u'PD-Art',
    u'PD-retouched-user', u'User:Flickr upload bot/upload',
    u'BotMoveToCommons', u'PD-Art-YorckProject', u'PermissionOTRS',
    u'PD-Coa-Hungary', u'PD-USGov-NASA', u'PD-Polish', u'PD-USGov-USDA-ARS',
    u'PD-RusEmpire', u'PD-USGov', u'LOC-image', u'PD-Bain',
    u'Historical blank world maps', u'Duplicate', u'PD-LT-exempt',
    u'PD-USGov-Military-Army', u'PD-US', u'Location',
]

# Extracts the URL of the preview image from the HTML of a file page.
FULL_IMAGE_RE = re.compile(r'<div class="fullImageLink" id="file"><a href=".*?"><img alt=".*?" src="(?P<url>[^ ]+?)" width')

# Log entries whose image name is empty.
EMPTY_ENTRY_RE = re.compile(r'\n\{\{User:Sz-iwbot\/sd\|image= \|n=.*?\}\}')


def _search_tineye(imageurl, imagepage):
    """Run a TinEye search for ``imageurl`` and pass the result to bot().

    A failed download (IOError) is reported and skipped instead of ending
    the whole run.
    """
    # bot() reads the search URL from the module-level name `tineyeurl`
    # rather than from a parameter, so it must be published globally.
    global tineyeurl
    tineyeurl = "http://tineye.com/search?url=" + urllib2.quote(imageurl)
    wikipedia.output(u'Get tineye URL: %s' % tineyeurl)
    try:
        html = urllib.urlopen(tineyeurl).read()
        tineyetext = FormatPage(html)
        bot(tineyetext, imagepage)
    except IOError:
        wikipedia.output(u"Skipping [[%s]] because IOError." % imagepage.title())


seen = set()
try:
    while True:
        if not seen:
            wikipedia.output(u'>>>Loading the new images<<<\n')
        else:
            # Single-argument print(...) prints the same text on Python 2
            # and also parses on Python 3.
            print('----- Current time: %s' % datetime.datetime.now())
            print('>>>Waiting load new images<<<\n')

        # clean page
        pagetemp = wikipedia.Page(mysite, u'User:Sz-iwbot/tineye')
        originaltext = pagetemp.get()
        texttemp = originaltext
        params = {
            'action': 'query',
            'prop': 'images',
            'titles': 'User:Sz-iwbot/tineye',
            'imlimit': '5000',
        }
        imagedata = query.GetData(params, useAPI=True, encodeTitle=False)
        # One title was requested, so take the single returned page instead
        # of relying on a hard-coded page id; a page without images has no
        # 'images' key.
        logpageinfo = list(imagedata['query']['pages'].values())[0]
        for imagetitle in logpageinfo.get('images', []):
            image = wikipedia.Page(mysite, imagetitle['title'])
            if not image.exists():
                # re.escape protects every regex metacharacter in the
                # title, not just parentheses and '+'.
                entry = re.compile(r'\n\{\{User:Sz-iwbot\/sd\|image= %s \|n= .*? \}\}' % re.escape(imagetitle['title']))
                texttemp = wikipedia.replaceExcept(texttemp, entry, '', exceptions=[])
                wikipedia.output(u'remove [[%s]], it have not exit.' % imagetitle['title'])
        texttemp = wikipedia.replaceExcept(texttemp, EMPTY_ENTRY_RE, '', exceptions=[])
        wikipedia.output(u'cleaning...')
        if texttemp == originaltext:
            wikipedia.output(u'>>>Page not change, not clean!<<<\n')
        else:
            pagetemp.put(texttemp, 'clean up')
            wikipedia.output(u'>>>CLEAN OK.<<<\n')

        blacklist = wikipedia.Page(mysite, u'User:Sz-iwbot/tineye/blacklist')
        # The linked pages do not change while the uploads are walked, so
        # collect them once per cycle.
        blacklisted = list(blacklist.linkedPages())
        for (imagepage, timestamp, user, comment) in mysite.newimages(number=50):
            userp = wikipedia.Page(mysite, u'User:' + user)
            if userp in blacklisted:
                wikipedia.output(u'Uploader %s in blacklist, pass...' % user)
                bpass = True
            else:
                bpass = False

            # no have checked images: handle each title only once per run
            if imagepage.title() in seen:
                continue
            seen.add(imagepage.title())

            try:
                pagetemplates = imagepage.templates()
                haspasstemplate = False
                for pt in PASS_TEMPLATES:
                    if pt in pagetemplates:
                        haspasstemplate = True
                        wikipedia.output(u'{{%s}} in imagepage, pass...' % pt)
                        break

                if IMAGE_EXT_RE.search(imagepage.title()) is None:
                    wikipedia.output(u"Skipping [[%s]] because can\'t check at tineye." % imagepage.title())
                    continue

                # NOTE(review): getFileVersionHistory() is compared with an
                # int as in the original.  On Python 2 a list is never
                # smaller than an int, so a pass template always skips the
                # image.  `len(...) < 2` may have been intended, but that
                # would re-enable the check for nearly every new upload --
                # confirm the intent before changing it.
                if haspasstemplate and not (imagepage.getFileVersionHistory() < 2):
                    continue

                #user group API data
                params = {
                    'action': 'query',
                    'list': 'users',
                    'ususers': user,
                    'usprop': 'groups',
                }
                data = query.GetData(params, useAPI=True, encodeTitle=False)
                group = data['query']['users'][0]
                # No check sysop or bot upload.
                # NOTE(review): a user record with exactly one key is taken
                # to mean "no special groups"; presumably the record then
                # holds only the name -- verify against the API response.
                if len(group) != 1:
                    wikipedia.output(u"Skipping [[%s]] because bot or sysop upload ." % imagepage.title())
                    continue
                if bpass:
                    continue

                try:
                    # get image width
                    params = {
                        'action': 'query',
                        'prop': 'imageinfo',
                        'titles': imagepage.title(),
                        'iiprop': 'size|url',
                    }
                    idata = query.GetData(params, useAPI=True, encodeTitle=False)
                    try:
                        info = list(idata['query']['pages'].values())[0]['imageinfo'][0]
                        # Compare as a number: the original compared the
                        # str() of the width with 300, which is always true
                        # on Python 2 and left the else branch dead.
                        width = int(info['width'])
                        try:
                            if width > 300:
                                # Wide image: search with the preview image
                                # shown on the file page.
                                imagehtml = imagepage.getImagePageHtml()
                                thumburl = FULL_IMAGE_RE.search(imagehtml).group('url')
                                _search_tineye(thumburl, imagepage)
                            else:
                                # Small image: search with the file itself.
                                _search_tineye(info['url'], imagepage)
                        except wikipedia.NoPage:
                            wikipedia.output(u"Skipping [[%s]] because it has been deleted." % imagepage.title())
                    except KeyError:
                        wikipedia.output(u'KeyError')
                except IndexError:
                    # No image info available: fall back to the file URL
                    # reported by the page object.
                    wikipedia.output(u'ERROR: Not found resolution of image [[%s]]' % imagepage.title())
                    try:
                        imageurl = wikipedia.ImagePage.fileUrl(imagepage)
                        _search_tineye(imageurl, imagepage)
                    except wikipedia.NoPage:
                        wikipedia.output(u"Skipping [[%s]] because it has been deleted." % imagepage.title())
            except AttributeError:
                # e.g. the preview URL could not be found in the page HTML
                wikipedia.output(u"Skipping [[%s]] because can\'t check at tineye." % imagepage.title())
finally:
    # The loop never ends on its own; make sure the framework is shut down
    # when the run is interrupted or fails.
    wikipedia.stopme()