@@ -98,95 +98,113 @@ def upload(wikis):
wikidate_text = wikidate [ 0 : 4 ] + ' - ' + wikidate [ 4 : 6 ] + ' - ' + wikidate [ 6 : 8 ]
print wiki , wikiname , wikidate , dump
#get metadata from api.php
#first sitename and base url
# Does the item exist already?
headers = { ' User-Agent ' : dumpgenerator . getUserAgent ( ) }
params = { ' action ' : ' query ' , ' meta ' : ' siteinfo ' , ' format ' : ' xml ' }
data = urllib . urlencode ( params )
req = urllib2 . Request ( url = wiki , data = data , headers = headers )
xml = ' '
try :
f = urllib2 . urlopen ( req )
xml = f . read ( )
f . close ( )
except :
pass
sitename = ' '
baseurl = ' '
lang = ' '
try :
sitename = re . findall ( ur " sitename= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
try :
baseurl = re . findall ( ur " base= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
try :
lang = re . findall ( ur " lang= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
if not sitename :
sitename = wikiname
if not baseurl :
baseurl = re . sub ( ur " (?im)/api \ .php " , ur " " , wiki )
if lang :
lang = convertlang . has_key ( lang . lower ( ) ) and convertlang [ lang . lower ( ) ] or lang . lower ( )
#now copyright info from API
params = { ' action ' : ' query ' , ' siprop ' : ' general|rightsinfo ' , ' format ' : ' xml ' }
data = urllib . urlencode ( params )
req = urllib2 . Request ( url = wiki , data = data , headers = headers )
xml = ' '
try :
f = urllib2 . urlopen ( req )
xml = f . read ( )
f . close ( )
except :
pass
rightsinfourl = ' '
rightsinfotext = ' '
try :
rightsinfourl = re . findall ( ur " rightsinfo url= \" ([^ \" ]+) \" " , xml ) [ 0 ]
rightsinfotext = re . findall ( ur " text= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
#or copyright info from #footer in mainpage
if baseurl and not rightsinfourl and not rightsinfotext :
raw = ' '
itemdata = urllib2 . Request ( url = ' http://archive.org/metadata/wiki- ' + wikiname , headers = headers )
if urllib2 . urlopen ( itemdata ) . read ( ) == ' {} ' :
ismissingitem = True
else :
ismissingitem = False
# We don't know a way to fix/overwrite metadata if item exists already:
# just pass bogus data and save some time
if ismissingitem :
#get metadata from api.php
#first sitename and base url
params = { ' action ' : ' query ' , ' meta ' : ' siteinfo ' , ' format ' : ' xml ' }
data = urllib . urlencode ( params )
req = urllib2 . Request ( url = wiki , data = data , headers = headers )
xml = ' '
try :
f = urllib . urlopen ( baseurl )
raw = f . read ( )
f = urllib2 . urlopen ( req )
xml = f . read ( )
f . close ( )
except :
pass
rightsinfotext = ' '
rightsinfourl = ' '
sitename = ' '
baseurl = ' '
lang = ' '
try :
sitename = re . findall ( ur " sitename= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
try :
baseurl = re . findall ( ur " base= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
try :
rightsinfourl = re . findall ( ur " <link rel= \" copyright \" href= \" ([^ \" ]+) \" /> " , raw ) [ 0 ]
lang = re . findall ( ur " lang= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
if not sitename :
sitename = wikiname
if not baseurl :
baseurl = re . sub ( ur " (?im)/api \ .php " , ur " " , wiki )
if lang :
lang = convertlang . has_key ( lang . lower ( ) ) and convertlang [ lang . lower ( ) ] or lang . lower ( )
#now copyright info from API
params = { ' action ' : ' query ' , ' siprop ' : ' general|rightsinfo ' , ' format ' : ' xml ' }
data = urllib . urlencode ( params )
req = urllib2 . Request ( url = wiki , data = data , headers = headers )
xml = ' '
try :
f = urllib2 . urlopen ( req )
xml = f . read ( )
f . close ( )
except :
pass
rightsinfourl = ' '
rightsinfotext = ' '
try :
rightsinfotext = re . findall ( ur " <li id= \" copyright \" >([^ \n \r ]*?)</li> " , raw ) [ 0 ]
rightsinfourl = re . findall ( ur " rightsinfo url= \" ([^ \" ]+) \" " , xml ) [ 0 ]
rightsinfotext = re . findall ( ur " text= \" ([^ \" ]+) \" " , xml ) [ 0 ]
except :
pass
if rightsinfotext and not rightsinfourl :
rightsinfourl = baseurl + ' #footer '
#retrieve some info from the wiki
wikititle = " Wiki - %s " % ( sitename ) # Wiki - ECGpedia
wikidesc = " <a href= \" %s \" > %s </a> dumped with <a href= \" http://code.google.com/p/wikiteam/ \" rel= \" nofollow \" >WikiTeam</a> tools. " % ( baseurl , sitename ) # "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools."
wikikeys = [ ' wiki ' , ' wikiteam ' , ' MediaWiki ' , sitename , wikiname ] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
if not rightsinfourl and not rightsinfotext :
wikikeys . append ( ' unknowncopyright ' )
#or copyright info from #footer in mainpage
if baseurl and not rightsinfourl and not rightsinfotext :
raw = ' '
try :
f = urllib . urlopen ( baseurl )
raw = f . read ( )
f . close ( )
except :
pass
rightsinfotext = ' '
rightsinfourl = ' '
try :
rightsinfourl = re . findall ( ur " <link rel= \" copyright \" href= \" ([^ \" ]+) \" /> " , raw ) [ 0 ]
except :
pass
try :
rightsinfotext = re . findall ( ur " <li id= \" copyright \" >([^ \n \r ]*?)</li> " , raw ) [ 0 ]
except :
pass
if rightsinfotext and not rightsinfourl :
rightsinfourl = baseurl + ' #footer '
#retrieve some info from the wiki
wikititle = " Wiki - %s " % ( sitename ) # Wiki - ECGpedia
wikidesc = " <a href= \" %s \" > %s </a> dumped with <a href= \" http://code.google.com/p/wikiteam/ \" rel= \" nofollow \" >WikiTeam</a> tools. " % ( baseurl , sitename ) # "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools."
wikikeys = [ ' wiki ' , ' wikiteam ' , ' MediaWiki ' , sitename , wikiname ] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
if not rightsinfourl and not rightsinfotext :
wikikeys . append ( ' unknowncopyright ' )
wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
else :
lang = ' foo '
wikititle = ' foo '
wikidesc = ' foo '
wikikeys = ' foo '
wikilicenseurl = ' foo '
wikirights = ' foo '
wikiurl = ' foo '
#creates curl command
curl = [ ' curl ' , ' --location ' ,
@@ -228,7 +246,8 @@ def upload(wikis):
if not os . system ( curlline ) :
uploadeddumps . append ( dump )
log ( wiki , dump , ' ok ' )
os . system ( curlmetaline )
if not ismissingitem :
os . system ( curlmetaline )
c + = 1
def main ( ) :