diff --git a/uploader.py b/uploader.py
index 1033a16..09512e9 100644
--- a/uploader.py
+++ b/uploader.py
@@ -65,6 +65,7 @@ def upload(wikis):
for dump in dumps:
time.sleep(0.1)
wikidate = dump.split('-')[1]
+ wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
print wiki, wikiname, wikidate, dump
#get metadata from api.php
@@ -73,14 +74,13 @@ def upload(wikis):
params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
data = urllib.urlencode(params)
req = urllib2.Request(url=wiki, data=data, headers=headers)
+ xml = ''
try:
f = urllib2.urlopen(req)
+ xml = f.read()
+ f.close()
except:
- print "Error while retrieving metadata from API, skiping this dump..."
- log(wiki, dump, 'missing metadata')
- continue
- xml = f.read()
- f.close()
+ pass
sitename = ''
baseurl = ''
@@ -90,6 +90,11 @@ def upload(wikis):
except:
pass
+ if not sitename:
+ sitename = wikiname
+ if not baseurl:
+ baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
+
#now copyright info from API
params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
data = urllib.urlencode(params)
@@ -111,7 +116,7 @@ def upload(wikis):
pass
#or copyright info from #footer in mainpage
- if not rightsinfourl and not rightsinfotext:
+ if baseurl and not rightsinfourl and not rightsinfotext:
f = urllib.urlopen(baseurl)
raw = f.read()
f.close()
@@ -128,15 +133,13 @@ def upload(wikis):
if rightsinfotext and not rightsinfourl:
rightsinfourl = baseurl + '#footer'
- if not sitename or not baseurl or not rightsinfourl or not rightsinfotext:
- print "Error while retrieving metadata from API, skiping this dump..."
- log(wiki, dump, 'missing metadata')
- continue
-
#retrieve some info from the wiki
wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
wikidesc = "<a href=\"%s\">%s</a> dumped with WikiTeam tools." % (baseurl, sitename)# "ECGpedia,: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with WikiTeam tools."
wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
+ if not rightsinfourl and not rightsinfotext:
+ wikikeys.append('unknowncopyright')
+
wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php