Reduce requests for existing items and remove whitespace: tested with wiki-smackdownneoseekercom_w

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@939 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
pull/117/head
nemobis 11 years ago
parent 54f9798be0
commit b74d6f79ce

@ -6,12 +6,12 @@
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or # the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version. # (at your option) any later version.
# #
# This program is distributed in the hope that it will be useful, # This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details. # GNU General Public License for more details.
# #
# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
@ -92,107 +92,125 @@ def upload(wikis):
if dump in uploadeddumps: if dump in uploadeddumps:
print '%s was uploaded before, skipping...' % (dump) print '%s was uploaded before, skipping...' % (dump)
continue continue
time.sleep(0.1) time.sleep(0.1)
wikidate = dump.split('-')[1] wikidate = dump.split('-')[1]
wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8] wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
print wiki, wikiname, wikidate, dump print wiki, wikiname, wikidate, dump
#get metadata from api.php # Does the item exist already?
#first sitename and base url
headers = {'User-Agent': dumpgenerator.getUserAgent()} headers = {'User-Agent': dumpgenerator.getUserAgent()}
params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'} itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers)
data = urllib.urlencode(params) if urllib2.urlopen(itemdata).read() == '{}':
req = urllib2.Request(url=wiki, data=data, headers=headers) ismissingitem = True
xml = '' else:
try: ismissingitem = False
f = urllib2.urlopen(req)
xml = f.read() # We don't know a way to fix/overwrite metadata if item exists already:
f.close() # just pass bogus data and save some time
except: if ismissingitem:
pass #get metadata from api.php
#first sitename and base url
sitename = '' params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
baseurl = '' data = urllib.urlencode(params)
lang = '' req = urllib2.Request(url=wiki, data=data, headers=headers)
try: xml = ''
sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
except:
pass
try:
baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
except:
pass
try:
lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
except:
pass
if not sitename:
sitename = wikiname
if not baseurl:
baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
if lang:
lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()
#now copyright info from API
params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
data = urllib.urlencode(params)
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
xml = f.read()
f.close()
except:
pass
rightsinfourl = ''
rightsinfotext = ''
try:
rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
except:
pass
#or copyright info from #footer in mainpage
if baseurl and not rightsinfourl and not rightsinfotext:
raw = ''
try: try:
f = urllib.urlopen(baseurl) f = urllib2.urlopen(req)
raw = f.read() xml = f.read()
f.close() f.close()
except: except:
pass pass
rightsinfotext = ''
rightsinfourl = '' sitename = ''
baseurl = ''
lang = ''
try:
sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
except:
pass
try:
baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
except:
pass
try: try:
rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0] lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
except: except:
pass pass
if not sitename:
sitename = wikiname
if not baseurl:
baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
if lang:
lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()
#now copyright info from API
params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
data = urllib.urlencode(params)
req = urllib2.Request(url=wiki, data=data, headers=headers)
xml = ''
try:
f = urllib2.urlopen(req)
xml = f.read()
f.close()
except:
pass
rightsinfourl = ''
rightsinfotext = ''
try: try:
rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0] rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
except: except:
pass pass
if rightsinfotext and not rightsinfourl:
rightsinfourl = baseurl + '#footer' #or copyright info from #footer in mainpage
if baseurl and not rightsinfourl and not rightsinfotext:
#retrieve some info from the wiki raw = ''
wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia try:
wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)# "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools." f = urllib.urlopen(baseurl)
wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki raw = f.read()
if not rightsinfourl and not rightsinfotext: f.close()
wikikeys.append('unknowncopyright') except:
pass
wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/ rightsinfotext = ''
wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found. rightsinfourl = ''
wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php try:
rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
except:
pass
try:
rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
except:
pass
if rightsinfotext and not rightsinfourl:
rightsinfourl = baseurl + '#footer'
#retrieve some info from the wiki
wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
wikidesc = "<a href=\"%s\">%s</a> dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools." % (baseurl, sitename)# "<a href=\"http://en.ecgpedia.org/\" rel=\"nofollow\">ECGpedia,</a>: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with <a href=\"http://code.google.com/p/wikiteam/\" rel=\"nofollow\">WikiTeam</a> tools."
wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
if not rightsinfourl and not rightsinfotext:
wikikeys.append('unknowncopyright')
wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
else:
lang = 'foo'
wikititle = 'foo'
wikidesc = 'foo'
wikikeys = 'foo'
wikilicenseurl = 'foo'
wikirights = 'foo'
wikiurl = 'foo'
#creates curl command #creates curl command
curl = ['curl', '--location', curl = ['curl', '--location',
'--header', "'x-amz-auto-make-bucket:1'", # Creates the item automatically, need to give some time for the item to correctly be created on archive.org, or everything else will fail, showing "bucket not found" error '--header', "'x-amz-auto-make-bucket:1'", # Creates the item automatically, need to give some time for the item to correctly be created on archive.org, or everything else will fail, showing "bucket not found" error
'--header', "'x-archive-queue-derive:0'", '--header', "'x-archive-queue-derive:0'",
'--header', "'x-archive-size-hint:%d'" % (os.path.getsize(dump)), '--header', "'x-archive-size-hint:%d'" % (os.path.getsize(dump)),
'--header', "'authorization: LOW %s:%s'" % (accesskey, secretkey), '--header', "'authorization: LOW %s:%s'" % (accesskey, secretkey),
] ]
if c == 0: if c == 0:
@ -207,7 +225,7 @@ def upload(wikis):
'--header', "'x-archive-meta-rights:%s'" % wikirights.replace("'", r"\'"), '--header', "'x-archive-meta-rights:%s'" % wikirights.replace("'", r"\'"),
'--header', quoteattr('x-archive-meta-originalurl:' + wikiurl), '--header', quoteattr('x-archive-meta-originalurl:' + wikiurl),
] ]
curl += ['--upload-file', "%s" % (dump), curl += ['--upload-file', "%s" % (dump),
"http://s3.us.archive.org/wiki-%s/%s" % (wikiname, dump), # It could happen that the identifier is taken by another user; only wikiteam collection admins will be able to upload more files to it, curl will fail immediately and get a permissions error by s3. "http://s3.us.archive.org/wiki-%s/%s" % (wikiname, dump), # It could happen that the identifier is taken by another user; only wikiteam collection admins will be able to upload more files to it, curl will fail immediately and get a permissions error by s3.
'> /dev/null', '> /dev/null',
@ -228,7 +246,8 @@ def upload(wikis):
if not os.system(curlline): if not os.system(curlline):
uploadeddumps.append(dump) uploadeddumps.append(dump)
log(wiki, dump, 'ok') log(wiki, dump, 'ok')
os.system(curlmetaline) if not ismissingitem:
os.system(curlmetaline)
c += 1 c += 1
def main(): def main():

Loading…
Cancel
Save