From b74d6f79ceb3680bb9407c8e563197ecac7c6682 Mon Sep 17 00:00:00 2001
From: nemobis
Date: Sun, 2 Feb 2014 11:58:49 +0000
Subject: [PATCH] Reduce requests for existing items and remove whitespace:
 tested with wiki-smackdownneoseekercom_w

git-svn-id: https://wikiteam.googlecode.com/svn/trunk@939 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
---
 uploader.py | 193 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 106 insertions(+), 87 deletions(-)

diff --git a/uploader.py b/uploader.py
index 7c66a29..f7427e3 100644
--- a/uploader.py
+++ b/uploader.py
@@ -6,12 +6,12 @@
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
-# 
+#
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
@@ -92,107 +92,125 @@ def upload(wikis):
             if dump in uploadeddumps:
                 print '%s was uploaded before, skipping...' % (dump)
                 continue
-            
+
             time.sleep(0.1)
             wikidate = dump.split('-')[1]
             wikidate_text = wikidate[0:4]+'-'+wikidate[4:6]+'-'+wikidate[6:8]
             print wiki, wikiname, wikidate, dump
-            
-            #get metadata from api.php
-            #first sitename and base url
+
+            # Does the item exist already?
             headers = {'User-Agent': dumpgenerator.getUserAgent()}
-            params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
-            data = urllib.urlencode(params)
-            req = urllib2.Request(url=wiki, data=data, headers=headers)
-            xml = ''
-            try:
-                f = urllib2.urlopen(req)
-                xml = f.read()
-                f.close()
-            except:
-                pass
-            
-            sitename = ''
-            baseurl = ''
-            lang = ''
-            try:
-                sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
-            except:
-                pass
-            try:
-                baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
-            except:
-                pass
-            try:
-                lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
-            except:
-                pass
-            
-            if not sitename:
-                sitename = wikiname
-            if not baseurl:
-                baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
-            if lang:
-                lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()
-            
-            #now copyright info from API
-            params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
-            data = urllib.urlencode(params)
-            req = urllib2.Request(url=wiki, data=data, headers=headers)
-            xml = ''
-            try:
-                f = urllib2.urlopen(req)
-                xml = f.read()
-                f.close()
-            except:
-                pass
-            
-            rightsinfourl = ''
-            rightsinfotext = ''
-            try:
-                rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
-                rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
-            except:
-                pass
-            
-            #or copyright info from #footer in mainpage
-            if baseurl and not rightsinfourl and not rightsinfotext:
-                raw = ''
+            itemdata = urllib2.Request(url='http://archive.org/metadata/wiki-' + wikiname, headers=headers)
+            if urllib2.urlopen(itemdata).read() == '{}':
+                ismissingitem = True
+            else:
+                ismissingitem = False
+
+            # We don't know a way to fix/overwrite metadata if item exists already:
+            # just pass bogus data and save some time
+            if ismissingitem:
+                #get metadata from api.php
+                #first sitename and base url
+                params = {'action': 'query', 'meta': 'siteinfo', 'format': 'xml'}
+                data = urllib.urlencode(params)
+                req = urllib2.Request(url=wiki, data=data, headers=headers)
+                xml = ''
                 try:
-                    f = urllib.urlopen(baseurl)
-                    raw = f.read()
+                    f = urllib2.urlopen(req)
+                    xml = f.read()
                     f.close()
                 except:
                     pass
-                rightsinfotext = ''
-                rightsinfourl = ''
+
+                sitename = ''
+                baseurl = ''
+                lang = ''
+                try:
+                    sitename = re.findall(ur"sitename=\"([^\"]+)\"", xml)[0]
+                except:
+                    pass
+                try:
+                    baseurl = re.findall(ur"base=\"([^\"]+)\"", xml)[0]
+                except:
+                    pass
                 try:
-                    rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
+                    lang = re.findall(ur"lang=\"([^\"]+)\"", xml)[0]
                 except:
                     pass
+
+                if not sitename:
+                    sitename = wikiname
+                if not baseurl:
+                    baseurl = re.sub(ur"(?im)/api\.php", ur"", wiki)
+                if lang:
+                    lang = convertlang.has_key(lang.lower()) and convertlang[lang.lower()] or lang.lower()
+
+                #now copyright info from API
+                params = {'action': 'query', 'siprop': 'general|rightsinfo', 'format': 'xml'}
+                data = urllib.urlencode(params)
+                req = urllib2.Request(url=wiki, data=data, headers=headers)
+                xml = ''
+                try:
+                    f = urllib2.urlopen(req)
+                    xml = f.read()
+                    f.close()
+                except:
+                    pass
+
+                rightsinfourl = ''
+                rightsinfotext = ''
                 try:
-                    rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
+                    rightsinfourl = re.findall(ur"rightsinfo url=\"([^\"]+)\"", xml)[0]
+                    rightsinfotext = re.findall(ur"text=\"([^\"]+)\"", xml)[0]
                 except:
                     pass
-                if rightsinfotext and not rightsinfourl:
-                    rightsinfourl = baseurl + '#footer'
-            
-            #retrieve some info from the wiki
-            wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
-            wikidesc = "<a href=\"%s\">%s</a> dumped with WikiTeam tools." % (baseurl, sitename)# "ECGpedia,: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with WikiTeam tools."
-            wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
-            if not rightsinfourl and not rightsinfotext:
-                wikikeys.append('unknowncopyright')
-            
-            wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
-            wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
-            wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
-            
+
+                #or copyright info from #footer in mainpage
+                if baseurl and not rightsinfourl and not rightsinfotext:
+                    raw = ''
+                    try:
+                        f = urllib.urlopen(baseurl)
+                        raw = f.read()
+                        f.close()
+                    except:
+                        pass
+                    rightsinfotext = ''
+                    rightsinfourl = ''
+                    try:
+                        rightsinfourl = re.findall(ur"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw)[0]
+                    except:
+                        pass
+                    try:
+                        rightsinfotext = re.findall(ur"<li id=\"copyright\">([^\n\r]*?)</li>", raw)[0]
+                    except:
+                        pass
+                    if rightsinfotext and not rightsinfourl:
+                        rightsinfourl = baseurl + '#footer'
+
+                #retrieve some info from the wiki
+                wikititle = "Wiki - %s" % (sitename) # Wiki - ECGpedia
+                wikidesc = "<a href=\"%s\">%s</a> dumped with WikiTeam tools." % (baseurl, sitename)# "ECGpedia,: a free electrocardiography (ECG) tutorial and textbook to which anyone can contribute, designed for medical professionals such as cardiac care nurses and physicians. Dumped with WikiTeam tools."
+                wikikeys = ['wiki', 'wikiteam', 'MediaWiki', sitename, wikiname] # ecg; ECGpedia; wiki; wikiteam; MediaWiki
+                if not rightsinfourl and not rightsinfotext:
+                    wikikeys.append('unknowncopyright')
+
+                wikilicenseurl = rightsinfourl # http://creativecommons.org/licenses/by-nc-sa/3.0/
+                wikirights = rightsinfotext # e.g. http://en.ecgpedia.org/wiki/Frequently_Asked_Questions : hard to fetch automatically, could be the output of API's rightsinfo if it's not a usable licenseurl or "Unknown copyright status" if nothing is found.
+                wikiurl = wiki # we use api here http://en.ecgpedia.org/api.php
+            else:
+                lang = 'foo'
+                wikititle = 'foo'
+                wikidesc = 'foo'
+                wikikeys = 'foo'
+                wikilicenseurl = 'foo'
+                wikirights = 'foo'
+                wikiurl = 'foo'
+
             #creates curl command
-            curl = ['curl', '--location', 
+            curl = ['curl', '--location',
                 '--header', "'x-amz-auto-make-bucket:1'", # Creates the item automatically, need to give some time for the item to correctly be created on archive.org, or everything else will fail, showing "bucket not found" error
                 '--header', "'x-archive-queue-derive:0'",
-                '--header', "'x-archive-size-hint:%d'" % (os.path.getsize(dump)), 
+                '--header', "'x-archive-size-hint:%d'" % (os.path.getsize(dump)),
                 '--header', "'authorization: LOW %s:%s'" % (accesskey, secretkey),
             ]
             if c == 0:
@@ -207,7 +225,7 @@ def upload(wikis):
                     '--header', "'x-archive-meta-rights:%s'" % wikirights.replace("'", r"\'"),
                     '--header', quoteattr('x-archive-meta-originalurl:' + wikiurl),
                 ]
-            
+
             curl += ['--upload-file', "%s" % (dump),
                 "http://s3.us.archive.org/wiki-%s/%s" % (wikiname, dump), # It could happen that the identifier is taken by another user; only wikiteam collection admins will be able to upload more files to it, curl will fail immediately and get a permissions error by s3.
                 '> /dev/null',
@@ -228,7 +246,8 @@ def upload(wikis):
             if not os.system(curlline):
                 uploadeddumps.append(dump)
                 log(wiki, dump, 'ok')
-                os.system(curlmetaline)
+                if not ismissingitem:
+                    os.system(curlmetaline)
             c += 1
 
 def main():