From c8e11a949b4c1e0bfa95c09794f2e932d5e53387 Mon Sep 17 00:00:00 2001 From: balr0g <balrog032@gmail.com> Date: Mon, 30 Jun 2014 20:14:44 -0400 Subject: [PATCH 1/3] Initial port to Requests --- dumpgenerator.py | 372 ++++++++++++++++------------------------------- 1 file changed, 127 insertions(+), 245 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 3d62e65..1ad7065 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -24,51 +24,29 @@ import cPickle import datetime import argparse import json -import gzip try: from hashlib import md5 except ImportError: # Python 2.4 compatibility from md5 import new as md5 import os import re -import StringIO +import requests import subprocess import sys import time import urllib -import urllib2 __VERSION__ = '0.2.2' #major, minor, micro def getVersion(): return(__VERSION__) -# This class is from https://github.com/crustymonkey/py-sonic/blob/master/libsonic/connection.py#L50 -class POSTHTTPRedirectHandler(urllib2.HTTPRedirectHandler): - def redirect_request(self, req, fp, code, msg, headers, newurl): - m = req.get_method() - if (code in (301, 302, 303, 307) and m in ("GET", "HEAD") - or code in (301, 302, 303, 307) and m == "POST"): - newurl = newurl.replace(' ', '%20') - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type") - ) - data = None - if req.has_data(): - data = req.get_data() - return urllib2.Request(newurl, - data=data, - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True) - else: - raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) def truncateFilename(other={}, filename=''): """ Truncate filenames when downloading images with large filenames """ return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1] -def delay(config={}): +def delay(config={}, session=None): """ Add a delay if configured for that """ if config['delay'] > 0: print 'Sleeping... %d seconds...' % (config['delay']) @@ -95,20 +73,15 @@ def cleanHTML(raw=''): sys.exit() return raw -def getNamespacesScraper(config={}): +def getNamespacesScraper(config={}, session=None): """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """ """ Function called if no API is available """ namespaces = config['namespaces'] namespacenames = {0:''} # main is 0, no prefix if namespaces: - req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Allpages', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}) - f = urllib2.urlopen(req) - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() - else: - raw = f.read() - f.close() - delay(config=config) + r = session.post(url=config['index'], data={'title': 'Special:Allpages', }, headers={'User-Agent': getUserAgent()}) + raw = r.text + delay(config=config, session=session) m = re.compile(r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>').finditer(raw) # [^>]*?
to include selected="selected" if 'all' in namespaces: @@ -131,19 +104,14 @@ def getNamespacesScraper(config={}): print '%d namespaces found' % (len(namespaces)) return namespaces, namespacenames -def getNamespacesAPI(config={}): +def getNamespacesAPI(config={}, session=None): """ Uses the API to get the list of namespaces names and ids """ namespaces = config['namespaces'] namespacenames = {0:''} # main is 0, no prefix if namespaces: - req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}) - f = urllib2.urlopen(req) - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()) - else: - result = json.loads(f.read()) - f.close() - delay(config=config) + r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}, headers={'User-Agent': getUserAgent()}) + result = json.loads(r.text) + delay(config=config, session=session) if 'all' in namespaces: namespaces = [] @@ -169,10 +137,10 @@ def getNamespacesAPI(config={}): print '%d namespaces found' % (len(namespaces)) return namespaces, namespacenames -def getPageTitlesAPI(config={}): +def getPageTitlesAPI(config={}, session=None): """ Uses the API to get the list of page titles """ titles = [] - namespaces, namespacenames = getNamespacesAPI(config=config) + namespaces, namespacenames = getNamespacesAPI(config=config, session=session) for namespace in namespaces: if namespace in config['exnamespaces']: print ' Skipping namespace = %d' % (namespace) @@ -180,29 +148,14 @@ def getPageTitlesAPI(config={}): c = 0 print ' Retrieving titles in the namespace %d' % (namespace) - headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'} + headers = {'User-Agent': getUserAgent()} apfrom = '!' while apfrom: sys.stderr.write('.') #progress params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500} - data = urllib.urlencode(params) - req = urllib2.Request(url=config['api'], data=data, headers=headers) - try: - f = urllib2.urlopen(req) - except: - try: - print '(1) Server is slow... Waiting some seconds and retrying...' - time.sleep(10) - f = urllib2.urlopen(req) - except: - print 'An error has occurred while retrieving page titles with API' - print 'Please, resume the dump, --resume' - sys.exit() - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - jsontitles = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8')) - else: - jsontitles = json.loads(unicode(f.read(), 'utf-8')) - f.close() + r = session.post(url=config['api'], data=params, headers=headers) + #FIXME Handle HTTP errors here! 
+ jsontitles = json.loads(r.text) apfrom = '' if jsontitles.has_key('query-continue') and jsontitles['query-continue'].has_key('allpages'): if jsontitles['query-continue']['allpages'].has_key('apcontinue'): @@ -218,23 +171,19 @@ def getPageTitlesAPI(config={}): titles = list(set(titles)) apfrom = '' c += len(jsontitles['query']['allpages']) - delay(config=config) + delay(config=config, session=session) print ' %d titles retrieved in the namespace %d' % (c, namespace) return titles -def getPageTitlesScraper(config={}): +def getPageTitlesScraper(config={}, session=None): """ """ titles = [] - namespaces, namespacenames = getNamespacesScraper(config=config) + namespaces, namespacenames = getNamespacesScraper(config=config, session=session) for namespace in namespaces: print ' Retrieving titles in the namespace', namespace url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace) - req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}) - f = urllib2.urlopen(req) - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() - else: - raw = f.read() + r = session.get(url=url, headers={'User-Agent': getUserAgent()}) + raw = r.text raw = cleanHTML(raw) r_title = r'title="(?P<title>[^>]+)">' @@ -269,24 +218,20 @@ def getPageTitlesScraper(config={}): if not name in checked_suballpages: checked_suballpages.append(name) #to avoid reload dupe subpages links - delay(config=config) - req2 = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}) - f = urllib2.urlopen(req2) - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - raw2 = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() - else: - raw2 = f.read() + delay(config=config, session=session) + r2 = session.get(url=url, headers={'User-Agent': getUserAgent()}) + raw2 = r2.text raw2 = cleanHTML(raw2) rawacum += raw2 #merge it after removed junk print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages' - delay(config=config) + delay(config=config, session=session) c += 1 c = 0 m = re.compile(r_title).finditer(rawacum) for i in m: - t = undoHTMLEntities(text=unicode(i.group('title'), 'utf-8')) + t = undoHTMLEntities(text=i.group('title')) if not t.startswith('Special:'): if not t in titles: titles.append(t) @@ -294,7 +239,7 @@ def getPageTitlesScraper(config={}): print ' %d titles retrieved in the namespace %d' % (c, namespace) return titles -def getPageTitles(config={}): +def getPageTitles(config={}, session=None): """ Get list of page titles """ #http://en.wikipedia.org/wiki/Special:AllPages #http://archiveteam.org/index.php?title=Special:AllPages @@ -304,9 +249,9 @@ def getPageTitles(config={}): titles = [] if config['api']: - titles = getPageTitlesAPI(config=config) + titles = getPageTitlesAPI(config=config, session=session) elif config['index']: - titles = getPageTitlesScraper(config=config) + titles = getPageTitlesScraper(config=config, session=session) titles = list(set(titles)) #removing dupes (e.g.
in CZ appears Widget:AddThis two times (main namespace and widget namespace)) titles.sort() #sorting @@ -314,22 +259,22 @@ def getPageTitles(config={}): print '%d page titles loaded' % (len(titles)) return titles -def getXMLHeader(config={}): +def getXMLHeader(config={}, session=None): """ Retrieve a random page to extract XML headers (namespace info, etc) """ #get the header of a random page, to attach it in the complete XML backup #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x.... randomtitle = 'Main_Page' #previously AMF5LKE43MNFGHKSDMRTJ - xml = getXMLPage(config=config, title=randomtitle, verbose=False) + xml = getXMLPage(config=config, title=randomtitle, verbose=False, session=session) header = xml.split('</mediawiki>')[0] if not xml: print 'XML export on this wiki is broken, quitting.' sys.exit() return header -def getXMLFileDesc(config={}, title=''): +def getXMLFileDesc(config={}, title='', session=None): """ Get XML for image description page """ config['curonly'] = 1 #tricky to get only the most recent desc - return getXMLPage(config=config, title=title, verbose=False) + return getXMLPage(config=config, title=title, verbose=False, session=session) def getUserAgent(): """ Return a cool user-agent to hide Python user-agent """ @@ -348,7 +293,7 @@ def logerror(config={}, text=''): f.write(output.encode('utf-8')) f.close() -def getXMLPageCore(headers={}, params={}, config={}): +def getXMLPageCore(headers={}, params={}, config={}, session=None): """ """ #returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki> #if retrieving params['limit'] revisions fails, returns a current only version @@ -380,30 +325,14 @@ def getXMLPageCore(headers={}, params={}, config={}): print ' Saving in the errors log, and skipping...' logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (params['pages'])) return '' # empty xml - - data = urllib.urlencode(params) - req = urllib2.Request(url=config['index'], data=data, headers=headers) - try: - f = urllib2.urlopen(req) - except: - try: - print '(2) Server is slow... Waiting some seconds and retrying...' - time.sleep(15) - f = urllib2.urlopen(req) - except: - print 'An error has occurred while retrieving "%s"' % (params['pages']) - print 'Please, resume the dump, --resume' - sys.exit() - # The error is usually temporary, but we exit the dump altogether. 
- if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - xml = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() - else: - xml = f.read() + #FIXME HANDLE HTTP Errors HERE + r = session.post(url=config['index'], data=params, headers=headers) + xml = r.text c += 1 return xml -def getXMLPage(config={}, title='', verbose=True): +def getXMLPage(config={}, title='', verbose=True, session=None): """ Get the full history (or current only) of a page """ #if server errors occurs while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partialy truncated @@ -414,8 +343,8 @@ def getXMLPage(config={}, title='', verbose=True): title_ = title title_ = re.sub(' ', '_', title_) #do not convert & into %26, title_ = re.sub('&', '%26', title_) - headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'} - params = {'title': 'Special:Export', 'pages': title_.encode('utf-8'), 'action': 'submit', } + headers = {'User-Agent': getUserAgent()} + params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit', } if config['curonly']: params['curonly'] = 1 params['limit'] = 1 @@ -425,7 +354,7 @@ def getXMLPage(config={}, title='', verbose=True): if config.has_key('templates') and config['templates']: #in other case, do not set params['templates'] params['templates'] = 1 - xml = getXMLPageCore(headers=headers, params=params, config=config) + xml = getXMLPageCore(headers=headers, params=params, config=config, session=session) #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available #else, warning about Special:Export truncating large page histories @@ -433,7 +362,7 @@ def getXMLPage(config={}, title='', verbose=True): if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one while not truncated and params['offset']: #next chunk params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML - xml2 = getXMLPageCore(headers=headers, params=params, config=config) + xml2 = getXMLPageCore(headers=headers, params=params, config=config, session=session) if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no <page></page>? 
if re.findall(r_timestamp, xml2)[-1] == params['offset']: @@ -475,11 +404,11 @@ def cleanXML(xml=''): xml = xml.split('</mediawiki>')[0] return xml -def generateXMLDump(config={}, titles=[], start=''): +def generateXMLDump(config={}, titles=[], start='', session=None): """ Generates a XML dump for a list of titles """ print 'Retrieving the XML for every page from "%s"' % (start and start or 'start') - header = getXMLHeader(config=config) + header = getXMLHeader(config=config, session=session) footer = '</mediawiki>\n' #new line at the end xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history') xmlfile = '' @@ -508,7 +437,7 @@ def generateXMLDump(config={}, titles=[], start=''): #requested complete xml dump lock = False xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w') - xmlfile.write(header) + xmlfile.write(header.encode('utf-8')) xmlfile.close() xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a') @@ -520,17 +449,17 @@ def generateXMLDump(config={}, titles=[], start=''): lock = False if lock: continue - delay(config=config) + delay(config=config, session=session) if c % 10 == 0: print 'Downloaded %d pages' % (c) - xml = getXMLPage(config=config, title=title) + xml = getXMLPage(config=config, title=title, session=session) xml = cleanXML(xml=xml) if not xml: logerror(config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title)) #here, XML is a correct <page> </page> chunk or #an empty string due to a deleted page (logged in errors log) or #an empty string due to an error while retrieving the page from server (logged in errors log) - xmlfile.write(xml) + xmlfile.write(xml.encode('utf-8')) c += 1 xmlfile.write(footer) xmlfile.close() @@ -547,18 +476,18 @@ def saveTitles(config={}, titles=[]): print 'Titles saved at...', titlesfilename -def saveImageFilenamesURL(config={}, images=[]): +def saveImageFilenamesURL(config={}, images=[], session=None): """ Save image list in a file, including filename, url and uploader """ imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date']) imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w') - output = u"%s\n--END--" % (u'\n'.join([u'%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images])) - imagesfile.write(output.encode('utf-8')) + imagesfile.write(('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]).encode('utf-8'))) + imagesfile.write('\n--END--') imagesfile.close() print 'Image filenames and URLs saved at...', imagesfilename -def getImageFilenamesURL(config={}): +def getImageFilenamesURL(config={}, session=None): """ Retrieve file list: filename, url, uploader """ print 'Retrieving image filenames' @@ -569,14 +498,9 @@ def getImageFilenamesURL(config={}): retries = 5 while offset: #5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}) - f = urllib2.urlopen(req) - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - raw = unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8') - else: - raw = unicode(f.read(), 'utf-8') - f.close() - delay(config=config) + r = 
session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }, headers={'User-Agent': getUserAgent()}) + raw = r.text + delay(config=config, session=session) if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki if limit > 10: print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit) @@ -645,36 +569,20 @@ def getImageFilenamesURL(config={}): images.sort() return images -def getImageFilenamesURLAPI(config={}): +def getImageFilenamesURLAPI(config={}, session=None): """ Retrieve file list: filename, url, uploader """ print 'Retrieving image filenames' - headers = {'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'} + headers = {'User-Agent': getUserAgent()} aifrom = '!' images = [] while aifrom: sys.stderr.write('.') #progress params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500} - data = urllib.urlencode(params) - req = urllib2.Request(url=config['api'], data=data, headers=headers) - try: - f = urllib2.urlopen(req) - except: - try: - print '(3) Server is slow... Waiting some seconds and retrying...' - time.sleep(10) - f = urllib2.urlopen(req) - except: - print 'An error has occurred while retrieving page titles with API' - print 'Please, resume the dump, --resume' - sys.exit() - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - jsonimages = json.loads(unicode(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(), 'utf-8')) - else: - jsonimages = json.loads(unicode(f.read(), 'utf-8')) - f.close() - #print jsonimages - delay(config=config) + #FIXME Handle HTTP Errors HERE + r = session.post(url=config['api'], data=params, headers=headers) + jsonimages = json.loads(r.text) + delay(config=config, session=session) aifrom = '' if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'): if jsonimages['query-continue']['allimages'].has_key('aicontinue'): @@ -691,13 +599,8 @@ def getImageFilenamesURLAPI(config={}): domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain url = u'%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url url = re.sub(' ', '_', url) - if image.has_key('name'): - #some API returns image name http://hastur.net/w/api.php?action=query&list=allimages&aiprop=user|url&ailimit=10 - filename = re.sub('_', ' ', image['name']) - else: - #other not http://wiki.annotation.jp/api.php?action=query&list=allimages&aiprop=user|url&ailimit=10 - #tips for dealing with unquote http://stackoverflow.com/questions/5139249/python-url-unquote-unicode - filename = re.sub('_', ' ', unicode(urllib2.unquote(url.encode('ascii')).split('/')[-1], 'utf-8')) + # encoding to ascii is needed to work around this horrible bug: http://bugs.python.org/issue8136 + filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii','ignore')), 'utf-8') uploader = re.sub('_', ' ', image['user']) images.append([filename, url, uploader]) @@ -720,7 +623,7 @@ def undoHTMLEntities(text=''): return text -def generateImageDump(config={}, other={}, images=[], start=''): +def generateImageDump(config={}, other={}, images=[], start='', session=None): """ Save files and descriptions using a file list """ #fix use subdirectories md5 @@ -739,11 +642,11 @@ def generateImageDump(config={}, other={}, 
images=[], start=''): lock = False if lock: continue - delay(config=config) + delay(config=config, session=session) #saving file #truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash limit). Later .desc is added to filename, so better 100 as max) - filename2 = filename + filename2 = urllib.unquote(filename) if len(filename2) > other['filenamelimit']: # split last . (extension) and then merge filename2 = truncateFilename(other=other, filename=filename2) @@ -761,21 +664,21 @@ def generateImageDump(config={}, other={}, images=[], start=''): # TODO: data=urllib.urlencode({}) removed image; request fails on wikipedia and POST neither works? #saving description if any - xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename)) # use Image: for backwards compatibility + xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility f = open('%s/%s.desc' % (imagepath, filename2), 'w') if not re.search(r'</mediawiki>', xmlfiledesc): #<text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text> #failure when retrieving desc? then save it as empty .desc xmlfiledesc = '' - f.write(xmlfiledesc) + f.write(xmlfiledesc.encode('utf-8')) f.close() - delay(config=config) + delay(config=config, session=session) c += 1 if c % 10 == 0: print ' Downloaded %d images' % (c) print 'Downloaded %d images' % (c) -def saveLogs(config={}): +def saveLogs(config={}, session=None): """ Save Special:Log """ #get all logs from Special:Log """parse @@ -793,9 +696,9 @@ def saveLogs(config={}): <option value="">Todos los registros</option> </select> """ - delay(config=config) + delay(config=config, session=session) -def domain2prefix(config={}): +def domain2prefix(config={}, session=None): """ Convert domain name to a valid prefix filename. 
""" # At this point, both api and index are supposed to be defined @@ -966,6 +869,15 @@ def getParameters(params=[]): else: index = args.index + cj = cookielib.MozillaCookieJar() + if args.cookies: + cj.load(args.cookies) + print 'Using cookies from %s' % args.cookies + + session = requests.Session() + session.cookies = cj + session.headers = {'User-Agent': getUserAgent()} + config = { 'curonly': args.curonly, 'date': datetime.datetime.now().strftime('%Y%m%d'), @@ -984,18 +896,12 @@ def getParameters(params=[]): 'resume': args.resume, 'filenamelimit': 100, #do not change 'force': args.force, + 'session': session } - - if config['cookies']: - cj = cookielib.MozillaCookieJar() - cj.load(config['cookies']) - opener = urllib2.build_opener(POSTHTTPRedirectHandler, urllib2.HTTPCookieProcessor(cj)) - urllib2.install_opener(opener) - print 'Using cookies from %s' % config['cookies'] if config['api']: #check api.php - if checkAPI(config['api'], config): + if checkAPI(config['api'], config, session=other['session']): print 'api.php is OK' else: print 'Error in api.php, please, provide a correct path to api.php' @@ -1003,7 +909,7 @@ def getParameters(params=[]): if config['index']: #check index.php - if checkIndexphp(config['index'], config): + if checkIndexphp(config['index'], config, session=other['session']): print 'index.php is OK' else: print 'Error in index.php, please, provide a correct path to index.php' @@ -1011,39 +917,29 @@ def getParameters(params=[]): #calculating path, if not defined by user with --path= if not config['path']: - config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config), config['date']) + config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config, session=session), config['date']) return config, other -def checkAPI(api, config={}): +def checkAPI(api, config={}, session=None): """ Checking API availability """ - req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}) - f = urllib2.urlopen(req) - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - resultText = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() - else: - resultText = f.read() - f.close() + global cj + r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()}) + resultText = r.text print 'Checking api.php...', api if "MediaWiki API is not enabled for this site." 
in resultText: return False result = json.loads(resultText) - delay(config=config) + delay(config=config, session=session) if result.has_key('query'): return True return False -def checkIndexphp(indexphp, config={}): +def checkIndexphp(indexphp, config={}, session=None): """ Checking index.php availability """ - - req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}) - f = urllib2.urlopen(req) - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() - else: - raw = f.read() - f.close() - delay(config=config) + r = session.post(url=indexphp, data={'title': 'Special:Version'}, headers={'User-Agent': getUserAgent()}) + raw = r.text + delay(config=config, session=session) print 'Checking index.php...', indexphp if re.search(r'(Special:Badtitle</a>|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required</h1>)', raw) and not config['cookies']: # Workaround for issue 71 print "ERROR: This wiki requires login and we are not authenticated" @@ -1062,7 +958,7 @@ def removeIP(raw=''): return raw -def checkXMLIntegrity(config={}): +def checkXMLIntegrity(config={}, session=None): """ Check XML dump integrity, to detect broken XML chunks """ return @@ -1072,7 +968,7 @@ def checkXMLIntegrity(config={}): checkpageclose = 0 checkrevisionopen = 0 checkrevisionclose = 0 - for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines(): + for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines(): if "<revision>" in line: checkrevisionopen += 1 elif "</revision>" in line: @@ -1103,19 +999,19 @@ def createNewDump(config={}, other={}): images = [] print 'Trying generating a new dump into a new directory...' 
if config['xml']: - titles += getPageTitles(config=config) + titles += getPageTitles(config=config, session=other['session']) saveTitles(config=config, titles=titles) - generateXMLDump(config=config, titles=titles) + generateXMLDump(config=config, titles=titles, session=other['session']) checkXMLIntegrity(config=config) if config['images']: if config['api']: - images += getImageFilenamesURLAPI(config=config) + images += getImageFilenamesURLAPI(config=config, session=other['session']) else: - images += getImageFilenamesURL(config=config) - saveImageFilenamesURL(config=config, images=images) - generateImageDump(config=config, other=other, images=images) + images += getImageFilenamesURL(config=config, session=other['session']) + saveImageFilenamesURL(config=config, images=images, session=other['session']) + generateImageDump(config=config, other=other, images=images, session=other['session']) if config['logs']: - saveLogs(config=config) + saveLogs(config=config, session=session) def resumePreviousDump(config={}, other={}): titles = [] @@ -1125,7 +1021,7 @@ def resumePreviousDump(config={}, other={}): #load titles lasttitle = '' try: - f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r') + f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r') raw = unicode(f.read(), 'utf-8') titles = raw.split('\n') lasttitle = titles[-1] @@ -1140,13 +1036,13 @@ def resumePreviousDump(config={}, other={}): else: print 'Title list is incomplete. Reloading...' #do not resume, reload, to avoid inconsistences, deleted pages or so - titles = getPageTitles(config=config) + titles = getPageTitles(config=config, session=other['session']) saveTitles(config=config, titles=titles) #checking xml dump xmliscomplete = False lastxmltitle = '' try: - f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history'), 'r') + f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r') for l in f: if re.findall('</mediawiki>', l): #xml dump is complete @@ -1176,7 +1072,7 @@ def resumePreviousDump(config={}, other={}): #load images lastimage = '' try: - f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r') + f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r') raw = unicode(f.read(), 'utf-8').strip() lines = raw.split('\n') for l in lines: @@ -1192,9 +1088,9 @@ def resumePreviousDump(config={}, other={}): print 'Image list is incomplete. Reloading...' 
#do not resume, reload, to avoid inconsistences, deleted images or so if config['api']: - images=getImageFilenamesURLAPI(config=config) + images=getImageFilenamesURLAPI(config=config, session=session) else: - images = getImageFilenamesURL(config=config) + images = getImageFilenamesURL(config=config, session=session) saveImageFilenamesURL(config=config, images=images) #checking images directory listdir = [] @@ -1228,47 +1124,37 @@ def resumePreviousDump(config={}, other={}): #fix pass -def saveSpecialVersion(config={}): +def saveSpecialVersion(config={}, session=None): """ Save Special:Version as .html, to preserve extensions details """ if os.path.exists('%s/Special:Version.html' % (config['path'])): print 'Special:Version.html exists, do not overwrite' else: print 'Downloading Special:Version with extensions and other related info' - req = urllib2.Request(url=config['index'], data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}) - f = urllib2.urlopen(req) - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() - else: - raw = f.read() - f.close() - delay(config=config) + r = session.post(url=config['index'], data={'title': 'Special:Version', }, headers={'User-Agent': getUserAgent()}) + raw = r.text + delay(config=config, session=session) raw = removeIP(raw=raw) f = open('%s/Special:Version.html' % (config['path']), 'w') - f.write(raw) + f.write(raw.encode('utf-8')) f.close() -def saveIndexPHP(config={}): +def saveIndexPHP(config={}, session=None): """ Save index.php as .html, to preserve license details available at the botom of the page """ if os.path.exists('%s/index.html' % (config['path'])): print 'index.html exists, do not overwrite' else: print 'Downloading index.php (Main Page) as index.html' - req = urllib2.Request(url=config['index'], data=urllib.urlencode({}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}) - f = urllib2.urlopen(req) - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() - else: - raw = f.read() - f.close() - delay(config=config) + r = session.post(url=config['index'], data={}, headers={'User-Agent': getUserAgent()}) + raw = r.text + delay(config=config, session=session) raw = removeIP(raw=raw) f = open('%s/index.html' % (config['path']), 'w') - f.write(raw) + f.write(raw.encode('utf-8')) f.close() -def saveSiteInfo(config={}): +def saveSiteInfo(config={}, session=None): """ Save a file with site info """ if config['api']: @@ -1276,14 +1162,9 @@ def saveSiteInfo(config={}): print 'siteinfo.json exists, do not overwrite' else: print 'Downloading site info as siteinfo.json' - req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'}) - f = urllib2.urlopen(req) - if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'): - result = json.loads(gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()) - else: - result = json.loads(f.read()) - f.close() - delay(config=config) + r = session.post(url=config['api'], data = {'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()}) + result = json.loads(r.text) + delay(config=config, session=session) f = open('%s/siteinfo.json' % (config['path']), 
'w') f.write(json.dumps(result, indent=4, sort_keys=True)) f.close() @@ -1324,6 +1205,7 @@ def main(params=[]): """ Main function """ configfilename = 'config.txt' + session = requests.Session() config, other = getParameters(params=params) avoidWikimediaProjects(config=config, other=other) @@ -1364,9 +1246,9 @@ def main(params=[]): else: createNewDump(config=config, other=other) - saveIndexPHP(config=config) - saveSpecialVersion(config=config) - saveSiteInfo(config=config) + saveIndexPHP(config=config, session=session) + saveSpecialVersion(config=config, session=session) + saveSiteInfo(config=config, session=session) bye() if __name__ == "__main__": From 9aa3c4a0e1d8ecb8ade5814b9f6f226a418d368c Mon Sep 17 00:00:00 2001 From: balr0g <balrog032@gmail.com> Date: Tue, 1 Jul 2014 13:26:57 -0400 Subject: [PATCH 2/3] Removed all traces of urllib except for encode/decode; more bugs fixed. --- dumpgenerator.py | 129 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 87 insertions(+), 42 deletions(-) diff --git a/dumpgenerator.py b/dumpgenerator.py index 1ad7065..ff5e952 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -22,7 +22,12 @@ import cookielib import cPickle import datetime -import argparse +import sys +try: + import argparse +except ImportError: + print "Please install the argparse module." + sys.exit(1) import json try: from hashlib import md5 @@ -30,9 +35,12 @@ except ImportError: # Python 2.4 compatibility from md5 import new as md5 import os import re -import requests +try: + import requests +except ImportError: + print "Please install or update the Requests module." + sys.exit(1) import subprocess -import sys import time import urllib @@ -73,13 +81,45 @@ def cleanHTML(raw=''): sys.exit() return raw +def handleStatusCode(response): + statuscode = response.status_code + if statuscode >= 200 and statuscode < 300: + return + + print "HTTP Error %d." % statuscode + if statuscode >= 300 and statuscode < 400: + print "Redirect should happen automatically: please report this as a bug." + print response.url + + elif statuscode == 400: + print "Bad Request: The wiki may be malfunctioning." + print "Please try again later." + print response.url + sys.exit(1) + + elif statuscode == 401 or statuscode == 403: + print "Authentication required." + print "Please use --userpass." + print response.url + + elif statuscode == 404: + print "Not found. Is Special:Export enabled for this wiki?" + print response.url + sys.exit(1) + + elif statuscode == 429 or (statuscode >= 500 and statuscode < 600): + print "Server error, max retries exceeded." + print "Please resume the dump later." 
+ print response.url + sys.exit(1) + def getNamespacesScraper(config={}, session=None): """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """ """ Function called if no API is available """ namespaces = config['namespaces'] namespacenames = {0:''} # main is 0, no prefix if namespaces: - r = session.post(url=config['index'], data={'title': 'Special:Allpages', }, headers={'User-Agent': getUserAgent()}) + r = session.post(url=config['index'], data={'title': 'Special:Allpages'}) raw = r.text delay(config=config, session=session) @@ -109,7 +149,7 @@ def getNamespacesAPI(config={}, session=None): namespaces = config['namespaces'] namespacenames = {0:''} # main is 0, no prefix if namespaces: - r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}, headers={'User-Agent': getUserAgent()}) + r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}) result = json.loads(r.text) delay(config=config, session=session) @@ -148,12 +188,12 @@ def getPageTitlesAPI(config={}, session=None): c = 0 print ' Retrieving titles in the namespace %d' % (namespace) - headers = {'User-Agent': getUserAgent()} apfrom = '!' while apfrom: sys.stderr.write('.') #progress params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500} - r = session.post(url=config['api'], data=params, headers=headers) + r = session.post(url=config['api'], data=params) + handleStatusCode(r) #FIXME Handle HTTP errors here! jsontitles = json.loads(r.text) apfrom = '' @@ -182,7 +222,7 @@ def getPageTitlesScraper(config={}, session=None): for namespace in namespaces: print ' Retrieving titles in the namespace', namespace url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace) - r = session.get(url=url, headers={'User-Agent': getUserAgent()}) + r = session.get(url=url) raw = r.text raw = cleanHTML(raw) @@ -219,7 +259,7 @@ def getPageTitlesScraper(config={}, session=None): if not name in checked_suballpages: checked_suballpages.append(name) #to avoid reload dupe subpages links delay(config=config, session=session) - r2 = session.get(url=url, headers={'User-Agent': getUserAgent()}) + r2 = session.get(url=url) raw2 = r2.text raw2 = cleanHTML(raw2) rawacum += raw2 #merge it after removed junk @@ -327,6 +367,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None): return '' # empty xml #FIXME HANDLE HTTP Errors HERE r = session.post(url=config['index'], data=params, headers=headers) + handleStatusCode(r) xml = r.text c += 1 @@ -343,8 +384,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None): title_ = title title_ = re.sub(' ', '_', title_) #do not convert & into %26, title_ = re.sub('&', '%26', title_) - headers = {'User-Agent': getUserAgent()} - params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit', } + params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'} if config['curonly']: params['curonly'] = 1 params['limit'] = 1 @@ -354,7 +394,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None): if config.has_key('templates') and config['templates']: #in other case, do not set params['templates'] params['templates'] = 1 - xml = getXMLPageCore(headers=headers, params=params, config=config, session=session) + xml = getXMLPageCore(params=params, config=config, session=session) #if complete 
history, check if this page history has > limit edits, if so, retrieve all using offset if available #else, warning about Special:Export truncating large page histories @@ -362,7 +402,7 @@ def getXMLPage(config={}, title='', verbose=True, session=None): if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one while not truncated and params['offset']: #next chunk params['offset'] = re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML - xml2 = getXMLPageCore(headers=headers, params=params, config=config, session=session) + xml2 = getXMLPageCore(params=params, config=config, session=session) if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no <page></page>? if re.findall(r_timestamp, xml2)[-1] == params['offset']: @@ -498,7 +538,7 @@ def getImageFilenamesURL(config={}, session=None): retries = 5 while offset: #5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset, }, headers={'User-Agent': getUserAgent()}) + r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset}) raw = r.text delay(config=config, session=session) if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki @@ -573,14 +613,14 @@ def getImageFilenamesURLAPI(config={}, session=None): """ Retrieve file list: filename, url, uploader """ print 'Retrieving image filenames' - headers = {'User-Agent': getUserAgent()} aifrom = '!' images = [] while aifrom: sys.stderr.write('.') #progress params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500} #FIXME Handle HTTP Errors HERE - r = session.post(url=config['api'], data=params, headers=headers) + r = session.post(url=config['api'], data=params) + handleStatusCode(r) jsonimages = json.loads(r.text) delay(config=config, session=session) aifrom = '' @@ -651,18 +691,11 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None): # split last . (extension) and then merge filename2 = truncateFilename(other=other, filename=filename2) print 'Filename is too long, truncating. Now it is:', filename2 - # We need to set the user agent for urlretrieve but we can't do it in its call - # so we just override the class here; all I know about this method comes from - # http://docs.python.org/2/library/urllib.html#urllib._urlopener , - # http://docs.python.org/2/tutorial/classes.html#class-definition-syntax . - # TODO: Override the user agent for all functions in a more sensible place. - class URLopenerUserAgent(urllib.FancyURLopener): - version = "%s" % getUserAgent() - urllib._urlopener = URLopenerUserAgent() filename3 = u'%s/%s' % (imagepath, filename2) - urllib.urlretrieve(url=url, filename=filename3.encode('utf-8')) - # TODO: data=urllib.urlencode({}) removed image; request fails on wikipedia and POST neither works? 
- + imagefile = open(filename3, 'wb') + r = requests.get(url=url) + imagefile.write(r.content) + imagefile.close() #saving description if any xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility f = open('%s/%s.desc' % (imagepath, filename2), 'w') @@ -787,8 +820,9 @@ def getParameters(params=[]): parser.add_argument('-v', '--version', action='version', version=getVersion()) parser.add_argument('--cookies', metavar="cookies.txt", help="path to a cookies.txt file") parser.add_argument('--delay', metavar=5, default=0, help="adds a delay (in seconds)") + parser.add_argument('--retries', metavar=5, default=5, help="Maximum number of retries for ") parser.add_argument('--get-wiki-engine', action='store_true', help="returns the wiki engine") - + groupWikiOrAPIOrIndex = parser.add_mutually_exclusive_group(required=True) groupWikiOrAPIOrIndex.add_argument('wiki', default='', nargs='?', help="URL to wiki") groupWikiOrAPIOrIndex.add_argument('--api', help="URL to api.php") @@ -806,6 +840,9 @@ def getParameters(params=[]): parser.add_argument('--namespaces', metavar="1,2,3", help='comma-separated value of namespaces to include (all by default)') parser.add_argument('--exnamespaces', metavar="1,2,3", help='comma-separated value of namespaces to exclude') + parser.add_argument('--user', help='Username if authentication is required.') + parser.add_argument('--pass', dest='password', help='Password if authentication is required.') + args = parser.parse_args() #print args @@ -827,6 +864,12 @@ def getParameters(params=[]): print 'ERROR: URL to index.php must start with http:// or https://\n' parser.print_usage() sys.exit(1) + + # check user and pass (one requires both) + if (args.user and not args.password) or (args.password and not args.user): + print 'Both --user and --pass are required for authentication.' + parser.print_usage() + sys.exit(1) namespaces = ['all'] exnamespaces = [] @@ -877,6 +920,9 @@ def getParameters(params=[]): session = requests.Session() session.cookies = cj session.headers = {'User-Agent': getUserAgent()} + if args.user and args.password: + session.auth = (args.user, args.password) + #session.mount(args.api.split('/api.php')[0], HTTPAdapter(max_retries=max_ret)) config = { 'curonly': args.curonly, @@ -924,7 +970,7 @@ def getParameters(params=[]): def checkAPI(api, config={}, session=None): """ Checking API availability """ global cj - r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()}) + r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'}) resultText = r.text print 'Checking api.php...', api if "MediaWiki API is not enabled for this site." 
in resultText: @@ -937,7 +983,7 @@ def checkAPI(api, config={}, session=None): def checkIndexphp(indexphp, config={}, session=None): """ Checking index.php availability """ - r = session.post(url=indexphp, data={'title': 'Special:Version'}, headers={'User-Agent': getUserAgent()}) + r = session.post(url=indexphp, data={'title': 'Special:Version'}) raw = r.text delay(config=config, session=session) print 'Checking index.php...', indexphp @@ -1021,7 +1067,7 @@ def resumePreviousDump(config={}, other={}): #load titles lasttitle = '' try: - f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r') + f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(config=config, session=other['session']), config['date']), 'r') raw = unicode(f.read(), 'utf-8') titles = raw.split('\n') lasttitle = titles[-1] @@ -1042,7 +1088,7 @@ def resumePreviousDump(config={}, other={}): xmliscomplete = False lastxmltitle = '' try: - f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r') + f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=other['session']), config['date'], config['curonly'] and 'current' or 'history'), 'r') for l in f: if re.findall('</mediawiki>', l): #xml dump is complete @@ -1062,17 +1108,17 @@ def resumePreviousDump(config={}, other={}): elif lastxmltitle: #resuming... print 'Resuming XML dump from "%s"' % (lastxmltitle) - generateXMLDump(config=config, titles=titles, start=lastxmltitle) + generateXMLDump(config=config, titles=titles, start=lastxmltitle, session=other['session']) else: #corrupt? only has XML header? print 'XML is corrupt? Regenerating...' - generateXMLDump(config=config, titles=titles) + generateXMLDump(config=config, titles=titles, session=other['session']) if config['images']: #load images lastimage = '' try: - f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config, session=session), config['date']), 'r') + f = open('%s/%s-%s-images.txt' % (config['path'], domain2prefix(config=config), config['date']), 'r') raw = unicode(f.read(), 'utf-8').strip() lines = raw.split('\n') for l in lines: @@ -1088,9 +1134,9 @@ def resumePreviousDump(config={}, other={}): print 'Image list is incomplete. Reloading...' 
#do not resume, reload, to avoid inconsistences, deleted images or so if config['api']: - images=getImageFilenamesURLAPI(config=config, session=session) + images=getImageFilenamesURLAPI(config=config, session=other['session']) else: - images = getImageFilenamesURL(config=config, session=session) + images = getImageFilenamesURL(config=config, session=other['session']) saveImageFilenamesURL(config=config, images=images) #checking images directory listdir = [] @@ -1118,7 +1164,7 @@ def resumePreviousDump(config={}, other={}): #image dump is complete print 'Image dump was completed in the previous session' else: - generateImageDump(config=config, other=other, images=images, start=lastfilename2) # we resume from previous image, which may be corrupted (or missing .desc) by the previous session ctrl-c or abort + generateImageDump(config=config, other=other, images=images, start=lastfilename2, session=other['session']) # we resume from previous image, which may be corrupted (or missing .desc) by the previous session ctrl-c or abort if config['logs']: #fix @@ -1131,7 +1177,7 @@ def saveSpecialVersion(config={}, session=None): print 'Special:Version.html exists, do not overwrite' else: print 'Downloading Special:Version with extensions and other related info' - r = session.post(url=config['index'], data={'title': 'Special:Version', }, headers={'User-Agent': getUserAgent()}) + r = session.post(url=config['index'], data={'title': 'Special:Version'}) raw = r.text delay(config=config, session=session) raw = removeIP(raw=raw) @@ -1146,7 +1192,7 @@ def saveIndexPHP(config={}, session=None): print 'index.html exists, do not overwrite' else: print 'Downloading index.php (Main Page) as index.html' - r = session.post(url=config['index'], data={}, headers={'User-Agent': getUserAgent()}) + r = session.post(url=config['index'], data={}) raw = r.text delay(config=config, session=session) raw = removeIP(raw=raw) @@ -1162,7 +1208,7 @@ def saveSiteInfo(config={}, session=None): print 'siteinfo.json exists, do not overwrite' else: print 'Downloading site info as siteinfo.json' - r = session.post(url=config['api'], data = {'action': 'query', 'meta': 'siteinfo', 'format': 'json'}, headers={'User-Agent': getUserAgent()}) + r = session.post(url=config['api'], data = {'action': 'query', 'meta': 'siteinfo', 'format': 'json'}) result = json.loads(r.text) delay(config=config, session=session) f = open('%s/siteinfo.json' % (config['path']), 'w') @@ -1205,7 +1251,6 @@ def main(params=[]): """ Main function """ configfilename = 'config.txt' - session = requests.Session() config, other = getParameters(params=params) avoidWikimediaProjects(config=config, other=other) From 1391cf893b71863360d71abb3377188009d5324e Mon Sep 17 00:00:00 2001 From: balr0g <balrog032@gmail.com> Date: Tue, 1 Jul 2014 14:04:56 -0400 Subject: [PATCH 3/3] Update requirements file --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 3973384..a5ff9bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ argparse>=1.2.1 +requests>=2.3.0
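
The common thread of the series above is replacing per-request urllib2 plumbing with one requests.Session built in getParameters(): cookies loaded from a MozillaCookieJar, a fixed User-Agent, optional HTTP basic auth, a handleStatusCode() check on responses, and an HTTPAdapter(max_retries=...) mount that patch 2 leaves commented out. Below is a minimal standalone sketch of that pattern, assuming requests>=2.3.0 as pinned in requirements.txt; the wiki URL, cookie path, credentials and retry count are placeholders, not values taken from the patches.

# Sketch only: the session setup these patches converge on, not code from the series.
# 'http://wiki.example.org', 'cookies.txt', the credentials and the retry count are placeholders.
import cookielib
import os

import requests
from requests.adapters import HTTPAdapter

api = 'http://wiki.example.org/api.php'   # placeholder API endpoint
cookiespath = 'cookies.txt'               # placeholder, as passed with --cookies
retries = 5                               # placeholder, cf. the --retries option

cj = cookielib.MozillaCookieJar()
if os.path.exists(cookiespath):
    cj.load(cookiespath)                  # reuse an existing login session

session = requests.Session()
session.cookies = cj
session.headers = {'User-Agent': 'dumpgenerator sketch'}   # set once, not per request
session.auth = ('user', 'password')       # only if the wiki needs --user/--pass
# Transport-level retries; patch 2 leaves the equivalent session.mount() call commented out.
session.mount('http://wiki.example.org', HTTPAdapter(max_retries=retries))

r = session.post(url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
if not (200 <= r.status_code < 300):      # same check handleStatusCode() starts with
    print 'HTTP Error %d on %s' % (r.status_code, r.url)
else:
    print r.text[:200]                    # siteinfo JSON, decoded to text by Requests

Configuring headers, cookies and auth on the session once is what lets patch 2 drop the per-call headers={'User-Agent': getUserAgent()} arguments throughout dumpgenerator.py.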