diff --git a/dumpgenerator.py b/dumpgenerator.py index 2c4f743..7514e6a 100644 --- a/dumpgenerator.py +++ b/dumpgenerator.py @@ -7,12 +7,12 @@ # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program. If not, see . @@ -40,11 +40,11 @@ try: except ImportError: print "Please install or update the Requests module." sys.exit(1) -import subprocess import time import urllib -__VERSION__ = '0.2.2' #major, minor, micro +__VERSION__ = '0.2.2' # major, minor, micro + def getVersion(): return(__VERSION__) @@ -54,23 +54,28 @@ def truncateFilename(other={}, filename=''): """ Truncate filenames when downloading images with large filenames """ return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1] + def delay(config={}, session=None): """ Add a delay if configured for that """ if config['delay'] > 0: print 'Sleeping... %d seconds...' % (config['delay']) time.sleep(config['delay']) + def cleanHTML(raw=''): """ Extract only the real wiki content and remove rubbish """ """ This function is ONLY used to retrieve page titles and file names when no API is available """ """ DO NOT use this function to extract page content """ - #different "tags" used by different MediaWiki versions to mark where starts and ends content + # different "tags" used by different MediaWiki versions to mark where + # starts and ends content if re.search('', raw): raw = raw.split('')[1].split('')[0] elif re.search('', raw): - raw = raw.split('')[1].split('')[0] + raw = raw.split( + '')[1].split('')[0] elif re.search('', raw): - raw = raw.split('')[1].split('')[0] + raw = raw.split( + '')[1].split('')[0] elif re.search('', raw): raw = raw.split('')[1].split('')[0] elif re.search('
', raw): @@ -83,6 +88,7 @@ def cleanHTML(raw=''): sys.exit() return raw + def handleStatusCode(response): statuscode = response.status_code if statuscode >= 200 and statuscode < 300: @@ -115,58 +121,66 @@ def handleStatusCode(response): print response.url sys.exit(1) + def getNamespacesScraper(config={}, session=None): """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages """ """ Function called if no API is available """ namespaces = config['namespaces'] - namespacenames = {0:''} # main is 0, no prefix + namespacenames = {0: ''} # main is 0, no prefix if namespaces: - r = session.post(url=config['index'], data={'title': 'Special:Allpages'}) + r = session.post( + url=config['index'], data={'title': 'Special:Allpages'}) raw = r.text delay(config=config, session=session) - m = re.compile(r'').finditer(raw) # [^>]*? to include selected="selected" + # [^>]*? to include selected="selected" + m = re.compile( + r'').finditer(raw) if 'all' in namespaces: namespaces = [] for i in m: namespaces.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") + namespacenames[int(i.group("namespaceid"))] = i.group( + "namespacename") else: - #check if those namespaces really exist in this wiki + # check if those namespaces really exist in this wiki namespaces2 = [] for i in m: if int(i.group("namespaceid")) in namespaces: namespaces2.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") + namespacenames[int(i.group("namespaceid"))] = i.group( + "namespacename") namespaces = namespaces2 else: namespaces = [0] - - namespaces = list(set(namespaces)) #uniques + + namespaces = list(set(namespaces)) # uniques print '%d namespaces found' % (len(namespaces)) return namespaces, namespacenames - + + def getNamespacesAPI(config={}, session=None): """ Uses the API to get the list of namespaces names and ids """ namespaces = config['namespaces'] - namespacenames = {0:''} # main is 0, no prefix + namespacenames = {0: ''} # main is 0, no prefix if namespaces: - r = session.post(url=config['api'], data={'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}) + r = session.post(url=config['api'], data={ + 'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'}) result = json.loads(r.text) delay(config=config, session=session) if 'all' in namespaces: namespaces = [] for i in result['query']['namespaces'].keys(): - if int(i) < 0: # -1: Special, -2: Media, excluding + if int(i) < 0: # -1: Special, -2: Media, excluding continue namespaces.append(int(i)) namespacenames[int(i)] = result['query']['namespaces'][i]['*'] else: - #check if those namespaces really exist in this wiki + # check if those namespaces really exist in this wiki namespaces2 = [] for i in result['query']['namespaces'].keys(): - if int(i) < 0: # -1: Special, -2: Media, excluding + if int(i) < 0: # -1: Special, -2: Media, excluding continue if int(i) in namespaces: namespaces2.append(int(i)) @@ -174,41 +188,46 @@ def getNamespacesAPI(config={}, session=None): namespaces = namespaces2 else: namespaces = [0] - - namespaces = list(set(namespaces)) #uniques + + namespaces = list(set(namespaces)) # uniques print '%d namespaces found' % (len(namespaces)) return namespaces, namespacenames + def getPageTitlesAPI(config={}, session=None): """ Uses the API to get the list of page titles """ titles = [] - namespaces, namespacenames = getNamespacesAPI(config=config, 
session=session) + namespaces, namespacenames = getNamespacesAPI( + config=config, session=session) for namespace in namespaces: if namespace in config['exnamespaces']: print ' Skipping namespace = %d' % (namespace) continue - + c = 0 print ' Retrieving titles in the namespace %d' % (namespace) apfrom = '!' while apfrom: - sys.stderr.write('.') #progress - params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500} + sys.stderr.write('.') # progress + params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace, + 'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500} r = session.post(url=config['api'], data=params) handleStatusCode(r) - #FIXME Handle HTTP errors here! + # FIXME Handle HTTP errors here! jsontitles = json.loads(r.text) apfrom = '' - if jsontitles.has_key('query-continue') and jsontitles['query-continue'].has_key('allpages'): - if jsontitles['query-continue']['allpages'].has_key('apcontinue'): - apfrom = jsontitles['query-continue']['allpages']['apcontinue'] - elif jsontitles['query-continue']['allpages'].has_key('apfrom'): + if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']: + if 'apcontinue' in jsontitles['query-continue']['allpages']: + apfrom = jsontitles['query-continue']['allpages']['apcontinue'] + elif 'apfrom' in jsontitles['query-continue']['allpages']: apfrom = jsontitles['query-continue']['allpages']['apfrom'] - #print apfrom - #print jsontitles - titles += [page['title'] for page in jsontitles['query']['allpages']] + # print apfrom + # print jsontitles + titles += [page['title'] + for page in jsontitles['query']['allpages']] if len(titles) != len(set(titles)): - #probably we are in a loop, server returning dupe titles, stop it + # probably we are in a loop, server returning dupe titles, stop + # it print 'Probably a loop, finishing' titles = list(set(titles)) apfrom = '' @@ -217,17 +236,20 @@ def getPageTitlesAPI(config={}, session=None): print ' %d titles retrieved in the namespace %d' % (c, namespace) return titles + def getPageTitlesScraper(config={}, session=None): """ """ titles = [] - namespaces, namespacenames = getNamespacesScraper(config=config, session=session) + namespaces, namespacenames = getNamespacesScraper( + config=config, session=session) for namespace in namespaces: print ' Retrieving titles in the namespace', namespace - url = '%s?title=Special:Allpages&namespace=%s' % (config['index'], namespace) + url = '%s?title=Special:Allpages&namespace=%s' % ( + config['index'], namespace) r = session.get(url=url) raw = r.text raw = cleanHTML(raw) - + r_title = r'title="(?P[^>]+)">' r_suballpages = '' r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">' @@ -237,177 +259,212 @@ def getPageTitlesScraper(config={}, session=None): elif re.search(r_suballpages2, raw): r_suballpages = r_suballpages2 else: - pass #perhaps no subpages - - deep = 3 # 3 is the current deep of English Wikipedia for Special:Allpages, 3 levels + pass # perhaps no subpages + + # 3 is the current deep of English Wikipedia for Special:Allpages, 3 + # levels + deep = 3 c = 0 checked_suballpages = [] rawacum = raw while r_suballpages and re.search(r_suballpages, raw) and c < deep: - #load sub-Allpages + # load sub-Allpages m = re.compile(r_suballpages).finditer(raw) for i in m: fr = i.group('from') - + if r_suballpages == r_suballpages1: to = i.group('to') name = '%s-%s' % (fr, to) - url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' 
% (config['index'], namespace, fr, to) #do not put urllib.quote in fr or to - elif r_suballpages == r_suballpages2: #fix, esta regexp no carga bien todas? o falla el r_title en este tipo de subpag? (wikiindex) - fr = fr.split('&namespace=')[0] #clean &namespace=\d, sometimes happens + url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % ( + config['index'], namespace, fr, to) # do not put urllib.quote in fr or to + # fix, esta regexp no carga bien todas? o falla el r_title en + # este tipo de subpag? (wikiindex) + elif r_suballpages == r_suballpages2: + # clean &namespace=\d, sometimes happens + fr = fr.split('&namespace=')[0] name = fr - url = '%s?title=Special:Allpages/%s&namespace=%s' % (config['index'], name, namespace) - - if not name in checked_suballpages: - checked_suballpages.append(name) #to avoid reload dupe subpages links + url = '%s?title=Special:Allpages/%s&namespace=%s' % ( + config['index'], name, namespace) + + if name not in checked_suballpages: + # to avoid reload dupe subpages links + checked_suballpages.append(name) delay(config=config, session=session) r2 = session.get(url=url) raw2 = r2.text raw2 = cleanHTML(raw2) - rawacum += raw2 #merge it after removed junk + rawacum += raw2 # merge it after removed junk print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages' delay(config=config, session=session) c += 1 - + c = 0 m = re.compile(r_title).finditer(rawacum) for i in m: t = undoHTMLEntities(text=i.group('title')) if not t.startswith('Special:'): - if not t in titles: + if t not in titles: titles.append(t) c += 1 print ' %d titles retrieved in the namespace %d' % (c, namespace) return titles + def getPageTitles(config={}, session=None): """ Get list of page titles """ - #http://en.wikipedia.org/wiki/Special:AllPages - #http://archiveteam.org/index.php?title=Special:AllPages - #http://www.wikanda.es/wiki/Especial:Todas + # http://en.wikipedia.org/wiki/Special:AllPages + # http://archiveteam.org/index.php?title=Special:AllPages + # http://www.wikanda.es/wiki/Especial:Todas print 'Loading page titles from namespaces = %s' % (config['namespaces'] and ','.join([str(i) for i in config['namespaces']]) or 'None') print 'Excluding titles from namespaces = %s' % (config['exnamespaces'] and ','.join([str(i) for i in config['exnamespaces']]) or 'None') - + titles = [] if config['api']: titles = getPageTitlesAPI(config=config, session=session) elif config['index']: titles = getPageTitlesScraper(config=config, session=session) - - titles = list(set(titles)) #removing dupes (e.g. in CZ appears Widget:AddThis two times (main namespace and widget namespace)) - titles.sort() #sorting - + + # removing dupes (e.g. in CZ appears Widget:AddThis two times (main + # namespace and widget namespace)) + titles = list(set(titles)) + titles.sort() # sorting + print '%d page titles loaded' % (len(titles)) return titles + def getXMLHeader(config={}, session=None): """ Retrieve a random page to extract XML headers (namespace info, etc) """ - #get the header of a random page, to attach it in the complete XML backup - #similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" xmlns:x.... - randomtitle = 'Main_Page' #previously AMF5LKE43MNFGHKSDMRTJ - xml = getXMLPage(config=config, title=randomtitle, verbose=False, session=session) + # get the header of a random page, to attach it in the complete XML backup + # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" + # xmlns:x.... 
+ randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ + xml = getXMLPage( + config=config, title=randomtitle, verbose=False, session=session) header = xml.split('</mediawiki>')[0] if not xml: print 'XML export on this wiki is broken, quitting.' sys.exit() return header + def getXMLFileDesc(config={}, title='', session=None): """ Get XML for image description page """ - config['curonly'] = 1 #tricky to get only the most recent desc + config['curonly'] = 1 # tricky to get only the most recent desc return getXMLPage(config=config, title=title, verbose=False, session=session) + def getUserAgent(): """ Return a cool user-agent to hide Python user-agent """ useragents = [ - #firefox - 'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0', + # firefox + 'Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Firefox/24.0', 'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0', - ] + ] return useragents[0] + def logerror(config={}, text=''): """ Log error in file """ if text: with open('%s/errors.log' % (config['path']), 'a') as outfile: - output = u'%s: %s\n' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text) + output = u'%s: %s\n' % ( + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text) outfile.write(output.encode('utf-8')) + def getXMLPageCore(headers={}, params={}, config={}, session=None): """ """ - #returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki> - #if retrieving params['limit'] revisions fails, returns a current only version - #if all fail, returns the empty string + # returns a XML containing params['limit'] revisions (or current only), ending in </mediawiki> + # if retrieving params['limit'] revisions fails, returns a current only version + # if all fail, returns the empty string xml = '' c = 0 - maxseconds = 100 #max seconds to wait in a single sleeping - maxretries = 5 # x retries and skip - increment = 20 #increment every retry + maxseconds = 100 # max seconds to wait in a single sleeping + maxretries = 5 # x retries and skip + increment = 20 # increment every retry while not re.search(r'</mediawiki>', xml): if c > 0 and c < maxretries: - wait = increment * c < maxseconds and increment * c or maxseconds # incremental until maxseconds + wait = increment * c < maxseconds and increment * \ + c or maxseconds # incremental until maxseconds print ' XML for "%s" is wrong. Waiting %d seconds and reloading...' % (params['pages'], wait) time.sleep(wait) - if params['limit'] > 1: # reducing server load requesting smallest chunks (if curonly then limit = 1 from mother function) - params['limit'] = params['limit'] / 2 # half + # reducing server load requesting smallest chunks (if curonly then + # limit = 1 from mother function) + if params['limit'] > 1: + params['limit'] = params['limit'] / 2 # half if c >= maxretries: print ' We have retried %d times' % (c) print ' MediaWiki error for "%s", network error or whatever...' % (params['pages']) # If it's not already what we tried: our last chance, preserve only the last revision... 
# config['curonly'] means that the whole dump is configured to save nonly the last - # params['curonly'] should mean that we've already tried this fallback, because it's set by the following if and passed to getXMLPageCore - if not config['curonly']: + # params['curonly'] should mean that we've already tried this + # fallback, because it's set by the following if and passed to + # getXMLPageCore + if not config['curonly']: print ' Trying to save only the last revision for this page...' params['curonly'] = 1 - logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (params['pages'])) + logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % ( + params['pages'])) return getXMLPageCore(headers=headers, params=params, config=config) else: print ' Saving in the errors log, and skipping...' - logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (params['pages'])) - return '' # empty xml - #FIXME HANDLE HTTP Errors HERE + logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % ( + params['pages'])) + return '' # empty xml + # FIXME HANDLE HTTP Errors HERE r = session.post(url=config['index'], data=params, headers=headers) handleStatusCode(r) xml = r.text c += 1 - + return xml + def getXMLPage(config={}, title='', verbose=True, session=None): """ Get the full history (or current only) of a page """ - #if server errors occurs while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partialy truncated - #http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F - + # if server errors occurs while retrieving the full page history, it may return [oldest OK versions] + last version, excluding middle revisions, so it would be partialy truncated + # http://www.mediawiki.org/wiki/Manual_talk:Parameters_to_Special:Export#Parameters_no_longer_in_use.3F + limit = 1000 truncated = False title_ = title title_ = re.sub(' ', '_', title_) - #do not convert & into %26, title_ = re.sub('&', '%26', title_) + # do not convert & into %26, title_ = re.sub('&', '%26', title_) params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'} if config['curonly']: params['curonly'] = 1 params['limit'] = 1 else: - params['offset'] = '1' # 1 always < 2000s + params['offset'] = '1' # 1 always < 2000s params['limit'] = limit - if config.has_key('templates') and config['templates']: #in other case, do not set params['templates'] + # in other case, do not set params['templates'] + if 'templates' in config and config['templates']: params['templates'] = 1 - + xml = getXMLPageCore(params=params, config=config, session=session) - #if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available - #else, warning about Special:Export truncating large page histories + # if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available + # else, warning about Special:Export truncating large page histories r_timestamp = r'<timestamp>([^<]+)</timestamp>' - if not config['curonly'] and re.search(r_timestamp, xml): # search for timestamps in xml to avoid analysing empty pages like Special:Allpages and the random one - while not truncated and params['offset']: #next chunk - params['offset'] = 
re.findall(r_timestamp, xml)[-1] #get the last timestamp from the acum XML - xml2 = getXMLPageCore(params=params, config=config, session=session) - - if re.findall(r_timestamp, xml2): #are there more edits in this next XML chunk or no <page></page>? + # search for timestamps in xml to avoid analysing empty pages like + # Special:Allpages and the random one + if not config['curonly'] and re.search(r_timestamp, xml): + while not truncated and params['offset']: # next chunk + # get the last timestamp from the acum XML + params['offset'] = re.findall(r_timestamp, xml)[-1] + xml2 = getXMLPageCore( + params=params, config=config, session=session) + + # are there more edits in this next XML chunk or no <page></page>? + if re.findall(r_timestamp, xml2): if re.findall(r_timestamp, xml2)[-1] == params['offset']: - #again the same XML, this wiki does not support params in Special:Export, offer complete XML up to X edits (usually 1000) + # again the same XML, this wiki does not support params in + # Special:Export, offer complete XML up to X edits (usually + # 1000) print 'ATTENTION: This wiki does not allow some parameters in Special:Export, therefore pages with large histories may be truncated' truncated = True break @@ -423,47 +480,53 @@ def getXMLPage(config={}, title='', verbose=True, session=None): <timestamp>2011-03-09T19:57:06Z</timestamp> <contributor> """ - #offset is OK in this wiki, merge with the previous chunk of this page history and continue - xml = xml.split('</page>')[0] + ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:])) + # offset is OK in this wiki, merge with the previous chunk + # of this page history and continue + xml = xml.split( + '</page>')[0] + ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:])) else: - params['offset'] = '' #no more edits in this page history - + params['offset'] = '' # no more edits in this page history + if verbose: numberofedits = len(re.findall(r_timestamp, xml)) if (numberofedits == 1): print ' %s, 1 edit' % (title) else: print ' %s, %d edits' % (title, numberofedits) - + return xml + def cleanXML(xml=''): """ Trim redundant info """ - #do not touch XML codification, leave AS IS + # do not touch XML codification, leave AS IS if re.search(r'</siteinfo>\n', xml) and re.search(r'</mediawiki>', xml): xml = xml.split('</siteinfo>\n')[1] xml = xml.split('</mediawiki>')[0] return xml + def generateXMLDump(config={}, titles=[], start='', session=None): """ Generates a XML dump for a list of titles """ - + print 'Retrieving the XML for every page from "%s"' % (start and start or 'start') header = getXMLHeader(config=config, session=session) - footer = '</mediawiki>\n' #new line at the end - xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), config['date'], config['curonly'] and 'current' or 'history') + footer = '</mediawiki>\n' # new line at the end + xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config), + config['date'], config['curonly'] and 'current' or 'history') xmlfile = '' lock = True if start: - #remove the last chunk of xml dump (it is probably incomplete) + # remove the last chunk of xml dump (it is probably incomplete) xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r') xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w') prev = '' c = 0 for l in xmlfile: - #removing <page>\n until end of file - if c != 0: #lock to avoid write an empty line at the begining of file - if not re.search(r'<title>%s' % (start), l): + # removing \n until end of file + # lock to avoid write an empty line 
at the begining of file + if c != 0: + if not re.search(r'%s' % (start), l): xmlfile2.write(prev) else: break @@ -471,22 +534,25 @@ def generateXMLDump(config={}, titles=[], start='', session=None): prev = l xmlfile.close() xmlfile2.close() - #subst xml with xml2 - os.remove('%s/%s' % (config['path'], xmlfilename)) #remove previous xml dump - os.rename('%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename)) #move correctly truncated dump to its real name + # subst xml with xml2 + # remove previous xml dump + os.remove('%s/%s' % (config['path'], xmlfilename)) + # move correctly truncated dump to its real name + os.rename( + '%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename)) else: - #requested complete xml dump + # requested complete xml dump lock = False xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w') xmlfile.write(header.encode('utf-8')) xmlfile.close() - + xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a') c = 1 for title in titles: if not title.strip(): continue - if title == start: #start downloading from start, included + if title == start: # start downloading from start, included lock = False if lock: continue @@ -496,38 +562,46 @@ def generateXMLDump(config={}, titles=[], start='', session=None): xml = getXMLPage(config=config, title=title, session=session) xml = cleanXML(xml=xml) if not xml: - logerror(config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title)) - #here, XML is a correct chunk or - #an empty string due to a deleted page (logged in errors log) or - #an empty string due to an error while retrieving the page from server (logged in errors log) + logerror( + config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title)) + # here, XML is a correct chunk or + # an empty string due to a deleted page (logged in errors log) or + # an empty string due to an error while retrieving the page from server + # (logged in errors log) xmlfile.write(xml.encode('utf-8')) c += 1 xmlfile.write(footer) xmlfile.close() print 'XML dump saved at...', xmlfilename + def saveTitles(config={}, titles=[]): """ Save title list in a file """ - titlesfilename = '%s-%s-titles.txt' % (domain2prefix(config=config), config['date']) + titlesfilename = '%s-%s-titles.txt' % ( + domain2prefix(config=config), config['date']) titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w') output = u"%s\n--END--" % ('\n'.join(titles)) titlesfile.write(output.encode('utf-8')) titlesfile.close() - + print 'Titles saved at...', titlesfilename + def saveImageFilenamesURL(config={}, images=[], session=None): """ Save image list in a file, including filename, url and uploader """ - imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date']) + imagesfilename = '%s-%s-images.txt' % ( + domain2prefix(config=config), config['date']) imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w') - imagesfile.write(('\n'.join(['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]).encode('utf-8'))) + imagesfile.write(('\n'.join(['%s\t%s\t%s' % ( + filename, url, uploader) for filename, url, uploader in images]).encode('utf-8'))) imagesfile.write('\n--END--') imagesfile.close() - + print 'Image filenames and URLs saved at...', imagesfilename + def curateImageURL(config={}, url=''): """ Returns an absolute URL for an image, adding the domain if missing """ @@ -552,43 +626,54 @@ def curateImageURL(config={}, url=''): return url + 
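# Illustrative sketch: curateImageURL's body falls outside the hunks shown
# here, but its docstring says it returns an absolute URL, adding the wiki's
# domain when the image listing only gives a relative path. A minimal version
# of that idea, under the assumption that config['index'] holds the wiki's
# index.php URL (as it does elsewhere in this script), could look like the
# following; curateImageURL_demo and the example URL are hypothetical.
import urlparse


def curateImageURL_demo(config={}, url=''):
    """ Resolve a possibly relative image URL against the wiki's index URL """
    base = config['index']  # e.g. 'http://wiki.example.org/index.php'
    # urljoin leaves already-absolute URLs untouched and resolves relative ones
    return urlparse.urljoin(base, url)

# curateImageURL_demo({'index': 'http://wiki.example.org/index.php'},
#                     '/images/a/ab/Foo.png')
# returns 'http://wiki.example.org/images/a/ab/Foo.png'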
def getImageFilenamesURL(config={}, session=None): """ Retrieve file list: filename, url, uploader """ - + print 'Retrieving image filenames' - r_next = r'(?\d+)&' # (?\d+)&' images = [] - offset = '29990101000000' #january 1, 2999 + offset = '29990101000000' # january 1, 2999 limit = 5000 retries = 5 while offset: - #5000 overload some servers, but it is needed for sites like this with no next links http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - r = session.post(url=config['index'], data={'title': 'Special:Imagelist', 'limit': limit, 'offset': offset}) + # 5000 overload some servers, but it is needed for sites like this with + # no next links + # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= + r = session.post(url=config['index'], data={ + 'title': 'Special:Imagelist', 'limit': limit, 'offset': offset}) raw = r.text delay(config=config, session=session) - if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki + # delicate wiki + if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): if limit > 10: print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit) - limit = limit/10 + limit = limit / 10 continue - elif retries > 0: # waste retries, then exit + elif retries > 0: # waste retries, then exit retries -= 1 print 'Retrying...' continue else: print 'No more retries, exit...' break - + raw = cleanHTML(raw) - #archiveteam 1.15.1 Yahoovideo.jpg (file) - #wikanda 1.15.5 Fernandocg + # archiveteam 1.15.1 Yahoovideo.jpg (file) + # wikanda 1.15.5 Fernandocg r_images1 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+\s*]+>(?P[^<]+)' - #wikijuegos 1.9.5 http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old mediawiki version + # wikijuegos 1.9.5 + # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old + # mediawiki version r_images2 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+\s*[^<]+\s*[^<]+\s*]+>(?P[^<]+)' - #gentoowiki 1.18 18:15, 3 April 2011Asus eeepc-1201nl.png (file)37 KBYannails 1 + # gentoowiki 1.18 r_images3 = r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>]+>[^<]+]+>(?P[^<]+)' - #http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= - #(desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
+ # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= + # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
r_images4 = r'(?im)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)' r_images5 = (r'(?im)\s*]*?>(?P[^>]+)\s*\([^<]*?\s*\)\s*\s*' '[^\n\r]*?\s*' @@ -618,33 +703,35 @@ def getImageFilenamesURL(config={}, session=None): uploader = undoHTMLEntities(text=uploader) uploader = urllib.unquote(uploader) images.append([filename, url, uploader]) - #print filename, url - + # print filename, url + if re.search(r_next, raw): offset = re.findall(r_next, raw)[0] - retries += 5 # add more retries if we got a page with offset + retries += 5 # add more retries if we got a page with offset else: offset = '' - + if (len(images) == 1): print ' Found 1 image' else: print ' Found %d images' % (len(images)) - + images.sort() return images + def getImageFilenamesURLAPI(config={}, session=None): """ Retrieve file list: filename, url, uploader """ - + print 'Retrieving image filenames' oldAPI = False aifrom = '!' images = [] while aifrom: - sys.stderr.write('.') #progress - params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500} - #FIXME Handle HTTP Errors HERE + sys.stderr.write('.') # progress + params = {'action': 'query', 'list': 'allimages', 'aiprop': + 'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500} + # FIXME Handle HTTP Errors HERE r = session.post(url=config['api'], data=params) handleStatusCode(r) jsonimages = json.loads(r.text) @@ -708,40 +795,45 @@ def getImageFilenamesURLAPI(config={}, session=None): images.sort() return images + def undoHTMLEntities(text=''): """ Undo some HTML codes """ - - text = re.sub('<', '<', text) # i guess only < > & " ' need conversion http://www.w3schools.com/html/html_entities.asp + + # i guess only < > & " ' need conversion + # http://www.w3schools.com/html/html_entities.asp + text = re.sub('<', '<', text) text = re.sub('>', '>', text) text = re.sub('&', '&', text) text = re.sub('"', '"', text) text = re.sub(''', '\'', text) - + return text + def generateImageDump(config={}, other={}, images=[], start='', session=None): """ Save files and descriptions using a file list """ - - #fix use subdirectories md5 + + # fix use subdirectories md5 print 'Retrieving images from "%s"' % (start and start or 'start') imagepath = '%s/images' % (config['path']) if not os.path.isdir(imagepath): print 'Creating "%s" directory' % (imagepath) os.makedirs(imagepath) - + c = 0 lock = True if not start: lock = False for filename, url, uploader in images: - if filename == start: #start downloading from start (included) + if filename == start: # start downloading from start (included) lock = False if lock: continue delay(config=config, session=session) - - #saving file - #truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash limit). Later .desc is added to filename, so better 100 as max) + + # saving file + # truncate filename if length > 100 (100 + 32 (md5) = 132 < 143 (crash + # limit). Later .desc is added to filename, so better 100 as max) filename2 = urllib.unquote(filename) if len(filename2) > other['filenamelimit']: # split last . 
(extension) and then merge
@@ -752,11 +844,13 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
         r = requests.get(url=url)
         imagefile.write(r.content)
         imagefile.close()
-        #saving description if any
-        xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (filename), session=session) # use Image: for backwards compatibility
+        # saving description if any
+        xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (
+            filename), session=session)  # use Image: for backwards compatibility
         f = open('%s/%s.desc' % (imagepath, filename2), 'w')
-        if not re.search(r'', xmlfiledesc): #Banner featuring SG1, SGA, SGU teams
-            #failure when retrieving desc? then save it as empty .desc
+        # Banner featuring SG1, SGA, SGU teams
+        if not re.search(r'', xmlfiledesc):
+            # failure when retrieving desc? then save it as empty .desc
             xmlfiledesc = ''
         f.write(xmlfiledesc.encode('utf-8'))
         f.close()
@@ -764,12 +858,13 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
         c += 1
         if c % 10 == 0:
             print ' Downloaded %d images' % (c)
-
+
     print 'Downloaded %d images' % (c)
-
+
+
 def saveLogs(config={}, session=None):
     """ Save Special:Log """
-    #get all logs from Special:Log
+    # get all logs from Special:Log
     """parse
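# Usage sketch for the filename handling referenced in generateImageDump
# above: when a decoded image name exceeds other['filenamelimit'] (the hunk's
# comment assumes 100), truncateFilename() keeps the first 'filenamelimit'
# characters, appends the 32-character md5 of the full name to keep it unique,
# and re-attaches the extension. demo_truncate mirrors truncateFilename from
# the top of this file and is only illustrative; hashlib's md5 stands in for
# whatever md5 import the full script uses outside this excerpt.
from hashlib import md5


def demo_truncate(filename, limit=100):
    # same expression as truncateFilename(other={'filenamelimit': limit}, ...)
    return filename[:limit] + md5(filename).hexdigest() + '.' + filename.split('.')[-1]

long_name = 'A' * 150 + '.png'
short_name = demo_truncate(long_name)
print len(short_name)  # 100 + 32 + 1 + 3 = 136, under the ~143-character crash limit noted above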