Merge pull request #194 from mrshu/mrshu/dumpgenerator-pep8fied

dumpgenerator: AutoPEP8-fied
nemobis 10 years ago
commit b3ef165529
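Nearly every hunk below is mechanical re-wrapping of long lines; only a handful change behaviour (the namespace loop, the sorted() de-duplication, the except-as syntax, the has_key() removals and the checkAPI() refactor), and those are noted next to the relevant hunks. The exact autopep8 invocation is not recorded in the commit message; a run roughly like the following sketch (the options are an assumption, aggressive mode is what enables the non-whitespace fixes such as has_key() -> in) produces this style of output:

    # Hedged sketch, not taken from the commit: re-wrap dumpgenerator.py with
    # autopep8's Python API. The aggressive level is a guess.
    import autopep8

    with open('dumpgenerator.py') as f:
        source = f.read()

    fixed = autopep8.fix_code(source, options={'aggressive': 1})

    with open('dumpgenerator.py', 'w') as f:
        f.write(fixed)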

@@ -52,7 +52,8 @@ def getVersion():
 def truncateFilename(other={}, filename=''):
     """ Truncate filenames when downloading images with large filenames """
-    return filename[:other['filenamelimit']] + md5(filename).hexdigest() + '.' + filename.split('.')[-1]
+    return filename[:other['filenamelimit']] + \
+        md5(filename).hexdigest() + '.' + filename.split('.')[-1]


 def delay(config={}, session=None):
@@ -79,9 +80,11 @@ def cleanHTML(raw=''):
     elif re.search('<!-- content -->', raw):
         raw = raw.split('<!-- content -->')[1].split('<!-- mw_content -->')[0]
     elif re.search('<article id="WikiaMainContent" class="WikiaMainContent">', raw):
-        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
+        raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[
+            1].split('</article>')[0]
     elif re.search('<body class=', raw):
-        raw = raw.split('<body class=')[1].split('<div class="printfooter">')[0]
+        raw = raw.split('<body class=')[1].split(
+            '<div class="printfooter">')[0]
     else:
         print raw[:250]
         print 'This wiki doesn\'t use marks to split content'
@@ -164,8 +167,14 @@ def getNamespacesAPI(config={}, session=None):
     namespaces = config['namespaces']
     namespacenames = {0: ''}  # main is 0, no prefix
     if namespaces:
-        r = session.post(url=config['api'], data={
-            'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'json'})
+        r = session.post(
+            url=config['api'],
+            data={
+                'action': 'query',
+                'meta': 'siteinfo',
+                'siprop': 'namespaces',
+                'format': 'json'}
+        )
         result = json.loads(r.text)
         delay(config=config, session=session)
@@ -180,11 +189,13 @@ def getNamespacesAPI(config={}, session=None):
         # check if those namespaces really exist in this wiki
         namespaces2 = []
         for i in result['query']['namespaces'].keys():
-            if int(i) < 0:  # -1: Special, -2: Media, excluding
+            bi = i
+            i = int(i)
+            if i < 0:  # -1: Special, -2: Media, excluding
                 continue
-            if int(i) in namespaces:
-                namespaces2.append(int(i))
-                namespacenames[int(i)] = result['query']['namespaces'][i]['*']
+            if i in namespaces:
+                namespaces2.append(i)
+                namespacenames[i] = result['query']['namespaces'][bi]['*']
         namespaces = namespaces2
     else:
         namespaces = [0]
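This hunk is one of the few with a real change rather than pure re-wrapping: the loop now converts the namespace id to int once and keeps the original string key in bi, because the siteinfo JSON keys namespaces by string id while the rest of the script compares integers. A self-contained illustration of the same lookup pattern (the data is made up, not from a real wiki):

    # The API returns namespace ids as *string* dict keys; keep the string for
    # the JSON lookup and the int for comparisons and for namespacenames keys.
    result = {'query': {'namespaces': {'-1': {'*': 'Special'},
                                       '0': {'*': ''},
                                       '4': {'*': 'Project'}}}}
    wanted = [0, 4]
    namespaces2 = []
    namespacenames = {}
    for key in result['query']['namespaces'].keys():
        ns = int(key)
        if ns < 0:  # skip virtual namespaces (-1 Special, -2 Media)
            continue
        if ns in wanted:
            namespaces2.append(ns)
            namespacenames[ns] = result['query']['namespaces'][key]['*']
    print sorted(namespaces2)   # [0, 4]
    print namespacenames[4]     # Project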
@@ -209,16 +220,23 @@ def getPageTitlesAPI(config={}, session=None):
         apfrom = '!'
         while apfrom:
             sys.stderr.write('.')  # progress
-            params = {'action': 'query', 'list': 'allpages', 'apnamespace': namespace,
-                      'apfrom': apfrom.encode('utf-8'), 'format': 'json', 'aplimit': 500}
+            params = {
+                'action': 'query',
+                'list': 'allpages',
+                'apnamespace': namespace,
+                'apfrom': apfrom.encode('utf-8'),
+                'format': 'json',
+                'aplimit': 500}
             r = session.post(url=config['api'], data=params)
             handleStatusCode(r)
             # FIXME Handle HTTP errors here!
             jsontitles = json.loads(r.text)
             apfrom = ''
-            if 'query-continue' in jsontitles and 'allpages' in jsontitles['query-continue']:
+            if 'query-continue' in jsontitles and 'allpages' in jsontitles[
+                    'query-continue']:
                 if 'apcontinue' in jsontitles['query-continue']['allpages']:
-                    apfrom = jsontitles['query-continue']['allpages']['apcontinue']
+                    apfrom = jsontitles[
+                        'query-continue']['allpages']['apcontinue']
                 elif 'apfrom' in jsontitles['query-continue']['allpages']:
                     apfrom = jsontitles['query-continue']['allpages']['apfrom']
             # print apfrom
@@ -299,7 +317,9 @@ def getPageTitlesScraper(config={}, session=None):
                     raw2 = r2.text
                     raw2 = cleanHTML(raw2)
                     rawacum += raw2  # merge it after removed junk
-                    print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
+                    print ' Reading', name, len(raw2), 'bytes', \
+                        len(re.findall(r_suballpages, raw2)), 'subpages', \
+                        len(re.findall(r_title, raw2)), 'pages'
                     delay(config=config, session=session)
             c += 1
@@ -338,8 +358,7 @@ def getPageTitles(config={}, session=None):
     # removing dupes (e.g. in CZ appears Widget:AddThis two times (main
     # namespace and widget namespace))
-    titles = list(set(titles))
-    titles.sort()
+    titles = sorted(set(titles))
     print '%d page titles loaded' % (len(titles))
     return titles
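Another genuine simplification: list(set(titles)) followed by an in-place sort() collapses into a single sorted(set(titles)), which produces the same sorted, de-duplicated list:

    titles = ['Widget:AddThis', 'Main Page', 'Widget:AddThis']
    assert sorted(set(titles)) == ['Main Page', 'Widget:AddThis']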
@@ -380,7 +399,12 @@ def getXMLHeader(config={}, session=None):
 def getXMLFileDesc(config={}, title='', session=None):
     """ Get XML for image description page """
     config['curonly'] = 1  # tricky to get only the most recent desc
-    return getXMLPage(config=config, title=title, verbose=False, session=session)
+    return getXMLPage(
+        config=config,
+        title=title,
+        verbose=False,
+        session=session
+    )


 def getUserAgent():
@@ -433,20 +457,30 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
             if not config['curonly']:
                 print ' Trying to save only the last revision for this page...'
                 params['curonly'] = 1
-                logerror(config=config, text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' % (
-                    params['pages']))
-                return getXMLPageCore(headers=headers, params=params, config=config, session=session)
+                logerror(
+                    config=config,
+                    text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' %
+                    (params['pages'])
+                )
+                return getXMLPageCore(
+                    headers=headers,
+                    params=params,
+                    config=config,
+                    session=session
+                )
             else:
                 print ' Saving in the errors log, and skipping...'
-                logerror(config=config, text='Error while retrieving the last revision of "%s". Skipping.' % (
-                    params['pages']))
+                logerror(
+                    config=config,
+                    text='Error while retrieving the last revision of "%s". Skipping.' %
+                    (params['pages']))
                 return ''  # empty xml
         # FIXME HANDLE HTTP Errors HERE
         try:
             r = session.post(url=config['index'], data=params, headers=headers)
             handleStatusCode(r)
             xml = r.text
-        except requests.exceptions.ConnectionError, e:
+        except requests.exceptions.ConnectionError as e:
            xml = ''
         c += 1
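The last line of this hunk is the one Python 3-relevant fix in it: `except requests.exceptions.ConnectionError, e:` is Python-2-only syntax, while the `as` form works on Python 2.6+ and Python 3 and binds the exception the same way. A minimal, self-contained check (the URL is a deliberate placeholder that should fail to resolve):

    import requests

    try:
        requests.get('http://wiki.invalid/api.php', timeout=5)
    except requests.exceptions.ConnectionError as e:
        print 'connection failed as expected:', e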
@@ -543,7 +577,8 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
     header = getXMLHeader(config=config, session=session)
     footer = '</mediawiki>\n'  # new line at the end
     xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
-                                    config['date'], config['curonly'] and 'current' or 'history')
+                                    config['date'],
+                                    config['curonly'] and 'current' or 'history')
     xmlfile = ''
     lock = True
     if start:
@@ -569,7 +604,10 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
         os.remove('%s/%s' % (config['path'], xmlfilename))
         # move correctly truncated dump to its real name
         os.rename(
-            '%s/%s2' % (config['path'], xmlfilename), '%s/%s' % (config['path'], xmlfilename))
+            '%s/%s2' %
+            (config['path'], xmlfilename), '%s/%s' %
+            (config['path'], xmlfilename)
+        )
     else:
         # requested complete xml dump
         lock = False
@@ -593,7 +631,10 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
         xml = cleanXML(xml=xml)
         if not xml:
             logerror(
-                config=config, text=u'The page "%s" was missing in the wiki (probably deleted)' % (title))
+                config=config,
+                text=u'The page "%s" was missing in the wiki (probably deleted)' %
+                (title)
+            )
         # here, XML is a correct <page> </page> chunk or
         # an empty string due to a deleted page (logged in errors log) or
         # an empty string due to an error while retrieving the page from server
@@ -624,8 +665,18 @@ def saveImageNames(config={}, images=[], session=None):
     imagesfilename = '%s-%s-images.txt' % (
         domain2prefix(config=config), config['date'])
     imagesfile = open('%s/%s' % (config['path'], imagesfilename), 'w')
-    imagesfile.write(('\n'.join(['%s\t%s\t%s' % (
-        filename, url, uploader) for filename, url, uploader in images]).encode('utf-8')))
+    imagesfile.write(
+        ('\n'.join(
+            [
+                '%s\t%s\t%s' %
+                (filename,
+                 url,
+                 uploader) for filename,
+            url,
+            uploader in images]
+         ).encode('utf-8')
+         )
+    )
     imagesfile.write('\n--END--')
     imagesfile.close()
@@ -637,21 +688,26 @@ def curateImageURL(config={}, url=''):
     if 'index' in config and config['index']:
         # remove from :// (http or https) until the first / after domain
-        domainalone = config['index'].split('://')[0] + '://' + config['index'].split('://')[1].split('/')[0]
+        domainalone = config['index'].split(
+            '://')[0] + '://' + config['index'].split('://')[1].split('/')[0]
     elif 'api' in config and config['api']:
-        domainalone = config['api'].split('://')[0] + '://' + config['api'].split('://')[1].split('/')[0]
+        domainalone = config['api'].split(
+            '://')[0] + '://' + config['api'].split('://')[1].split('/')[0]
     else:
         print 'ERROR: no index nor API'
         sys.exit()
     if url.startswith('//'):  # Orain wikifarm returns URLs starting with //
         url = u'%s:%s' % (domainalone.split('://')[0], url)
-    elif url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL?
+    # is it a relative URL?
+    elif url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')):
         if url[0] == '/':  # slash is added later
             url = url[1:]
-        url = u'%s/%s' % (domainalone, url)  # concat http(s) + domain + relative url
+        # concat http(s) + domain + relative url
+        url = u'%s/%s' % (domainalone, url)
     url = undoHTMLEntities(text=url)
-    #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars
+    # url = urllib.unquote(url) #do not use unquote with url, it break some
+    # urls with odd chars
     url = re.sub(' ', '_', url)
     return url
@@ -670,12 +726,18 @@ def getImageNamesScraper(config={}, session=None):
         # 5000 overload some servers, but it is needed for sites like this with
         # no next links
         # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
-        r = session.post(url=config['index'], data={
-            'title': 'Special:Imagelist', 'limit': limit, 'offset': offset})
+        r = session.post(
+            url=config['index'],
+            data={
+                'title': 'Special:Imagelist',
+                'limit': limit,
+                'offset': offset})
         raw = r.text
         delay(config=config, session=session)
         # delicate wiki
-        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw):
+        if re.search(
+                ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)',
+                raw):
             if limit > 10:
                 print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
                 limit = limit / 10
@@ -704,7 +766,8 @@ def getImageNamesScraper(config={}, session=None):
         # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
         # (<a href="/en/Image:109_0923.JPG" title="Image:109 0923.JPG">desc</a>) <a href="/en/upload/c/cd/109_0923.JPG">109 0923.JPG</a> . . 885,713 bytes . . <a href="/en/User:Bfalconer" title="User:Bfalconer">Bfalconer</a> . . 18:44, 17 November 2005<br />
         r_images4 = r'(?im)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
-        r_images5 = (r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
+        r_images5 = (
+            r'(?im)<td class="TablePager_col_img_name">\s*<a href[^>]*?>(?P<filename>[^>]+)</a>\s*\(<a href="(?P<url>[^>]+)">[^<]*?</a>\s*\)\s*</td>\s*'
             '<td class="TablePager_col_thumb">[^\n\r]*?</td>\s*'
             '<td class="TablePager_col_img_size">[^<]*?</td>\s*'
             '<td class="TablePager_col_img_user_text">\s*(<a href="[^>]*?" title="[^>]*?">)?(?P<uploader>[^<]+?)(</a>)?\s*</td>')
@@ -761,8 +824,13 @@ def getImageNamesAPI(config={}, session=None):
     images = []
     while aifrom:
         sys.stderr.write('.')  # progress
-        params = {'action': 'query', 'list': 'allimages', 'aiprop':
-                  'url|user', 'aifrom': aifrom, 'format': 'json', 'ailimit': 500}
+        params = {
+            'action': 'query',
+            'list': 'allimages',
+            'aiprop': 'url|user',
+            'aifrom': aifrom,
+            'format': 'json',
+            'ailimit': 500}
         # FIXME Handle HTTP Errors HERE
         r = session.post(url=config['api'], data=params)
         handleStatusCode(r)
@@ -771,18 +839,23 @@ def getImageNamesAPI(config={}, session=None):
         if 'query' in jsonimages:
             aifrom = ''
-            if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allimages'):
-                if jsonimages['query-continue']['allimages'].has_key('aicontinue'):
-                    aifrom = jsonimages['query-continue']['allimages']['aicontinue']
-                elif jsonimages['query-continue']['allimages'].has_key('aifrom'):
-                    aifrom = jsonimages['query-continue']['allimages']['aifrom']
+            if 'query-continue' in jsonimages and 'allimages' in jsonimages[
+                    'query-continue']:
+                if 'aicontinue' in jsonimages['query-continue']['allimages']:
+                    aifrom = jsonimages[
+                        'query-continue']['allimages']['aicontinue']
+                elif 'aifrom' in jsonimages['query-continue']['allimages']:
+                    aifrom = jsonimages[
+                        'query-continue']['allimages']['aifrom']
             # print aifrom
             for image in jsonimages['query']['allimages']:
                 url = image['url']
                 url = curateImageURL(config=config, url=url)
-                # encoding to ascii is needed to work around this horrible bug: http://bugs.python.org/issue8136
-                filename = unicode(urllib.unquote((re.sub('_', ' ', url.split('/')[-1])).encode('ascii','ignore')), 'utf-8')
+                # encoding to ascii is needed to work around this horrible bug:
+                # http://bugs.python.org/issue8136
+                filename = unicode(urllib.unquote(
+                    (re.sub('_', ' ', url.split('/')[-1])).encode('ascii', 'ignore')), 'utf-8')
                 uploader = re.sub('_', ' ', image['user'])
                 images.append([filename, url, uploader])
         else:
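The dict.has_key() calls here (and in the allpages fallback further down) become the `in` operator; has_key() no longer exists on Python 3 and `key in d` is the idiomatic form on Python 2 as well. The continuation logic itself is unchanged, as this stand-alone rerun of it shows:

    # Fabricated API continuation block, only to show the equivalent test.
    jsonimages = {'query-continue': {'allimages': {'aicontinue': 'Foo.png'}}}

    aifrom = ''
    if 'query-continue' in jsonimages and 'allimages' in jsonimages['query-continue']:
        cont = jsonimages['query-continue']['allimages']
        if 'aicontinue' in cont:
            aifrom = cont['aicontinue']
        elif 'aifrom' in cont:
            aifrom = cont['aifrom']
    print aifrom   # Foo.png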
@@ -796,8 +869,18 @@ def getImageNamesAPI(config={}, session=None):
         sys.stderr.write('.')  # progress
         # Some old APIs doesn't have allimages query
         # In this case use allpages (in nm=6) as generator for imageinfo
-        # Example: http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6 &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
-        params = {'action': 'query', 'generator': 'allpages', 'gapnamespace': 6, 'gaplimit': 500, 'gapfrom': gapfrom, 'prop': 'imageinfo', 'iiprop': 'user|url', 'format': 'json'}
+        # Example:
+        # http://minlingo.wiki-site.com/api.php?action=query&generator=allpages&gapnamespace=6
+        # &gaplimit=500&prop=imageinfo&iiprop=user|url&gapfrom=!
+        params = {
+            'action': 'query',
+            'generator': 'allpages',
+            'gapnamespace': 6,
+            'gaplimit': 500,
+            'gapfrom': gapfrom,
+            'prop': 'imageinfo',
+            'iiprop': 'user|url',
+            'format': 'json'}
         # FIXME Handle HTTP Errors HERE
         r = session.post(url=config['api'], data=params)
         handleStatusCode(r)
@@ -806,16 +889,21 @@ def getImageNamesAPI(config={}, session=None):
         if 'query' in jsonimages:
             gapfrom = ''
-            if jsonimages.has_key('query-continue') and jsonimages['query-continue'].has_key('allpages'):
-                if jsonimages['query-continue']['allpages'].has_key('gapfrom'):
-                    gapfrom = jsonimages['query-continue']['allpages']['gapfrom']
+            if 'query-continue' in jsonimages and 'allpages' in jsonimages[
+                    'query-continue']:
+                if 'gapfrom' in jsonimages['query-continue']['allpages']:
+                    gapfrom = jsonimages[
+                        'query-continue']['allpages']['gapfrom']
             # print gapfrom
             # print jsonimages['query']
             for image, props in jsonimages['query']['pages'].items():
                 url = props['imageinfo'][0]['url']
                 url = curateImageURL(config=config, url=url)
-                filename = re.sub('_', ' ', ':'.join(props['title'].split(':')[1:]))
+                tmp_filename = ':'.join(props['title'].split(':')[1:])
+                filename = re.sub('_', ' ', tmp_filename)
                 uploader = re.sub('_', ' ', props['imageinfo'][0]['user'])
                 images.append([filename, url, uploader])
@@ -876,8 +964,11 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
             imagefile.write(r.content)
             imagefile.close()
             # saving description if any
-            xmlfiledesc = getXMLFileDesc(config=config, title=u'Image:%s' % (
-                filename), session=session)  # use Image: for backwards compatibility
+            xmlfiledesc = getXMLFileDesc(
+                config=config,
+                title=u'Image:%s' %
+                (filename),
+                session=session)  # use Image: for backwards compatibility
             f = open('%s/%s.desc' % (imagepath, filename2), 'w')
             # <text xml:space="preserve" bytes="36">Banner featuring SG1, SGA, SGU teams</text>
             if not re.search(r'</mediawiki>', xmlfiledesc):
@@ -1008,42 +1099,72 @@ def getParameters(params=[]):
     parser.add_argument(
         '--cookies', metavar="cookies.txt", help="path to a cookies.txt file")
     parser.add_argument(
-        '--delay', metavar=5, default=0, type=float, help="adds a delay (in seconds)")
+        '--delay',
+        metavar=5,
+        default=0,
+        type=float,
+        help="adds a delay (in seconds)")
     parser.add_argument(
-        '--retries', metavar=5, default=5, help="Maximum number of retries for ")
+        '--retries',
+        metavar=5,
+        default=5,
+        help="Maximum number of retries for ")
     parser.add_argument('--path', help='path to store wiki dump at')
-    parser.add_argument('--resume', action='store_true',
-                        help='resumes previous incomplete dump (requires --path)')
+    parser.add_argument(
+        '--resume',
+        action='store_true',
+        help='resumes previous incomplete dump (requires --path)')
     parser.add_argument('--force', action='store_true', help='')
     parser.add_argument(
         '--user', help='Username if authentication is required.')
     parser.add_argument(
-        '--pass', dest='password', help='Password if authentication is required.')
+        '--pass',
+        dest='password',
+        help='Password if authentication is required.')
     # URL params
     groupWikiOrAPIOrIndex = parser.add_argument_group()
     groupWikiOrAPIOrIndex.add_argument(
-        'wiki', default='', nargs='?', help="URL to wiki (e.g. http://wiki.domain.org)")
-    groupWikiOrAPIOrIndex.add_argument('--api', help="URL to API (e.g. http://wiki.domain.org/w/api.php)")
-    groupWikiOrAPIOrIndex.add_argument('--index', help="URL to index.php (e.g. http://wiki.domain.org/w/index.php)")
+        'wiki',
+        default='',
+        nargs='?',
+        help="URL to wiki (e.g. http://wiki.domain.org)")
+    groupWikiOrAPIOrIndex.add_argument(
+        '--api',
+        help="URL to API (e.g. http://wiki.domain.org/w/api.php)")
+    groupWikiOrAPIOrIndex.add_argument(
+        '--index',
+        help="URL to index.php (e.g. http://wiki.domain.org/w/index.php)")
     # Download params
-    groupDownload = parser.add_argument_group('Data to download', 'What info download from the wiki')
+    groupDownload = parser.add_argument_group(
+        'Data to download',
+        'What info download from the wiki')
     groupDownload.add_argument(
-        '--xml', action='store_true', help="generates a full history XML dump (--xml --curonly for current revisions only)")
+        '--xml',
+        action='store_true',
+        help="generates a full history XML dump (--xml --curonly for current revisions only)")
     groupDownload.add_argument('--curonly', action='store_true',
                                help='store only the current version of pages')
     groupDownload.add_argument(
         '--images', action='store_true', help="generates an image dump")
-    groupDownload.add_argument('--namespaces', metavar="1,2,3",
-                               help='comma-separated value of namespaces to include (all by default)')
+    groupDownload.add_argument(
+        '--namespaces',
+        metavar="1,2,3",
+        help='comma-separated value of namespaces to include (all by default)')
-    groupDownload.add_argument('--exnamespaces', metavar="1,2,3",
-                               help='comma-separated value of namespaces to exclude')
+    groupDownload.add_argument(
+        '--exnamespaces',
+        metavar="1,2,3",
+        help='comma-separated value of namespaces to exclude')
     # Meta info params
-    groupMeta = parser.add_argument_group('Meta info', 'What meta info to retrieve from the wiki')
+    groupMeta = parser.add_argument_group(
+        'Meta info',
+        'What meta info to retrieve from the wiki')
     groupMeta.add_argument(
-        '--get-wiki-engine', action='store_true', help="returns the wiki engine")
+        '--get-wiki-engine',
+        action='store_true',
+        help="returns the wiki engine")
     args = parser.parse_args()
     # print args
@@ -1121,13 +1242,19 @@ def getParameters(params=[]):
             print 'Error in API, please, provide a correct path to API'
             sys.exit(1)
-        if index and checkIndex(index=index, cookies=args.cookies, session=session):
+        if index and checkIndex(
+                index=index,
+                cookies=args.cookies,
+                session=session):
             print 'index.php is OK'
         else:
             index = index2
             if index and index.startswith('//'):
                 index = args.wiki.split('//')[0] + index
-            if index and checkIndex(index=index, cookies=args.cookies, session=session):
+            if index and checkIndex(
+                    index=index,
+                    cookies=args.cookies,
+                    session=session):
                 print 'index.php is OK'
             else:
                 print 'Error in index.php, please, provide a correct path to index.php'
@@ -1144,7 +1271,9 @@ def getParameters(params=[]):
     # Process namespace inclusions
     if args.namespaces:
         # fix, why - ? and... --namespaces= all with a space works?
-        if re.search(r'[^\d, \-]', args.namespaces) and args.namespaces.lower() != 'all':
+        if re.search(
+                r'[^\d, \-]',
+                args.namespaces) and args.namespaces.lower() != 'all':
             print "Invalid namespace values.\nValid format is integer(s) separated by commas"
             sys.exit()
         else:
@@ -1205,7 +1334,11 @@ def checkAPI(api=None, session=None):
     """ Checking API availability """
     global cj
     r = session.post(
-        url=api, data={'action': 'query', 'meta': 'siteinfo', 'format': 'json'})
+        url=api,
+        data={
+            'action': 'query',
+            'meta': 'siteinfo',
+            'format': 'json'})
     resultText = r.text
     print 'Checking API...', api
     if "MediaWiki API is not enabled for this site." in resultText:
@@ -1213,8 +1346,13 @@ def checkAPI(api=None, session=None):
     try:
         result = json.loads(resultText)
         if 'query' in result:
-            if 'general' in result['query'] and 'script' in result['query']['general'] and 'server' in result['query']['general']:
-                return (True, result['query']['general']['server']+result['query']['general']['script'])
+            query = result['query']
+            general = result['query']['general']
+            if 'general' in query and 'script' in general and 'server' in general:
+                return (
+                    True,
+                    result['query']['general']['server'] +
+                    result['query']['general']['script'])
             else:
                 return (True, None)
     except ValueError:
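Worth flagging in review: unlike the old short-circuiting condition, the new version evaluates result['query']['general'] before checking 'general' in query, so a response without a general block would now raise KeyError instead of falling through to (True, None), and the surrounding except ValueError would not catch it. A defensive variant, purely as a sketch with a hypothetical helper name:

    def api_url_from_siteinfo(result):
        # Same happy path as checkAPI(), but without the unguarded
        # result['query']['general'] lookup.
        general = result.get('query', {}).get('general', {})
        if 'script' in general and 'server' in general:
            return general['server'] + general['script']
        return None

    print api_url_from_siteinfo(
        {'query': {'general': {'server': 'http://wiki.example.org',
                               'script': '/w/index.php'}}})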
@@ -1228,10 +1366,14 @@ def checkIndex(index=None, cookies=None, session=None):
     raw = r.text
     print 'Checking index.php...', index
     # Workaround for issue 71
-    if re.search(r'(Special:Badtitle</a>|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required</h1>)', raw) and not cookies:
+    if re.search(
+            r'(Special:Badtitle</a>|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required</h1>)',
+            raw) and not cookies:
         print "ERROR: This wiki requires login and we are not authenticated"
         return False
-    if re.search(r'(This wiki is powered by|<h2 id="mw-version-license">|meta name="generator" content="MediaWiki)', raw):
+    if re.search(
+            r'(This wiki is powered by|<h2 id="mw-version-license">|meta name="generator" content="MediaWiki)',
+            raw):
         return True
     return False
@@ -1243,7 +1385,9 @@ def removeIP(raw=''):
     # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
     # weird cases as :: are not included
     raw = re.sub(
-        r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}', '0:0:0:0:0:0:0:0', raw)
+        r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}',
+        '0:0:0:0:0:0:0:0',
+        raw)
     return raw
@@ -1258,7 +1402,15 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
     checkpageclose = 0
     checkrevisionopen = 0
     checkrevisionclose = 0
-    for line in file('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=session), config['date'], config['curonly'] and 'current' or 'history'), 'r').read().splitlines():
+    for line in file(
+            '%s/%s-%s-%s.xml' %
+            (config['path'],
+             domain2prefix(
+                 config=config,
+                 session=session),
+             config['date'],
+             config['curonly'] and 'current' or 'history'),
+            'r').read().splitlines():
         if "<revision>" in line:
             checkrevisionopen += 1
         elif "</revision>" in line:
@@ -1292,11 +1444,18 @@ def createNewDump(config={}, other={}):
         titles += getPageTitles(config=config, session=other['session'])
         saveTitles(config=config, titles=titles)
         generateXMLDump(config=config, titles=titles, session=other['session'])
-        checkXMLIntegrity(config=config, titles=titles, session=other['session'])
+        checkXMLIntegrity(
+            config=config,
+            titles=titles,
+            session=other['session'])
     if config['images']:
         images += getImageNames(config=config, session=other['session'])
         saveImageNames(config=config, images=images, session=other['session'])
-        generateImageDump(config=config, other=other, images=images, session=other['session'])
+        generateImageDump(
+            config=config,
+            other=other,
+            images=images,
+            session=other['session'])
     if config['logs']:
         saveLogs(config=config, session=other['session'])
@@ -1332,8 +1491,15 @@ def resumePreviousDump(config={}, other={}):
     xmliscomplete = False
     lastxmltitle = ''
     try:
-        f = open('%s/%s-%s-%s.xml' % (config['path'], domain2prefix(config=config, session=other[
-                 'session']), config['date'], config['curonly'] and 'current' or 'history'), 'r')
+        f = open(
+            '%s/%s-%s-%s.xml' %
+            (config['path'],
+             domain2prefix(
+                 config=config,
+                 session=other['session']),
+             config['date'],
+             config['curonly'] and 'current' or 'history'),
+            'r')
         for l in f:
             if re.findall('</mediawiki>', l):
                 # xml dump is complete
@@ -1355,7 +1521,10 @@ def resumePreviousDump(config={}, other={}):
             # resuming...
             print 'Resuming XML dump from "%s"' % (lastxmltitle)
             generateXMLDump(
-                config=config, titles=titles, start=lastxmltitle, session=other['session'])
+                config=config,
+                titles=titles,
+                start=lastxmltitle,
+                session=other['session'])
         else:
             # corrupt? only has XML header?
             print 'XML is corrupt? Regenerating...'
@@ -1366,8 +1535,13 @@ def resumePreviousDump(config={}, other={}):
         # load images
         lastimage = ''
         try:
-            f = open('%s/%s-%s-images.txt' %
-                     (config['path'], domain2prefix(config=config), config['date']), 'r')
+            f = open(
+                '%s/%s-%s-images.txt' %
+                (config['path'],
+                 domain2prefix(
+                     config=config),
+                 config['date']),
+                'r')
             raw = unicode(f.read(), 'utf-8').strip()
             lines = raw.split('\n')
             for l in lines:
@@ -1415,7 +1589,11 @@ def resumePreviousDump(config={}, other={}):
         # we resume from previous image, which may be corrupted (or missing
         # .desc) by the previous session ctrl-c or abort
         generateImageDump(
-            config=config, other=other, images=images, start=lastfilename2, session=other['session'])
+            config=config,
+            other=other,
+            images=images,
+            start=lastfilename2,
+            session=other['session'])
     if config['logs']:
         # fix
print 'Downloading site info as siteinfo.json' print 'Downloading site info as siteinfo.json'
# MediaWiki 1.13+ # MediaWiki 1.13+
r = session.post(url=config['api'], data={ r = session.post(
url=config['api'],
data={
'action': 'query', 'action': 'query',
'meta': 'siteinfo', 'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo', 'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo',
@ -1471,15 +1651,22 @@ def saveSiteInfo(config={}, session=None):
'format': 'json'}) 'format': 'json'})
# MediaWiki 1.11-1.12 # MediaWiki 1.11-1.12
if not 'query' in json.loads(r.text): if not 'query' in json.loads(r.text):
r = session.post(url=config['api'], data={ r = session.post(
url=config['api'],
data={
'action': 'query', 'action': 'query',
'meta': 'siteinfo', 'meta': 'siteinfo',
'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap', 'siprop': 'general|namespaces|statistics|dbrepllag|interwikimap',
'format': 'json'}) 'format': 'json'})
# MediaWiki 1.8-1.10 # MediaWiki 1.8-1.10
if not 'query' in json.loads(r.text): if not 'query' in json.loads(r.text):
r = session.post(url=config['api'], data={ r = session.post(
'action': 'query', 'meta': 'siteinfo', 'siprop': 'general|namespaces', 'format': 'json'}) url=config['api'],
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces',
'format': 'json'})
result = json.loads(r.text) result = json.loads(r.text)
delay(config=config, session=session) delay(config=config, session=session)
with open('%s/siteinfo.json' % (config['path']), 'w') as outfile: with open('%s/siteinfo.json' % (config['path']), 'w') as outfile:
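The three siprop variants above simply track what progressively older MediaWiki releases understand (1.13+, 1.11-1.12, 1.8-1.10); each is an ordinary api.php siteinfo query. For reference, a stand-alone request against a placeholder wiki (the URL is illustrative, not from the script):

    import requests

    r = requests.post('http://wiki.example.org/w/api.php', data={
        'action': 'query',
        'meta': 'siteinfo',
        'siprop': 'general|namespaces',
        'format': 'json'})
    print r.json().get('query', {}).get('general', {}).get('sitename')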
@@ -1490,7 +1677,10 @@ def avoidWikimediaProjects(config={}, other={}):
     """ Skip Wikimedia projects and redirect to the dumps website """
     # notice about wikipedia dumps
-    if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api'] + config['index']):
+    if re.findall(
+            r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org',
+            config['api'] +
+            config['index']):
         print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
         print 'Download the dumps from http://dumps.wikimedia.org'
         if not other['force']:
@@ -1509,7 +1699,9 @@ def getWikiEngine(url=''):
     result = r.text
     wikiengine = 'Unknown'
-    if re.search(ur'(?im)(<meta name="generator" content="DokuWiki)|dokuwiki__site', result):
+    if re.search(
+            ur'(?im)(<meta name="generator" content="DokuWiki)|dokuwiki__site',
+            result):
         wikiengine = 'DokuWiki'
     elif re.search(ur'(?im)(alt="Powered by MediaWiki"|<meta name="generator" content="MediaWiki)', result):
         wikiengine = 'MediaWiki'
@@ -1561,7 +1753,8 @@ def getWikiEngine(url=''):
     elif re.search(ur'(?im)(Powered by <a href="http://wackowiki\.com/|title="WackoWiki")', result):
         wikiengine = 'WackoWiki'
     elif re.search(ur'(?im)(Powered by <a href="http://www\.wakkawiki\.com)', result):
-        # This may not work for heavily modded/themed installations, e.g. http://operawiki.info/
+        # This may not work for heavily modded/themed installations, e.g.
+        # http://operawiki.info/
         wikiengine = 'WakkaWiki'
     # Custom wikis used by wiki farms
     elif re.search(ur'(?im)(var wikispaces_page|<div class="WikispacesContent)', result):
@@ -1589,7 +1782,9 @@ def mwGetAPIAndIndex(url=''):
     result = r.text
     # API
-    m = re.findall(ur'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>', result)
+    m = re.findall(
+        ur'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
+        result)
     if m:
         api = m[0]
         if api.startswith('//'):  # gentoo wiki
@@ -1598,11 +1793,15 @@ def mwGetAPIAndIndex(url=''):
         pass  # build API using index and check it
     # Index.php
-    m = re.findall(ur'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', result)
+    m = re.findall(
+        ur'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?',
+        result)
     if m:
         index = m[0]
     else:
-        m = re.findall(ur'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?', result)
+        m = re.findall(
+            ur'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?',
+            result)
         if m:
             index = m[0]
     if index:
@@ -1610,7 +1809,13 @@ def mwGetAPIAndIndex(url=''):
         index = '/'.join(api.split('/')[:-1]) + '/' + index.split('/')[-1]
     else:
         if api:
-            if len(re.findall(ur'/index\.php5\?', result)) > len(re.findall(ur'/index\.php\?', result)):
+            if len(
+                    re.findall(
+                        ur'/index\.php5\?',
+                        result)) > len(
+                    re.findall(
+                        ur'/index\.php\?',
+                        result)):
                 index = '/'.join(api.split('/')[:-1]) + '/index.php5'
             else:
                 index = '/'.join(api.split('/')[:-1]) + '/index.php'
@@ -1637,8 +1842,11 @@ def main(params=[]):
         print '\nWarning!: "%s" path exists' % (config['path'])
         reply = ''
         while reply.lower() not in ['yes', 'y', 'no', 'n']:
-            reply = raw_input('There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' % (
-                config['path'], config['path'], configfilename))
+            reply = raw_input(
+                'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %
+                (config['path'],
+                 config['path'],
+                 configfilename))
         if reply.lower() in ['yes', 'y']:
            if not os.path.isfile('%s/%s' % (config['path'], configfilename)):
                print 'No config file found. I can\'t resume. Aborting.'
