Merge pull request #378 from nemobis/wikia

More efficient Wikia download and launcher.py
nemobis 4 years ago committed by GitHub
commit 6e85afca82

dumpgenerator.py
@@ -751,7 +751,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in readTitles(config, start):
if not title.strip():
if not title:
continue
if title == start: # start downloading from start, included
lock = False
@@ -767,8 +767,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
(title.decode('utf-8'))
text=u'The page "%s" was missing in the wiki (probably deleted)' % title
)
# here, XML is a correct <page> </page> chunk or
# an empty string due to a deleted page (logged in errors log) or
@@ -906,7 +905,8 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# End of continuation. We are done with this namespace.
break
except KeyError:
except (KeyError, mwclient.errors.InvalidResponse) as e:
print(e)
# TODO: check whether the KeyError was really for a missing arv API
print "Warning. Could not use allrevisions. Wiki too old?"
if config['curonly']:
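For reference, a minimal sketch of the fallback probe shown in this hunk: try the allrevisions API through mwclient and fall back when the wiki is too old (KeyError on the missing list) or returns a non-JSON page (mwclient.errors.InvalidResponse). The host and path below are placeholders, not taken from the PR.

    # Sketch only; host and path are placeholders.
    import mwclient

    site = mwclient.Site('wiki.example.org', path='/w/')
    try:
        arvrequest = site.api(action='query', list='allrevisions',
                              arvlimit=50, arvprop='ids|timestamp')
        revisions = arvrequest['query']['allrevisions']
    except (KeyError, mwclient.errors.InvalidResponse) as e:
        print(e)
        # Old MediaWiki without the allrevisions module, or a broken response.
        print("Warning. Could not use allrevisions. Wiki too old?")
        revisions = None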
@@ -916,7 +916,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
c = 0
for title in readTitles(config, start=start):
# TODO: respect verbose flag, reuse output from getXMLPage
print(' {}'.format(title.strip()))
print(u' {}'.format(title))
# TODO: as we're doing one page and revision at a time, we might
# as well use xml format and exportnowrap=1 to use the string of,
# XML as is, but need to check how well the library handles it.
@@ -948,18 +948,23 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# refuses to return an arbitrary number of revisions (see above).
print("Getting titles to export all the revisions of each")
c = 0
for title in readTitles(config, start=start):
print(' {}'.format(title.strip()))
titlelist = []
# TODO: Decide a suitable number of titles per batched request. Careful:
# batched responses may not return all revisions.
for titlelist in readTitles(config, start=start, batch=False):
if type(titlelist) is not list:
titlelist = [titlelist]
for title in titlelist:
print(u' {}'.format(title))
# Try and ask for everything. At least on MediaWiki 1.16, unknown props are discarded:
# "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
pparams = {
'action': 'query',
'titles': title,
'titles': '|'.join(titlelist),
'prop': 'revisions',
'rvlimit': 50,
#'rvlimit': 50,
'rvprop': 'ids|timestamp|user|userid|size|sha1|contentmodel|comment|content',
}
# TODO: we could actually batch titles a bit here if desired. How many?
try:
prequest = site.api(http_method=config['http_method'], **pparams)
except requests.exceptions.HTTPError as e:
@@ -967,6 +972,12 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
print("POST request to the API failed, retrying with GET")
config['http_method'] = "GET"
exportrequest = site.api(http_method=config['http_method'], **exportparams)
except mwclient.errors.InvalidResponse:
logerror(
config=config,
text=u'Error: page inaccessible? Could not export page: %s' % ("; ".join(titlelist))
)
continue
# Be ready to iterate if there is continuation.
while True:
@@ -978,7 +989,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
except KeyError:
logerror(
config=config,
text=u'Error: page inaccessible? Could not export page: %s' % (title.decode('utf-8'))
text=u'Error: page inaccessible? Could not export page: %s' % ("; ".join(titlelist))
)
break
# Go through the data we got to build the XML.
@@ -989,14 +1000,15 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
except PageMissingError:
logerror(
config=config,
text=u'Error: empty revision from API. Could not export page: %s' % (title.decode('utf-8'))
text=u'Error: empty revision from API. Could not export page: %s' % ("; ".join(titlelist))
)
continue
# Get next batch of revisions if there's more.
if 'continue' in prequest.keys():
print("Getting more revisions for page {}".format(title))
pparams['rvcontinue'] = prequest['continue']['rvcontinue']
print("Getting more revisions for the page")
for key, value in prequest['continue'].items():
pparams[key] = value
elif 'query-continue' in prequest.keys():
rvstartid = prequest['query-continue']['revisions']['rvstartid']
pparams['rvstartid'] = rvstartid
@@ -1011,8 +1023,10 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
config['http_method'] = "GET"
prequest = site.api(http_method=config['http_method'], **pparams)
# We're done iterating for this title.
c += 1
# We're done iterating for this title or titles.
c += len(titlelist)
# Reset for the next batch.
titlelist = []
if c % 10 == 0:
print('Downloaded {} pages'.format(c))
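The batched request and the continuation handling above are spread across several hunks; below is a condensed sketch of the same pattern under assumed placeholder values (host, path and the two titles are invented, and the rvprop list is trimmed). It is not the PR's exact code: a batch of titles is joined with '|' into one prop=revisions query, and the API's 'continue' (or legacy 'query-continue') tokens are followed until exhausted.

    # Sketch only: batched prop=revisions query with API continuation.
    import mwclient

    site = mwclient.Site('wiki.example.org', path='/w/')   # placeholder host
    titlelist = ['Main Page', 'Help:Contents']              # placeholder batch
    pparams = {
        'action': 'query',
        'titles': '|'.join(titlelist),   # one request covers the whole batch
        'prop': 'revisions',
        'rvprop': 'ids|timestamp|user|comment|content',
    }
    prequest = site.api(http_method='GET', **pparams)
    while True:
        # ... build <page>/<revision> XML from prequest['query']['pages'] here ...
        if 'continue' in prequest:
            # Modern MediaWiki: copy every continuation token into the next request.
            for key, value in prequest['continue'].items():
                pparams[key] = value
        elif 'query-continue' in prequest:
            # Pre-1.26 continuation style.
            pparams['rvstartid'] = prequest['query-continue']['revisions']['rvstartid']
        else:
            break
        prequest = site.api(http_method='GET', **pparams)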
@@ -1042,7 +1056,6 @@ def makeXmlFromPage(page):
size = 0
revision = E.revision(
E.id(to_unicode(rev['revid'])),
E.parentid(to_unicode(rev['parentid'])),
E.timestamp(rev['timestamp']),
E.text(to_unicode(rev['*']), space="preserve", bytes=to_unicode(size)),
)
@@ -1058,6 +1071,9 @@ def makeXmlFromPage(page):
revision.append(E.comment(to_unicode(rev['comment'])))
if 'contentmodel' in rev:
revision.append(E.model(rev['contentmodel']))
# Sometimes a missing parentid is not replaced with a 0 as it should be.
if 'parentid' in rev:
revision.append(E.parentid(to_unicode(rev['parentid'])))
# The sha1 may not have been backfilled on older wikis, or may be missing for other reasons (Wikia).
if 'sha1' in rev:
revision.append(E.sha1(rev['sha1']))
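The parentid move above follows the pattern already used for sha1 and comment: append optional children only when the API actually returned them. A small self-contained lxml sketch of that pattern, with an invented revision dict:

    # Sketch: optional <revision> children are appended only when present.
    from lxml import etree
    from lxml.builder import E

    rev = {'revid': 42, 'timestamp': '2020-01-01T00:00:00Z', '*': 'wikitext here'}
    revision = E.revision(
        E.id(str(rev['revid'])),
        E.timestamp(rev['timestamp']),
        E.text(rev['*'], space="preserve"),
    )
    if 'parentid' in rev:    # first revisions and some APIs have no parentid
        revision.append(E.parentid(str(rev['parentid'])))
    if 'sha1' in rev:        # may not be backfilled on old wikis
        revision.append(E.sha1(rev['sha1']))
    print(etree.tostring(revision, pretty_print=True, encoding='unicode'))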
@@ -1067,28 +1083,37 @@ def makeXmlFromPage(page):
raise PageMissingError(page['title'], e)
return etree.tostring(p, pretty_print=True, encoding='unicode')
def readTitles(config={}, start=None):
def readTitles(config={}, start=None, batch=False):
""" Read title list from a file, from the title "start" """
titlesfilename = '%s-%s-titles.txt' % (
domain2prefix(config=config), config['date'])
titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')
titlelist = []
seeking = False
if start:
seeking = True
with titlesfile as f:
for line in f:
if line.strip() == '--END--':
title = line.decode("utf-8").strip()
if title == '--END--':
break
elif seeking and line.strip() != start:
elif seeking and title != start:
continue
elif seeking and line.strip() == start:
elif seeking and title == start:
seeking = False
yield line.strip()
if not batch:
yield title
else:
yield line.strip()
titlelist.append(title)
if len(titlelist) < batch:
continue
else:
yield titlelist
titlelist = []
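readTitles can now also yield batches of titles. A simplified, self-contained sketch of the same generator shape, reading from an in-memory list instead of the titles file (names and the batch size are illustrative, not the PR's code):

    # Sketch of a generator yielding single titles or batches of them.
    def read_titles(lines, start=None, batch=False):
        titlelist = []
        seeking = start is not None
        for line in lines:
            title = line.strip()
            if title == '--END--':
                break
            if seeking and title != start:
                continue
            seeking = False   # start found; it is included in the output
            if not batch:
                yield title
            else:
                titlelist.append(title)
                if len(titlelist) >= batch:   # batch is an int when batching
                    yield titlelist
                    titlelist = []
        if batch and titlelist:
            yield titlelist   # flush the last, possibly short, batch

    for group in read_titles(['A', 'B', 'C', 'D', '--END--'], batch=3):
        print(group)   # ['A', 'B', 'C'] then ['D']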
def reverse_readline(filename, buf_size=8192, truncate=False):
"""a generator that returns the lines of a file in reverse order"""
@@ -1953,7 +1978,7 @@ def checkIndex(index=None, cookies=None, session=None):
""" Checking index.php availability """
r = session.post(url=index, data={'title': 'Special:Version'}, timeout=30)
if r.status_code >= 400:
print("ERROR: The wiki returned status code HTTP {}".format({}))
print("ERROR: The wiki returned status code HTTP {}".format(r.status_code))
return False
raw = r.text
print 'Checking index.php...', index
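The one-line fix above passes the actual status code into format() instead of an empty dict. A tiny requests sketch of the corrected check, with a placeholder URL:

    # Sketch only; the URL is a placeholder.
    import requests

    r = requests.post('https://wiki.example.org/index.php',
                      data={'title': 'Special:Version'}, timeout=30)
    if r.status_code >= 400:
        print("ERROR: The wiki returned status code HTTP {}".format(r.status_code))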
@@ -2164,7 +2189,7 @@ def resumePreviousDump(config={}, other={}):
lastimage = lines[-1]
f.close()
except:
pass # probably file doesnot exists
pass # probably the file does not exist
if lastimage == u'--END--':
print 'Image list was completed in the previous session'
else:

launcher.py
@@ -46,7 +46,7 @@ def main():
#check if compressed, in that case dump was finished previously
compressed = False
for f in os.listdir('.'):
if f.startswith(prefix) and f.endswith('.7z'):
if f.endswith('.7z') and f.split("-")[0] == prefix:
compressed = True
zipfilename = f
break #stop searching, do not explore subdirectories
@@ -71,7 +71,7 @@ def main():
wikidir = ''
for f in os.listdir('.'):
# Does not find numbered wikidumps nor verify directories
if f.startswith(prefix) and f.endswith('wikidump'):
if f.endswith('wikidump') and f.split("-")[0] == prefix:
wikidir = f
started = True
break #stop searching, do not explore subdirectories
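The prefix test in launcher.py changes from startswith() to comparing the first dash-separated token, because startswith() would also match a different wiki whose prefix merely begins with the same characters. A quick illustration with invented filenames:

    # Sketch: exact-token matching vs. startswith() for dump prefixes.
    prefix = 'examplewiki'
    files = ['examplewiki-20200101-wikidump.7z',
             'examplewikifr-20200101-wikidump.7z']   # a different wiki

    print([f for f in files if f.startswith(prefix)])        # matches both
    print([f for f in files if f.split("-")[0] == prefix])   # matches only the first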
@@ -82,14 +82,14 @@ def main():
# typically they don't provide any crawl-delay value in their robots.txt).
if started and wikidir: #then resume
print 'Resuming download, using directory', wikidir
subprocess.call('./dumpgenerator.py --api=%s --xml --images --resume --path=%s' % (wiki, wikidir), shell=True)
subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images', '--resume', '--path={}'.format(wikidir)], shell=False)
else: #download from scratch
subprocess.call('./dumpgenerator.py --api=%s --xml --images --delay=1' % wiki, shell=True)
subprocess.call(['python2', 'dumpgenerator.py', '--api={}'.format(wiki), '--xml', '--images'], shell=False)
started = True
#save wikidir now
for f in os.listdir('.'):
# Does not find numbered wikidumps nor verify directories
if f.startswith(prefix) and f.endswith('wikidump'):
if f.endswith('wikidump') and f.split("-")[0] == prefix:
wikidir = f
break #stop searching, do not explore subdirectories
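launcher.py now invokes dumpgenerator.py with an argument list and shell=False, so the wiki URL reaches the child process as a single argv entry and never goes through shell quoting. A minimal sketch with a placeholder URL and interpreter:

    # Sketch only; interpreter and URL are placeholders.
    import subprocess

    wiki = 'https://example.fandom.com/api.php'
    subprocess.call(['python2', 'dumpgenerator.py',
                     '--api={}'.format(wiki), '--xml', '--images'],
                    shell=False)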
