Merge branch '2015/iterators' of https://github.com/nemobis/wikiteam into nemobis-2015/iterators

pull/228/head
Federico Leva 9 years ago
commit 632b99ea53

@@ -19,6 +19,10 @@
# To learn more, read the documentation:
# https://github.com/WikiTeam/wikiteam/wiki
try:
from kitchen.text.converters import getwriter
except ImportError:
print "Please install the kitchen module."
import cookielib
import cPickle
import datetime
@@ -42,6 +46,8 @@ except ImportError:
sys.exit(1)
import time
import urllib
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
__VERSION__ = '0.3.0-alpha' # major, minor, micro: semver.org
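A note on the new kitchen import above: wrapping sys.stdout means later code can print unicode titles directly (see the title.strip() changes further down) instead of encoding at every call site. A minimal Python 2 sketch of the behaviour, assuming kitchen is installed:

from kitchen.text.converters import getwriter
import sys

# Without the wrapper, printing a unicode object to a piped or redirected
# stdout can raise UnicodeEncodeError; with it, encoding is transparent.
sys.stdout = getwriter('utf8')(sys.stdout)
print u'\u010cesk\xe1 republika'  # safe even when stdout has no encoding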
@@ -52,6 +58,12 @@ class PageMissingError(Exception):
def __str__(self):
return "page '%s' not found" % self.title
class ExportAbortedError(Exception):
def __init__(self, index):
self.index = index
def __str__(self):
return "Export from '%s' did not return anything." % self.index
def getVersion():
return(__VERSION__)
@@ -251,19 +263,19 @@ def getPageTitlesAPI(config={}, session=None):
# Hack for old versions of MediaWiki API where result is dict
if isinstance(allpages, dict):
allpages = allpages.values()
titles += [page['title']
for page in allpages]
for page in allpages:
yield page['title']
c += len(allpages)
if len(titles) != len(set(titles)):
# probably we are in a loop, server returning dupe titles, stop
# it
print 'Probably a loop, finishing'
titles = list(set(titles))
apfrom = ''
c += len(allpages)
delay(config=config, session=session)
print ' %d titles retrieved in the namespace %d' % (c, namespace)
return titles
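The hunk above turns getPageTitlesAPI into a generator: each batch of titles is yielded as it arrives instead of being accumulated (and de-duplicated) in one list. A rough sketch of the tradeoff, where fetch_batches is a hypothetical stand-in for the paginated allpages requests:

def titles_as_list(fetch_batches):
    # old behaviour: the whole title list sits in memory until the end
    titles = []
    for batch in fetch_batches():
        titles += [page['title'] for page in batch]
    return titles

def titles_as_generator(fetch_batches):
    # new behaviour: one title at a time, so the caller can stream
    # titles straight to disk, as getPageTitles now does below
    for batch in fetch_batches():
        for page in batch:
            yield page['title']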
def getPageTitlesScraper(config={}, session=None):
""" """
@@ -362,13 +374,21 @@ def getPageTitles(config={}, session=None):
elif 'index' in config and config['index']:
titles = getPageTitlesScraper(config=config, session=session)
# removing dupes (e.g. in CZ appears Widget:AddThis two times (main
# namespace and widget namespace))
titles = sorted(set(titles))
print '%d page titles loaded' % (len(titles))
return titles
titlesfilename = '%s-%s-titles.txt' % (
domain2prefix(config=config), config['date'])
titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'a')
c = 0
for title in titles:
titlesfile.write(title.encode('utf-8') + "\n")
c += 1
# TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
# main namespace and widget namespace.
# We can use sort -u in UNIX, but is it worth it?
titlesfile.write(u'--END--\n')
titlesfile.close()
print 'Titles saved at...', titlesfilename
print '%d page titles loaded' % (c)
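Note the '--END--' sentinel written above: it is what later lets resumePreviousDump tell a finished title list from a truncated one. A hypothetical helper sketching that check, built on the reverse_readline generator added further down:

def title_list_is_complete(path):
    # Walk the file from the tail; the first non-empty line should be
    # the sentinel if getPageTitles ran to completion.
    for line in reverse_readline(path):
        if line.strip():
            return line.strip() == '--END--'
    return False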
def getImageNames(config={}, session=None):
""" Get list of image names """
@@ -396,13 +416,36 @@ def getXMLHeader(config={}, session=None):
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem if we get the <siteinfo>.
xml = pme.xml
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
except ExportAbortedError:
try:
if config['api']:
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'}
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass
header = xml.split('</mediawiki>')[0]
if not re.match("<mediawiki", xml):
print 'XML export on this wiki is broken, quitting.'
sys.exit()
return header
return header, config
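The Issue 26 fallback above asks the API for the local name of namespace -1 (Special) and rebuilds the export title from it. As a standalone sketch, the lookup is roughly:

import json
import requests

def local_export_title(api):
    # Namespace -1 is 'Special'; its localized name is under the '*' key.
    r = requests.post(url=api, data={
        'action': 'query',
        'meta': 'siteinfo',
        'siprop': 'namespaces',
        'format': 'json',
    })
    return json.loads(r.text)['query']['namespaces']['-1']['*'] + ':Export'

# e.g. local_export_title('http://albens73.fr/wiki/api.php') would return
# u'Sp\xe9cial:Export' on a French wiki.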
def getXMLFileDesc(config={}, title='', session=None):
@@ -454,11 +497,11 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
# If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save nonly the last
# config['curonly'] means that the whole dump is configured to save only the last,
# params['curonly'] should mean that we've already tried this
# fallback, because it's set by the following if and passed to
# getXMLPageCore
if not config['curonly']:
if not config['curonly'] and 'curonly' not in params:
print ' Trying to save only the last revision for this page...'
params['curonly'] = 1
logerror(
@@ -478,6 +521,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
config=config,
text='Error while retrieving the last revision of "%s". Skipping.' %
(params['pages']))
raise ExportAbortedError(config['index'])
return '' # empty xml
# FIXME HANDLE HTTP Errors HERE
try:
@@ -485,6 +529,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
handleStatusCode(r)
xml = r.text
except requests.exceptions.ConnectionError as e:
raise ExportAbortedError(config['index'])
xml = ''
c += 1
@@ -502,7 +547,10 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
title_ = title
title_ = re.sub(' ', '_', title_)
# do not convert & into %26, title_ = re.sub('&', '%26', title_)
params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
try:
params = {'title': config['export'], 'pages': title_, 'action': 'submit'}
except KeyError:
params = {'title': 'Special:Export', 'pages': title_, 'action': 'submit'}
if config['curonly']:
params['curonly'] = 1
params['limit'] = 1
@@ -514,6 +562,8 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
params['templates'] = 1
xml = getXMLPageCore(params=params, config=config, session=session)
if xml == "":
raise ExportAbortedError(config['index'])
if not "</page>" in xml:
raise PageMissingError(params['title'], xml)
else:
@@ -574,9 +624,9 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
if verbose:
if (numberofedits == 1):
print ' %s, 1 edit' % (title.encode('utf-8'))
print ' %s, 1 edit' % (title.strip())
else:
print ' %s, %d edits' % (title.encode('utf-8'), numberofedits)
print ' %s, %d edits' % (title.strip(), numberofedits)
def cleanXML(xml=''):
@@ -589,11 +639,12 @@ def cleanXML(xml=''):
return xml
def generateXMLDump(config={}, titles=[], start='', session=None):
def generateXMLDump(config={}, titles=[], start=None, session=None):
""" Generates a XML dump for a list of titles """
# TODO: titles is now unused.
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header = getXMLHeader(config=config, session=session)
header, config = getXMLHeader(config=config, session=session)
footer = '</mediawiki>\n' # new line at the end
xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
config['date'],
@@ -601,32 +652,21 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile = ''
lock = True
if start:
# remove the last chunk of xml dump (it is probably incomplete)
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w')
prev = ''
print "Removing the last chunk of past XML dump: it is probably incomplete"
xmlfile = reverse_readline('%s/%s' % (config['path'], xmlfilename))
c = 0
for l in xmlfile:
# removing <page>\n until end of file
# lock to avoid write an empty line at the begining of file
if c != 0:
if not re.search(r'<title>%s</title>' % (start), l):
xmlfile2.write(prev)
else:
break
c += 1
prev = l
xmlfile.close()
xmlfile2.close()
# subst xml with xml2
# remove previous xml dump
os.remove('%s/%s' % (config['path'], xmlfilename))
# move correctly truncated dump to its real name
os.rename(
'%s/%s2' %
(config['path'], xmlfilename), '%s/%s' %
(config['path'], xmlfilename)
)
if re.search(r'<title>%s</title>' % (start), l):
# Done searching. We try to truncate the file at this point:
# everything should be removed from the line before <title>,
# that is the last c+1 lines AKA lines from EOF - c to EOF.
# TODO: do something for users without GNU ed; replace os.system.
# Try file.seek and file.truncate in the generator again?
os.system("(echo '$-%d,$d'; echo wq ) | ed %s/%s" \
% (c, config['path'], xmlfilename) )
print "Last %d lines removed." % (c+1)
break
else:
# requested complete xml dump
lock = False
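On the TODO above: a hedged sketch of the same truncation in pure Python with file.seek and file.truncate, so users without GNU ed would be covered. truncate_before_title is hypothetical, not part of this commit, and assumes the resume title occurs only once in the dump:

import re

def truncate_before_title(path, start):
    # Cut the file at the start of the line *before* the matching
    # <title> line, mirroring what the ed one-liner above deletes.
    pattern = '<title>%s</title>' % re.escape(start)
    with open(path, 'r+') as f:
        prev_offset = 0    # byte offset where the previous line starts
        line_offset = 0    # byte offset where the current line starts
        while True:
            line = f.readline()
            if not line:
                return     # title not found; leave the file untouched
            if pattern in line:
                f.seek(prev_offset)
                f.truncate()
                return
            prev_offset = line_offset
            line_offset = f.tell()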
@@ -636,7 +676,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in titles:
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start, included
@@ -665,19 +705,59 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
xmlfile.close()
print 'XML dump saved at...', xmlfilename
def saveTitles(config={}, titles=[]):
""" Save title list in a file """
def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """
titlesfilename = '%s-%s-titles.txt' % (
domain2prefix(config=config), config['date'])
titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
output = u"%s\n--END--" % ('\n'.join(titles))
titlesfile.write(output.encode('utf-8'))
titlesfile.close()
titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')
print 'Titles saved at...', titlesfilename
seeking = False
if start:
seeking = True
with titlesfile as f:
for line in f:
if line.strip() == '--END--':
break
elif seeking and line.strip() != start:
continue
elif seeking and line.strip() == start:
seeking = False
yield line.strip()
else:
yield line.strip()
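readTitles pairs with the '--END--' sentinel written by getPageTitles: it streams titles lazily and can seek to a resume point first, yielding the start title itself. A self-contained sketch of the same seek-then-stream pattern over an in-memory list:

def stream_from(lines, start=None):
    seeking = start is not None
    for line in lines:
        line = line.strip()
        if line == '--END--':
            break
        if seeking:
            if line != start:
                continue
            seeking = False    # the start title itself is yielded too
        yield line

# list(stream_from(['A', 'B', 'C', '--END--'], start='B')) == ['B', 'C']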
def reverse_readline(filename, buf_size=8192):
"""a generator that returns the lines of a file in reverse order"""
# Original code by srohde, abdus_salam: cc by-sa 3.0
# http://stackoverflow.com/a/23646049/718903
with open(filename) as fh:
segment = None
offset = 0
fh.seek(0, os.SEEK_END)
total_size = remaining_size = fh.tell()
while remaining_size > 0:
offset = min(total_size, offset + buf_size)
fh.seek(-offset, os.SEEK_END)
buffer = fh.read(min(remaining_size, buf_size))
remaining_size -= buf_size
lines = buffer.split('\n')
# the first line of the buffer is probably not a complete line so
# we'll save it and append it to the last line of the next buffer
# we read
if segment is not None:
# if the previous chunk starts right from the beginning of line
# do not concat the segment to the last line of new chunk
# instead, yield the segment first
if buffer[-1] != '\n':
lines[-1] += segment
else:
yield segment
segment = lines[0]
for index in range(len(lines) - 1, 0, -1):
yield lines[index]
yield segment
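Typical use of reverse_readline, as in resumePreviousDump below: peek at a file's tail without reading it from the front. The file name here is hypothetical:

lines = reverse_readline('wikidump-20150101-titles.txt')
last = lines.next()      # Python 2 generator protocol
if last == '':           # a file ending in '\n' yields an empty line first
    last = lines.next()
print 'Title list complete:', last == '--END--'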
def saveImageNames(config={}, images=[], session=None):
""" Save image list in a file, including filename, url and uploader """
@@ -1489,12 +1569,11 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
def createNewDump(config={}, other={}):
titles = []
images = []
print 'Trying to generate a new dump into a new directory...'
if config['xml']:
titles += getPageTitles(config=config, session=other['session'])
saveTitles(config=config, titles=titles)
getPageTitles(config=config, session=other['session'])
titles = readTitles(config)
generateXMLDump(config=config, titles=titles, session=other['session'])
checkXMLIntegrity(
config=config,
@@ -1513,23 +1592,21 @@ def resumePreviousDump(config={}, other={}):
def resumePreviousDump(config={}, other={}):
titles = []
images = []
print 'Resuming previous dump process...'
if config['xml']:
# load titles
lasttitle = ''
titles = readTitles(config)
try:
f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(
config=config, session=other['session']), config['date']), 'r')
raw = unicode(f.read(), 'utf-8')
titles = raw.split('\n')
lasttitle = titles[-1]
if not lasttitle: # empty line at EOF ?
lasttitle = titles[-2]
f.close()
lasttitles = reverse_readline('%s/%s-%s-titles.txt' %
( config['path'],
domain2prefix( config=config, session=other['session'] ),
config['date'])
)
lasttitle = lasttitles.next()
if lasttitle == '':
lasttitle = lasttitles.next()
except:
pass # probably file doesnot exists
pass # probably file does not exist
if lasttitle == '--END--':
# titles list is complete
print 'Title list was completed in the previous session'
@@ -1537,13 +1614,13 @@ def resumePreviousDump(config={}, other={}):
print 'Title list is incomplete. Reloading...'
# do not resume; reload to avoid inconsistencies, e.g. deleted pages
titles = getPageTitles(config=config, session=other['session'])
saveTitles(config=config, titles=titles)
getPageTitles(config=config, session=other['session'])
# checking xml dump
xmliscomplete = False
lastxmltitle = ''
lastxmltitle = None
try:
f = open(
f = reverse_readline(
'%s/%s-%s-%s.xml' %
(config['path'],
domain2prefix(
@@ -1551,27 +1628,26 @@ def resumePreviousDump(config={}, other={}):
session=other['session']),
config['date'],
config['curonly'] and 'current' or 'history'),
'r')
)
for l in f:
if re.findall('</mediawiki>', l):
if l == '</mediawiki>':
# xml dump is complete
xmliscomplete = True
break
# weird if found more than 1, but maybe
xmltitles = re.findall(r'<title>([^<]+)</title>', l)
if xmltitles:
lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
f.close()
xmltitle = re.search(r'<title>([^<]+)</title>', l)
if xmltitle:
lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
break
except:
pass # probably file doesnot exists
# removing --END-- before getXMLs
while titles and titles[-1] in ['', '--END--']:
titles = titles[:-1]
pass # probably file does not exist
if xmliscomplete:
print 'XML dump was completed in the previous session'
elif lastxmltitle:
# resuming...
print 'Resuming XML dump from "%s"' % (lastxmltitle)
titles = readTitles(config, start=lastxmltitle)
generateXMLDump(
config=config,
titles=titles,
@@ -1580,6 +1656,7 @@ def resumePreviousDump(config={}, other={}):
else:
# corrupt? only has XML header?
print 'XML is corrupt? Regenerating...'
titles = readTitles(config)
generateXMLDump(
config=config, titles=titles, session=other['session'])
