Merge branch '2015/iterators' of git://github.com/nemobis/wikiteam into nemobis-2015/iterators

Conflicts:
	requirements.txt
commit 2b78bfb795 (pull/240/head)
Federico Leva, 9 years ago

dumpgenerator.py

@@ -19,6 +19,10 @@
 # To learn more, read the documentation:
 # https://github.com/WikiTeam/wikiteam/wiki
+try:
+    from kitchen.text.converters import getwriter
+except ImportError:
+    print "Please install the kitchen module."
 import cookielib
 import cPickle
 import datetime
@@ -42,6 +46,8 @@ except ImportError:
     sys.exit(1)
 import time
 import urllib

+UTF8Writer = getwriter('utf8')
+sys.stdout = UTF8Writer(sys.stdout)

 __VERSION__ = '0.3.0-alpha'  # major, minor, micro: semver.org
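The two added lines above wrap sys.stdout so that every print of a unicode title comes out as UTF-8, even when output is piped. A minimal standalone sketch of the same idea (the example string is made up, not from the script):

    # -*- coding: utf-8 -*-
    # Minimal sketch: under Python 2, printing a unicode string to a piped
    # stdout can raise UnicodeEncodeError because the default codec is ASCII.
    # Wrapping sys.stdout with kitchen's UTF-8 writer makes print safe.
    import sys
    from kitchen.text.converters import getwriter

    sys.stdout = getwriter('utf8')(sys.stdout)
    print u'Título de ejemplo'  # encoded as UTF-8 regardless of the locale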
@@ -257,19 +263,19 @@ def getPageTitlesAPI(config={}, session=None):
             # Hack for old versions of MediaWiki API where result is dict
             if isinstance(allpages, dict):
                 allpages = allpages.values()
-            titles += [page['title']
-                       for page in allpages]
+            for page in allpages:
+                yield page['title']
+            c += len(allpages)
             if len(titles) != len(set(titles)):
                 # probably we are in a loop, server returning dupe titles, stop
                 # it
                 print 'Probably a loop, finishing'
                 titles = list(set(titles))
                 apfrom = ''
-            c += len(allpages)
             delay(config=config, session=session)
         print ' %d titles retrieved in the namespace %d' % (c, namespace)
-    return titles
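This change turns getPageTitlesAPI from a list builder into a generator: titles are yielded as each API batch arrives instead of being accumulated in memory. A minimal sketch of the two styles (generic names, not the script's code):

    # Minimal sketch of the list-to-generator refactor (illustrative names only).
    def titles_as_list(batches):
        titles = []
        for batch in batches:          # each batch is one API response
            titles += [p['title'] for p in batch]
        return titles                  # the whole namespace is held in memory

    def titles_as_generator(batches):
        for batch in batches:
            for p in batch:
                yield p['title']       # handed to the caller immediately

    # The caller can stream titles straight to disk:
    for title in titles_as_generator([[{'title': u'Main Page'}]]):
        print title.encode('utf-8')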
 def getPageTitlesScraper(config={}, session=None):
     """ """
@@ -368,13 +374,21 @@ def getPageTitles(config={}, session=None):
     elif 'index' in config and config['index']:
         titles = getPageTitlesScraper(config=config, session=session)

-    # removing dupes (e.g. in CZ appears Widget:AddThis two times (main
-    # namespace and widget namespace))
-    titles = sorted(set(titles))
-
-    print '%d page titles loaded' % (len(titles))
-    return titles
+    titlesfilename = '%s-%s-titles.txt' % (
+        domain2prefix(config=config), config['date'])
+    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'a')
+    c = 0
+    for title in titles:
+        titlesfile.write(title.encode('utf-8') + "\n")
+        c += 1
+    # TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
+    # main namespace and widget namespace.
+    # We can use sort -u in UNIX, but is it worth it?
+    titlesfile.write(u'--END--\n')
+    titlesfile.close()
+    print 'Titles saved at...', titlesfilename
+    print '%d page titles loaded' % (c)
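getPageTitles now streams each title to disk as it arrives and closes the file with a '--END--' sentinel, which is what readTitles (added further down in this commit) uses to tell a complete list from an interrupted one. A tiny round-trip sketch with made-up titles and a hypothetical file name:

    # Tiny sketch of the sentinel convention: the writer appends '--END--'
    # after the last title, the reader stops as soon as it sees it.
    titles = [u'Main Page', u'Widget:AddThis']
    with open('example-titles.txt', 'w') as f:      # hypothetical file name
        for t in titles:
            f.write(t.encode('utf-8') + '\n')
        f.write('--END--\n')

    with open('example-titles.txt', 'r') as f:
        for line in f:
            if line.strip() == '--END--':
                break                                # list is complete
            print line.strip()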
 def getImageNames(config={}, session=None):
     """ Get list of image names """
@@ -430,6 +444,7 @@ def getXMLHeader(config={}, session=None):
     header = xml.split('</mediawiki>')[0]
     if not re.match(r"\s*<mediawiki", xml):
         print 'XML export on this wiki is broken, quitting.'
+        logerror(u'XML export on this wiki is broken, quitting.')
         sys.exit()
     return header, config
@@ -492,7 +507,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
             params['curonly'] = 1
             logerror(
                 config=config,
-                text='Error while retrieving the full history of "%s". Trying to save only the last revision for this page' %
+                text=u'Error while retrieving the full history of "%s". Trying to save only the last revision for this page' %
                 (params['pages'])
             )
             return getXMLPageCore(
@@ -505,7 +520,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
             print ' Saving in the errors log, and skipping...'
             logerror(
                 config=config,
-                text='Error while retrieving the last revision of "%s". Skipping.' %
+                text=u'Error while retrieving the last revision of "%s". Skipping.' %
                 (params['pages']))
             raise ExportAbortedError(config['index'])
         return ''  # empty xml
@@ -601,7 +616,12 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
                 # offset is OK in this wiki, merge with the previous chunk
                 # of this page history and continue
                 xml2 = xml2.split("</page>")[0]
-                yield ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                try:
+                    yield ' <revision>' + ('<revision>'.join(xml2.split('<revision>')[1:]))
+                except MemoryError:
+                    print "The page's history exceeds our memory, halving limit."
+                    params['limit'] = params['limit'] / 2
+                    continue
                 xml = xml2
             numberofedits += len(re.findall(r_timestamp, xml))
         else:
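The try/except added here retries with a smaller request whenever assembling a huge page history exhausts memory. The retry pattern on its own, as a hedged sketch (fetch_chunk is a stand-in, not a function from the script):

    # Standalone sketch of the halve-on-MemoryError retry used above.
    # fetch_chunk(limit) stands in for one request of `limit` revisions.
    def fetch_all(fetch_chunk, limit=1000):
        while limit >= 1:
            try:
                return fetch_chunk(limit)    # may raise MemoryError on huge histories
            except MemoryError:
                print "A chunk of %d revisions is too large, halving the limit." % limit
                limit = limit / 2            # integer division under Python 2
        raise RuntimeError('could not fetch even a single revision')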
@@ -610,9 +630,9 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
     if verbose:
         if (numberofedits == 1):
-            print ' %s, 1 edit' % (title.encode('utf-8'))
+            print ' %s, 1 edit' % (title.strip())
         else:
-            print ' %s, %d edits' % (title.encode('utf-8'), numberofedits)
+            print ' %s, %d edits' % (title.strip(), numberofedits)


 def cleanXML(xml=''):
@@ -625,8 +645,9 @@ def cleanXML(xml=''):
     return xml


-def generateXMLDump(config={}, titles=[], start='', session=None):
+def generateXMLDump(config={}, titles=[], start=None, session=None):
     """ Generates a XML dump for a list of titles """
+    # TODO: titles is now unused.

     print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')

     header, config = getXMLHeader(config=config, session=session)
@@ -637,32 +658,9 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
     xmlfile = ''
     lock = True
     if start:
-        # remove the last chunk of xml dump (it is probably incomplete)
-        xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'r')
-        xmlfile2 = open('%s/%s2' % (config['path'], xmlfilename), 'w')
-        prev = ''
-        c = 0
-        for l in xmlfile:
-            # removing <page>\n until end of file
-            # lock to avoid write an empty line at the begining of file
-            if c != 0:
-                if not re.search(r'<title>%s</title>' % (start), l):
-                    xmlfile2.write(prev)
-                else:
-                    break
-            c += 1
-            prev = l
-        xmlfile.close()
-        xmlfile2.close()
-        # subst xml with xml2
-        # remove previous xml dump
-        os.remove('%s/%s' % (config['path'], xmlfilename))
-        # move correctly truncated dump to its real name
-        os.rename(
-            '%s/%s2' %
-            (config['path'], xmlfilename), '%s/%s' %
-            (config['path'], xmlfilename)
-        )
+        print "Removing the last chunk of past XML dump: it is probably incomplete."
+        for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
+            pass
     else:
         # requested complete xml dump
         lock = False
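Instead of copying the dump line by line into a second file, the resume path now relies on reverse_readline(..., truncate=True) (added further down) to cut the file in place right after the last complete page. The core idea, as a simplified standalone sketch (it reads the whole file at once, unlike the buffered original):

    # Simplified sketch of in-place truncation after the last complete record:
    # keep everything up to and including the final '</page>' and drop the rest.
    def truncate_after_last_page(path):
        with open(path, 'r+') as fh:
            data = fh.read()                  # fine for a sketch; the real code
            cut = data.rfind('</page>')       # scans backwards in fixed buffers
            if cut != -1:
                fh.seek(cut + len('</page>'))
                fh.write('\n')
                fh.truncate()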
@@ -672,7 +670,7 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
     xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
     c = 1
-    for title in titles:
+    for title in readTitles(config, start):
         if not title.strip():
             continue
         if title == start:  # start downloading from start, included
@@ -690,7 +688,7 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
             logerror(
                 config=config,
                 text=u'The page "%s" was missing in the wiki (probably deleted)' %
-                (title)
+                (title.decode('utf-8'))
             )
         # here, XML is a correct <page> </page> chunk or
         # an empty string due to a deleted page (logged in errors log) or
@@ -701,19 +699,71 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
     xmlfile.close()
     print 'XML dump saved at...', xmlfilename


-def saveTitles(config={}, titles=[]):
-    """ Save title list in a file """
+def readTitles(config={}, start=None):
+    """ Read title list from a file, from the title "start" """

     titlesfilename = '%s-%s-titles.txt' % (
         domain2prefix(config=config), config['date'])
-    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'w')
-    output = u"%s\n--END--" % ('\n'.join(titles))
-    titlesfile.write(output.encode('utf-8'))
-    titlesfile.close()
-    print 'Titles saved at...', titlesfilename
+    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')
+
+    seeking = False
+    if start:
+        seeking = True
+
+    with titlesfile as f:
+        for line in f:
+            if line.strip() == '--END--':
+                break
+            elif seeking and line.strip() != start:
+                continue
+            elif seeking and line.strip() == start:
+                seeking = False
+                yield line.strip()
+            else:
+                yield line.strip()
+
+def reverse_readline(filename, buf_size=8192, truncate=False):
+    """a generator that returns the lines of a file in reverse order"""
+    # Original code by srohde, abdus_salam: cc by-sa 3.0
+    # http://stackoverflow.com/a/23646049/718903
+    with open(filename, 'r+') as fh:
+        segment = None
+        offset = 0
+        fh.seek(0, os.SEEK_END)
+        total_size = remaining_size = fh.tell()
+        while remaining_size > 0:
+            offset = min(total_size, offset + buf_size)
+            fh.seek(-offset, os.SEEK_END)
+            buffer = fh.read(min(remaining_size, buf_size))
+            remaining_size -= buf_size
+            lines = buffer.split('\n')
+            # the first line of the buffer is probably not a complete line so
+            # we'll save it and append it to the last line of the next buffer
+            # we read
+            if segment is not None:
+                # if the previous chunk starts right from the beginning of line
+                # do not concat the segment to the last line of new chunk
+                # instead, yield the segment first
+                if buffer[-1] != '\n':
+                    lines[-1] += segment
+                else:
+                    if truncate and '</page>' in segment:
+                        pages = buffer.split('</page>')
+                        fh.seek(-offset + buf_size - len(pages[-1]), os.SEEK_END)
+                        fh.truncate()
+                        raise StopIteration
+                    else:
+                        yield segment
+            segment = lines[0]
+            for index in range(len(lines) - 1, 0, -1):
+                if truncate and '</page>' in segment:
+                    pages = buffer.split('</page>')
+                    fh.seek(-offset - len(pages[-1]), os.SEEK_END)
+                    fh.truncate()
+                    raise StopIteration
+                else:
+                    yield lines[index]
+        yield segment
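Together, these two generators drive resumption: reverse_readline scans an existing dump from the end to find the last page that was fully written, and readTitles then replays the title list from that point. A rough usage sketch ('dump.xml' is a placeholder path and config is assumed to be the script's usual config dict):

    # Rough usage sketch, not code from the commit.
    import re

    last_title = None
    for line in reverse_readline('dump.xml'):
        match = re.search(r'<title>([^<]+)</title>', line)
        if match:
            last_title = match.group(1)    # last page fully written to disk
            break

    for title in readTitles(config, start=last_title):
        print title                        # pages still left to download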
 def saveImageNames(config={}, images=[], session=None):
     """ Save image list in a file, including filename, url and uploader """
@@ -1033,7 +1083,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
             xmlfiledesc = ''
             logerror(
                 config=config,
-                text=u'The page "%s" was missing in the wiki (probably deleted)' % (title)
+                text=u'The page "%s" was missing in the wiki (probably deleted)' % (title.decode('utf-8'))
             )

         f = open('%s/%s.desc' % (imagepath, filename2), 'w')
@@ -1539,12 +1589,11 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
 def createNewDump(config={}, other={}):
-    titles = []
     images = []
     print 'Trying generating a new dump into a new directory...'
     if config['xml']:
-        titles += getPageTitles(config=config, session=other['session'])
-        saveTitles(config=config, titles=titles)
+        getPageTitles(config=config, session=other['session'])
+        titles = readTitles(config)
         generateXMLDump(config=config, titles=titles, session=other['session'])
         checkXMLIntegrity(
             config=config,
@@ -1563,23 +1612,21 @@ def createNewDump(config={}, other={}):
 def resumePreviousDump(config={}, other={}):
-    titles = []
     images = []
     print 'Resuming previous dump process...'
     if config['xml']:
-        # load titles
-        lasttitle = ''
+        titles = readTitles(config)
         try:
-            f = open('%s/%s-%s-titles.txt' % (config['path'], domain2prefix(
-                config=config, session=other['session']), config['date']), 'r')
-            raw = unicode(f.read(), 'utf-8')
-            titles = raw.split('\n')
-            lasttitle = titles[-1]
-            if not lasttitle:  # empty line at EOF ?
-                lasttitle = titles[-2]
-            f.close()
+            lasttitles = reverse_readline('%s/%s-%s-titles.txt' %
+                (config['path'],
+                 domain2prefix(config=config, session=other['session']),
+                 config['date'])
+                )
+            lasttitle = lasttitles.next()
+            if lasttitle == '':
+                lasttitle = lasttitles.next()
         except:
-            pass  # probably file doesnot exists
+            pass  # probably file does not exists
         if lasttitle == '--END--':
             # titles list is complete
             print 'Title list was completed in the previous session'
@@ -1587,13 +1634,13 @@ def resumePreviousDump(config={}, other={}):
             print 'Title list is incomplete. Reloading...'
             # do not resume, reload, to avoid inconsistences, deleted pages or
             # so
-            titles = getPageTitles(config=config, session=other['session'])
-            saveTitles(config=config, titles=titles)
+            getPageTitles(config=config, session=other['session'])
         # checking xml dump
         xmliscomplete = False
-        lastxmltitle = ''
+        lastxmltitle = None
         try:
-            f = open(
+            f = reverse_readline(
                 '%s/%s-%s-%s.xml' %
                 (config['path'],
                  domain2prefix(
@@ -1601,27 +1648,26 @@ def resumePreviousDump(config={}, other={}):
                     session=other['session']),
                  config['date'],
                  config['curonly'] and 'current' or 'history'),
-                'r')
+                )
             for l in f:
-                if re.findall('</mediawiki>', l):
+                if l == '</mediawiki>':
                     # xml dump is complete
                     xmliscomplete = True
                     break
-                # weird if found more than 1, but maybe
-                xmltitles = re.findall(r'<title>([^<]+)</title>', l)
-                if xmltitles:
-                    lastxmltitle = undoHTMLEntities(text=xmltitles[-1])
-            f.close()
+
+                xmltitle = re.search(r'<title>([^<]+)</title>', l)
+                if xmltitle:
+                    lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
+                    break
         except:
-            pass  # probably file doesnot exists
-        # removing --END-- before getXMLs
-        while titles and titles[-1] in ['', '--END--']:
-            titles = titles[:-1]
+            pass  # probably file does not exists
+
         if xmliscomplete:
             print 'XML dump was completed in the previous session'
         elif lastxmltitle:
             # resuming...
             print 'Resuming XML dump from "%s"' % (lastxmltitle)
+            titles = readTitles(config, start=lastxmltitle)
             generateXMLDump(
                 config=config,
                 titles=titles,
@@ -1630,6 +1676,7 @@ def resumePreviousDump(config={}, other={}):
         else:
             # corrupt? only has XML header?
             print 'XML is corrupt? Regenerating...'
+            titles = readTitles(config)
             generateXMLDump(
                 config=config, titles=titles, session=other['session'])

requirements.txt

@@ -1,3 +1,4 @@
 argparse>=1.2.1
 requests>=2.3.0
 internetarchive
+kitchen
