@ -257,19 +257,19 @@ def getPageTitlesAPI(config={}, session=None):
# Hack for old versions of MediaWiki API where result is dict
# Hack for old versions of MediaWiki API where result is dict
if isinstance ( allpages , dict ) :
if isinstance ( allpages , dict ) :
allpages = allpages . values ( )
allpages = allpages . values ( )
titles + = [ page [ ' title ' ]
for page in allpages :
for page in allpages ]
yield page [ ' title ' ]
c + = len ( allpages )
if len ( titles ) != len ( set ( titles ) ) :
if len ( titles ) != len ( set ( titles ) ) :
# probably we are in a loop, server returning dupe titles, stop
# probably we are in a loop, server returning dupe titles, stop
# it
# it
print ' Probably a loop, finishing '
print ' Probably a loop, finishing '
titles = list ( set ( titles ) )
titles = list ( set ( titles ) )
apfrom = ' '
apfrom = ' '
c + = len ( allpages )
delay ( config = config , session = session )
delay ( config = config , session = session )
print ' %d titles retrieved in the namespace %d ' % ( c , namespace )
print ' %d titles retrieved in the namespace %d ' % ( c , namespace )
return titles
def getPageTitlesScraper ( config = { } , session = None ) :
def getPageTitlesScraper ( config = { } , session = None ) :
""" """
""" """
@ -368,13 +368,21 @@ def getPageTitles(config={}, session=None):
elif ' index ' in config and config [ ' index ' ] :
elif ' index ' in config and config [ ' index ' ] :
titles = getPageTitlesScraper ( config = config , session = session )
titles = getPageTitlesScraper ( config = config , session = session )
# removing dupes (e.g. in CZ appears Widget:AddThis two times (main
titlesfilename = ' %s - %s -titles.txt ' % (
# namespace and widget namespace))
domain2prefix ( config = config ) , config [ ' date ' ] )
titles = sorted ( set ( titles ) )
titlesfile = open ( ' %s / %s ' % ( config [ ' path ' ] , titlesfilename ) , ' a ' )
c = 0
print ' %d page titles loaded ' % ( len ( titles ) )
for title in titles :
return titles
titlesfile . write ( title . encode ( ' utf-8 ' ) + " \n " )
c + = 1
# TODO: Sort to remove dupes? In CZ, Widget:AddThis appears two times:
# main namespace and widget namespace.
# We can use sort -u in UNIX, but is it worth it?
titlesfile . write ( u ' --END-- \n ' )
titlesfile . close ( )
print ' Titles saved at... ' , titlesfilename
print ' %d page titles loaded ' % ( c )
def getImageNames ( config = { } , session = None ) :
def getImageNames ( config = { } , session = None ) :
""" Get list of image names """
""" Get list of image names """
@ -610,9 +618,9 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
if verbose :
if verbose :
if ( numberofedits == 1 ) :
if ( numberofedits == 1 ) :
print ' %s , 1 edit ' % ( title . encode ( ' utf-8 ' ) )
print ' %s , 1 edit ' % ( title . encode ( ' utf-8 ' ) . strip ( ) )
else :
else :
print ' %s , %d edits ' % ( title . encode ( ' utf-8 ' ) , numberofedits )
print ' %s , %d edits ' % ( title . encode ( ' utf-8 ' ) . strip ( ) , numberofedits )
def cleanXML ( xml = ' ' ) :
def cleanXML ( xml = ' ' ) :
@ -625,8 +633,9 @@ def cleanXML(xml=''):
return xml
return xml
def generateXMLDump ( config = { } , titles = [ ] , start = ' ' , session = None ) :
def generateXMLDump ( config = { } , titles = [ ] , start = None , session = None ) :
""" Generates a XML dump for a list of titles """
""" Generates a XML dump for a list of titles """
# TODO: titles is now unused.
print ' Retrieving the XML for every page from " %s " ' % ( start and start or ' start ' )
print ' Retrieving the XML for every page from " %s " ' % ( start and start or ' start ' )
header , config = getXMLHeader ( config = config , session = session )
header , config = getXMLHeader ( config = config , session = session )
@ -637,32 +646,21 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile = ' '
xmlfile = ' '
lock = True
lock = True
if start :
if start :
# remove the last chunk of xml dump (it is probably incomplete)
print " Removing the last chunk of past XML dump: it is probably incomplete "
xmlfile = open ( ' %s / %s ' % ( config [ ' path ' ] , xmlfilename ) , ' r ' )
xmlfile = reverse_readline ( ' %s / %s ' % ( config [ ' path ' ] , xmlfilename ) )
xmlfile2 = open ( ' %s / %s 2 ' % ( config [ ' path ' ] , xmlfilename ) , ' w ' )
prev = ' '
c = 0
c = 0
for l in xmlfile :
for l in xmlfile :
# removing <page>\n until end of file
# lock to avoid write an empty line at the begining of file
if c != 0 :
if not re . search ( r ' <title> %s </title> ' % ( start ) , l ) :
xmlfile2 . write ( prev )
else :
break
c + = 1
c + = 1
prev = l
if re . search ( r ' <title> %s </title> ' % ( start ) , l ) :
xmlfile . close ( )
# Done searching. We try to truncate the file at this point:
xmlfile2 . close ( )
# everything should be removed from the line before <title>,
# subst xml with xml2
# that is the last c+1 lines AKA lines from EOF - c to EOF.
# remove previous xml dump
# TODO: do something for users without GNU ed; replace os.
os . remove ( ' %s / %s ' % ( config [ ' path ' ] , xmlfilename ) )
# Try file.seek and file.truncate in the generator again?
# move correctly truncated dump to its real name
os . system ( " (echo ' $- %d ,$d ' ; echo wq ) | ed %s / %s " \
os . rename (
% ( c , config [ ' path ' ] , xmlfilename ) )
' %s / %s 2 ' %
print " Last %d lines removed. " % ( c + 1 )
( config [ ' path ' ] , xmlfilename ) , ' %s / %s ' %
break
( config [ ' path ' ] , xmlfilename )
)
else :
else :
# requested complete xml dump
# requested complete xml dump
lock = False
lock = False
@ -672,7 +670,7 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile = open ( ' %s / %s ' % ( config [ ' path ' ] , xmlfilename ) , ' a ' )
xmlfile = open ( ' %s / %s ' % ( config [ ' path ' ] , xmlfilename ) , ' a ' )
c = 1
c = 1
for title in titles :
for title in readTitles( config , start ) :
if not title . strip ( ) :
if not title . strip ( ) :
continue
continue
if title == start : # start downloading from start, included
if title == start : # start downloading from start, included
@ -701,19 +699,59 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
xmlfile . close ( )
xmlfile . close ( )
print ' XML dump saved at... ' , xmlfilename
print ' XML dump saved at... ' , xmlfilename
def readTitles(config={}, start=None):
    """ Read title list from a file, from the title "start" """
    # Titles were saved one per line, terminated by a --END-- marker.
    titlesfilename = '%s-%s-titles.txt' % (
        domain2prefix(config=config), config['date'])
    titlesfile = open('%s/%s' % (config['path'], titlesfilename), 'r')

    # When a start title is given, skip everything before it (the start
    # title itself is included in the output).
    skipping = bool(start)
    with titlesfile as stream:
        for raw in stream:
            title = raw.strip()
            if title == '--END--':
                # End-of-list sentinel written by the title dumper.
                break
            if skipping:
                if title != start:
                    continue
                skipping = False
            yield title
def reverse_readline(filename, buf_size=8192):
    """A generator that returns the lines of a file in reverse order.

    The file is read in chunks of *buf_size* bytes starting from the end.
    Lines are yielded without their trailing newline; a file ending in a
    newline therefore yields an empty line first (callers rely on this and
    skip it). An empty file yields nothing.
    """
    # Original code by srohde, abdus_salam: cc by-sa 3.0
    # http://stackoverflow.com/a/23646049/718903
    # Binary mode: end-relative seeks are only portable on binary streams,
    # and under Python 2 (this script's target) bytes is str, so the
    # yielded values are unchanged.
    with open(filename, 'rb') as fh:
        segment = None
        offset = 0
        fh.seek(0, os.SEEK_END)
        total_size = remaining_size = fh.tell()
        while remaining_size > 0:
            offset = min(total_size, offset + buf_size)
            fh.seek(-offset, os.SEEK_END)
            chunk = fh.read(min(remaining_size, buf_size))
            remaining_size -= buf_size
            lines = chunk.split(b'\n')
            # The first line of the chunk is probably not a complete line,
            # so save it and glue it onto the last line of the next
            # (earlier) chunk we read.
            if segment is not None:
                # If this chunk ends exactly on a line boundary, the saved
                # segment is already a whole line: yield it on its own
                # instead of concatenating.
                # (Bugfix: the original compared with `is not`, an identity
                # test that only worked via CPython string interning.)
                if chunk[-1:] != b'\n':
                    lines[-1] += segment
                else:
                    yield segment
            segment = lines[0]
            for index in range(len(lines) - 1, 0, -1):
                yield lines[index]
        # Bugfix: guard the final yield so an empty file yields nothing
        # instead of a spurious None.
        if segment is not None:
            yield segment
def saveImageNames ( config = { } , images = [ ] , session = None ) :
def saveImageNames ( config = { } , images = [ ] , session = None ) :
""" Save image list in a file, including filename, url and uploader """
""" Save image list in a file, including filename, url and uploader """
@ -1525,12 +1563,11 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
def createNewDump ( config = { } , other = { } ) :
def createNewDump ( config = { } , other = { } ) :
titles = [ ]
images = [ ]
images = [ ]
print ' Trying generating a new dump into a new directory... '
print ' Trying generating a new dump into a new directory... '
if config [ ' xml ' ] :
if config [ ' xml ' ] :
titles + = getPageTitles( config = config , session = other [ ' session ' ] )
getPageTitles( config = config , session = other [ ' session ' ] )
saveTitles( config = config , titles = titles )
titles= readTitles ( config )
generateXMLDump ( config = config , titles = titles , session = other [ ' session ' ] )
generateXMLDump ( config = config , titles = titles , session = other [ ' session ' ] )
checkXMLIntegrity (
checkXMLIntegrity (
config = config ,
config = config ,
@ -1549,23 +1586,21 @@ def createNewDump(config={}, other={}):
def resumePreviousDump ( config = { } , other = { } ) :
def resumePreviousDump ( config = { } , other = { } ) :
titles = [ ]
images = [ ]
images = [ ]
print ' Resuming previous dump process... '
print ' Resuming previous dump process... '
if config [ ' xml ' ] :
if config [ ' xml ' ] :
# load titles
titles = readTitles ( config )
lasttitle = ' '
try :
try :
f = open ( ' %s / %s - %s -titles.txt ' % ( config [ ' path ' ] , domain2prefix (
lasttitles = reverse_readline ( ' %s / %s - %s -titles.txt ' %
config = config , session = other [ ' session ' ] ) , config [ ' date ' ] ) , ' r ' )
( config [ ' path ' ] ,
raw = unicode ( f . read ( ) , ' utf-8 ' )
domain2prefix ( config = config , session = other [ ' session ' ] ) ,
titles = raw . split ( ' \n ' )
config [ ' date ' ] )
lasttitle = titles [ - 1 ]
)
if not lasttitle : # empty line at EOF ?
lasttitle = lasttitles . next ( )
lasttitle = titles [ - 2 ]
if lasttitle == ' ' :
f . close ( )
lasttitle = lasttitles . next ( )
except :
except :
pass # probably file does not exists
pass # probably file does not exists
if lasttitle == ' --END-- ' :
if lasttitle == ' --END-- ' :
# titles list is complete
# titles list is complete
print ' Title list was completed in the previous session '
print ' Title list was completed in the previous session '
@ -1573,13 +1608,13 @@ def resumePreviousDump(config={}, other={}):
print ' Title list is incomplete. Reloading... '
print ' Title list is incomplete. Reloading... '
# do not resume, reload, to avoid inconsistences, deleted pages or
# do not resume, reload, to avoid inconsistences, deleted pages or
# so
# so
titles = getPageTitles( config = config , session = other [ ' session ' ] )
getPageTitles( config = config , session = other [ ' session ' ] )
saveTitles ( config = config , titles = titles )
# checking xml dump
# checking xml dump
xmliscomplete = False
xmliscomplete = False
lastxmltitle = ' '
lastxmltitle = None
try :
try :
f = open (
f = reverse_readline (
' %s / %s - %s - %s .xml ' %
' %s / %s - %s - %s .xml ' %
( config [ ' path ' ] ,
( config [ ' path ' ] ,
domain2prefix (
domain2prefix (
@ -1587,27 +1622,26 @@ def resumePreviousDump(config={}, other={}):
session = other [ ' session ' ] ) ,
session = other [ ' session ' ] ) ,
config [ ' date ' ] ,
config [ ' date ' ] ,
config [ ' curonly ' ] and ' current ' or ' history ' ) ,
config [ ' curonly ' ] and ' current ' or ' history ' ) ,
' r ' )
)
for l in f :
for l in f :
if re. findall ( ' </mediawiki> ' , l ) :
if l == ' </mediawiki> ' :
# xml dump is complete
# xml dump is complete
xmliscomplete = True
xmliscomplete = True
break
break
# weird if found more than 1, but maybe
xmltitle s = re . findall ( r ' <title>([^<]+)</title> ' , l )
xmltitle = re . search ( r ' <title>([^<]+)</title> ' , l )
if xmltitle s :
if xmltitle :
lastxmltitle = undoHTMLEntities ( text = xmltitle s[ - 1 ] )
lastxmltitle = undoHTMLEntities ( text = xmltitle . group ( 1 ) )
f . close ( )
break
except :
except :
pass # probably file doesnot exists
pass # probably file does not exists
# removing --END-- before getXMLs
while titles and titles [ - 1 ] in [ ' ' , ' --END-- ' ] :
titles = titles [ : - 1 ]
if xmliscomplete :
if xmliscomplete :
print ' XML dump was completed in the previous session '
print ' XML dump was completed in the previous session '
elif lastxmltitle :
elif lastxmltitle :
# resuming...
# resuming...
print ' Resuming XML dump from " %s " ' % ( lastxmltitle )
print ' Resuming XML dump from " %s " ' % ( lastxmltitle )
titles = readTitles ( config , start = lastxmltitle )
generateXMLDump (
generateXMLDump (
config = config ,
config = config ,
titles = titles ,
titles = titles ,
@ -1616,6 +1650,7 @@ def resumePreviousDump(config={}, other={}):
else :
else :
# corrupt? only has XML header?
# corrupt? only has XML header?
print ' XML is corrupt? Regenerating... '
print ' XML is corrupt? Regenerating... '
titles = readTitles ( config )
generateXMLDump (
generateXMLDump (
config = config , titles = titles , session = other [ ' session ' ] )
config = config , titles = titles , session = other [ ' session ' ] )