@ -45,6 +45,8 @@ import urllib
# Version of this script as major.minor.micro plus an optional pre-release
# tag, following semver.org. Exposed to callers through getVersion().
__VERSION__ = '0.3.0-alpha'
class PageMissingError(Exception):
    """Signal that a requested page produced no XML from the wiki.

    NOTE(review): in the parts of this file visible here, getXMLPage raises
    it when the export request comes back empty, and getXMLHeader and
    generateXMLDump catch it -- confirm against the rest of the file.
    """
def getVersion():
    """Return the version string stored in the module-level __VERSION__."""
    return __VERSION__
@ -387,10 +389,10 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x....
# xmlns:x....
randomtitle = ' Main_Page ' # previously AMF5LKE43MNFGHKSDMRTJ
randomtitle = ' Main_Page ' # previously AMF5LKE43MNFGHKSDMRTJ
xml = getXMLPage (
try :
config= config , title = randomtitle , verbose = False , session = session )
xml = " " . join ( [ x for x in getXMLPage ( config= config , title = randomtitle , verbose = False , session = session ) ] )
header = xml . split ( ' </mediawiki> ' ) [ 0 ]
header = xml . split ( ' </mediawiki> ' ) [ 0 ]
if not xml :
except PageMissingError :
print ' XML export on this wiki is broken, quitting. '
print ' XML export on this wiki is broken, quitting. '
sys . exit ( )
sys . exit ( )
return header
return header
@ -399,12 +401,7 @@ def getXMLHeader(config={}, session=None):
def getXMLFileDesc(config={}, title='', session=None):
    """ Get XML for image description page

    config  -- dump configuration dict; it is read but never modified here
    title   -- title of the description page to export
    session -- session object handed through to getXMLPage unchanged

    Returns the chunks produced by getXMLPage joined into a single string.
    Any exception raised by getXMLPage while it is being consumed propagates
    to the caller.
    """
    # Work on a copy: writing 'curonly' into the caller's dict (or into the
    # shared default dict) would leak the setting into every later call.
    desc_config = dict(config)
    desc_config['curonly'] = 1  # tricky to get only the most recent desc

    # getXMLPage is consumed as an iterable of text chunks; join() drains it
    # directly, so no intermediate list is needed.
    return ''.join(
        getXMLPage(
            config=desc_config,
            title=title,
            verbose=False,
            session=session,
        )
    )
def getUserAgent ( ) :
def getUserAgent ( ) :
@ -510,10 +507,18 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
params [ ' templates ' ] = 1
params [ ' templates ' ] = 1
xml = getXMLPageCore ( params = params , config = config , session = session )
xml = getXMLPageCore ( params = params , config = config , session = session )
if not xml :
raise PageMissingError
yield xml . split ( " </page> " ) [ 0 ]
# if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
# if complete history, check if this page history has > limit edits, if so, retrieve all using offset if available
# else, warning about Special:Export truncating large page histories
# else, warning about Special:Export truncating large page histories
r_timestamp = r ' <timestamp>([^<]+)</timestamp> '
r_timestamp = r ' <timestamp>([^<]+)</timestamp> '
numberofedits = 0
numberofedits + = len ( re . findall ( r_timestamp , xml ) )
# search for timestamps in xml to avoid analysing empty pages like
# search for timestamps in xml to avoid analysing empty pages like
# Special:Allpages and the random one
# Special:Allpages and the random one
if not config [ ' curonly ' ] and re . search ( r_timestamp , xml ) :
if not config [ ' curonly ' ] and re . search ( r_timestamp , xml ) :
@ -546,26 +551,27 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
"""
"""
# offset is OK in this wiki, merge with the previous chunk
# offset is OK in this wiki, merge with the previous chunk
# of this page history and continue
# of this page history and continue
xml = xml . split (
xml2 = xml2 . split ( " </page> " ) [ 0 ]
' </page> ' ) [ 0 ] + ' <revision> ' + ( ' <revision> ' . join ( xml2 . split ( ' <revision> ' ) [ 1 : ] ) )
yield ' <revision> ' + ( ' <revision> ' . join ( xml2 . split ( ' <revision> ' ) [ 1 : ] ) )
xml = xml2
numberofedits + = len ( re . findall ( r_timestamp , xml ) )
else :
else :
params [ ' offset ' ] = ' ' # no more edits in this page history
params [ ' offset ' ] = ' ' # no more edits in this page history
yield " </page> \n "
if verbose :
if verbose :
numberofedits = len ( re . findall ( r_timestamp , xml ) )
if ( numberofedits == 1 ) :
if ( numberofedits == 1 ) :
print ' %s , 1 edit ' % ( title . encode ( ' utf-8 ' ) )
print ' %s , 1 edit ' % ( title . encode ( ' utf-8 ' ) )
else :
else :
print ' %s , %d edits ' % ( title . encode ( ' utf-8 ' ) , numberofedits )
print ' %s , %d edits ' % ( title . encode ( ' utf-8 ' ) , numberofedits )
return xml
def cleanXML(xml=''):
    """ Trim redundant info

    Strips the export wrapper from an XML chunk:
      * if the chunk contains '</siteinfo>' followed by a newline, only the
        text between the first and second occurrence (or the end of the
        chunk) is kept;
      * if the result contains '</mediawiki>', only the text before its
        first occurrence is kept.

    The two checks are independent, so a partial chunk that carries only one
    of the markers is still trimmed. Text with neither marker is returned
    unchanged.
    """
    # Do not touch the XML encoding; apart from trimming, leave the text AS IS.
    siteinfo_end = '</siteinfo>\n'
    mediawiki_end = '</mediawiki>'

    if siteinfo_end in xml:
        xml = xml.split(siteinfo_end)[1]
    if mediawiki_end in xml:
        xml = xml.split(mediawiki_end)[0]
    return xml
@ -627,9 +633,11 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
delay ( config = config , session = session )
delay ( config = config , session = session )
if c % 10 == 0 :
if c % 10 == 0 :
print ' Downloaded %d pages ' % ( c )
print ' Downloaded %d pages ' % ( c )
xml = getXMLPage ( config = config , title = title , session = session )
try :
xml = cleanXML ( xml = xml )
for xml in getXMLPage ( config = config , title = title , session = session ) :
if not xml :
xml = cleanXML ( xml = xml )
xmlfile . write ( xml . encode ( ' utf-8 ' ) )
except PageMissingError :
logerror (
logerror (
config = config ,
config = config ,
text = u ' The page " %s " was missing in the wiki (probably deleted) ' %
text = u ' The page " %s " was missing in the wiki (probably deleted) ' %
@ -639,7 +647,6 @@ def generateXMLDump(config={}, titles=[], start='', session=None):
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to a deleted page (logged in errors log) or
# an empty string due to an error while retrieving the page from server
# an empty string due to an error while retrieving the page from server
# (logged in errors log)
# (logged in errors log)
xmlfile . write ( xml . encode ( ' utf-8 ' ) )
c + = 1
c + = 1
xmlfile . write ( footer )
xmlfile . write ( footer )
xmlfile . close ( )
xmlfile . close ( )