@ -170,7 +170,7 @@ def getNamespacesScraper(config={}, session=None):
# [^>]*? to include selected="selected"
# [^>]*? to include selected="selected"
m = re . compile (
m = re . compile (
r ' <option [^>]*?value= " (?P<namespaceid>\ d+) " [^>]*?>(?P<namespacename>[^<]+)</option>' ) . finditer ( raw )
r ' <option [^>]*?value= [\' " ] (?P<namespaceid>\ d+) [\' " ] [^>]*?>(?P<namespacename>[^<]+)</option>' ) . finditer ( raw )
if ' all ' in namespaces :
if ' all ' in namespaces :
namespaces = [ ]
namespaces = [ ]
for i in m :
for i in m :
@ -287,9 +287,9 @@ def getPageTitlesScraper(config={}, session=None):
r_title = r ' title= " (?P<title>[^>]+) " > '
r_title = r ' title= " (?P<title>[^>]+) " > '
r_suballpages = ' '
r_suballpages = ' '
r_suballpages1 = r ' &from=(?P<from>[^> ]+)&to=(?P<to>[^>]+)" > '
r_suballpages1 = r ' &from=(?P<from>[^> " ]+)&to=(?P<to>[^>" ]+)" > '
r_suballpages2 = r ' Special:Allpages/(?P<from>[^> ]+)" > '
r_suballpages2 = r ' Special:Allpages/(?P<from>[^> " ]+)" > '
r_suballpages3 = r ' &from=(?P<from>[^> ]+)" title= " [^>]+ " > '
r_suballpages3 = r ' &from=(?P<from>[^> " ]+)" title= " [^>]+ " > '
if re . search ( r_suballpages1 , raw ) :
if re . search ( r_suballpages1 , raw ) :
r_suballpages = r_suballpages1
r_suballpages = r_suballpages1
elif re . search ( r_suballpages2 , raw ) :
elif re . search ( r_suballpages2 , raw ) :
@ -299,7 +299,7 @@ def getPageTitlesScraper(config={}, session=None):
else :
else :
pass # perhaps no subpages
pass # perhaps no subpages
# Should be enough t subpages on Special:Allpages
# Should be enough subpages on Special:Allpages
deep = 50
deep = 50
c = 0
c = 0
oldfr = ' '
oldfr = ' '
@ -321,8 +321,8 @@ def getPageTitlesScraper(config={}, session=None):
name = ' %s - %s ' % ( fr , to )
name = ' %s - %s ' % ( fr , to )
url = ' %s ?title=Special:Allpages&namespace= %s &from= %s &to= %s ' % (
url = ' %s ?title=Special:Allpages&namespace= %s &from= %s &to= %s ' % (
config [ ' index ' ] , namespace , fr , to ) # do not put urllib.quote in fr or to
config [ ' index ' ] , namespace , fr , to ) # do not put urllib.quote in fr or to
# fix, esta regexp no carga bien todas? o falla el r_title en
# fix, this regexp doesn't properly load everything? or does r_title fail on this
# este tipo de subpag ? (wikiindex)
# type of subpage ? (wikiindex)
elif r_suballpages == r_suballpages2 :
elif r_suballpages == r_suballpages2 :
# clean &namespace=\d, sometimes happens
# clean &namespace=\d, sometimes happens
fr = fr . split ( ' &namespace= ' ) [ 0 ]
fr = fr . split ( ' &namespace= ' ) [ 0 ]
@ -1519,7 +1519,7 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
title = u ' Image: %s ' % ( filename )
title = u ' Image: %s ' % ( filename )
try :
try :
if config [ ' xmlrevisions ' ] and config [ ' api ' ] and config [ ' api ' ] . endswith ( " api.php " ) :
if config [ ' xmlrevisions ' ] and config [ ' api ' ] and config [ ' api ' ] . endswith ( " api.php " ) :
r = session . get ( config [ ' api ' ] + u " ?action=query&export&exportnowrap&titles= %s " % title)
r = session . get ( config [ ' api ' ] + u " ?action=query&export&exportnowrap&titles= %s " % urllib. parse . quote ( title) )
xmlfiledesc = r . text
xmlfiledesc = r . text
else :
else :
xmlfiledesc = getXMLFileDesc (
xmlfiledesc = getXMLFileDesc (
@ -2250,7 +2250,7 @@ def resumePreviousDump(config={}, other={}):
listdir = os . listdir ( ' %s /images ' % ( config [ ' path ' ] ) )
listdir = os . listdir ( ' %s /images ' % ( config [ ' path ' ] ) )
except :
except :
pass # probably directory does not exist
pass # probably directory does not exist
listdir . sort ( )
listdir = set ( listdir )
complete = True
complete = True
lastfilename = ' '
lastfilename = ' '
lastfilename2 = ' '
lastfilename2 = ' '
@ -2528,7 +2528,7 @@ def main(params=[]):
print ' \n Warning!: " %s " path exists ' % ( config [ ' path ' ] )
print ' \n Warning!: " %s " path exists ' % ( config [ ' path ' ] )
reply = ' '
reply = ' '
if config [ ' failfast ' ] :
if config [ ' failfast ' ] :
re tr y = ' yes '
re pl y = ' yes '
while reply . lower ( ) not in [ ' yes ' , ' y ' , ' no ' , ' n ' ] :
while reply . lower ( ) not in [ ' yes ' , ' y ' , ' no ' , ' n ' ] :
reply = raw_input (
reply = raw_input (
' There is a dump in " %s " , probably incomplete. \n If you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored \n and the parameters available in " %s / %s " will be loaded. \n Do you want to resume ([yes, y], [no, n])? ' %
' There is a dump in " %s " , probably incomplete. \n If you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored \n and the parameters available in " %s / %s " will be loaded. \n Do you want to resume ([yes, y], [no, n])? ' %