|
|
@ -453,7 +453,8 @@ def cleanXML(xml=''):
|
|
|
|
return xml
|
|
|
|
return xml
|
|
|
|
|
|
|
|
|
|
|
|
def generateXMLDump(config={}, titles=[], start=''):
|
|
|
|
def generateXMLDump(config={}, titles=[], start=''):
|
|
|
|
""" """
|
|
|
|
""" Generates a XML dump for a list of titles """
|
|
|
|
|
|
|
|
|
|
|
|
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
|
|
|
|
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
|
|
|
|
header = getXMLHeader(config=config)
|
|
|
|
header = getXMLHeader(config=config)
|
|
|
|
footer = '</mediawiki>\n' #new line at the end
|
|
|
|
footer = '</mediawiki>\n' #new line at the end
|
|
|
@ -514,26 +515,29 @@ def generateXMLDump(config={}, titles=[], start=''):
|
|
|
|
|
|
|
|
|
|
|
|
def saveTitles(config={}, titles=[]):
    """ Save title list in a file

    Writes the titles, one per line, followed by a closing `--END--` line,
    to `<config['path']>/<prefix>-<config['date']>-titles.txt`, where the
    prefix is whatever domain2prefix(config=config) returns. The text is
    encoded as UTF-8 before it is written. The list is kept on disk so a
    dump can be resumed later if needed.

    config -- dict; the keys read here are 'path' and 'date' (domain2prefix
              reads its own keys).
    titles -- sequence of page titles; it is only read, never modified.
    """

    titlesfilename = '%s-%s-titles.txt' % (domain2prefix(config=config), config['date'])
    # Build the whole payload before touching the disk, so a failure while
    # joining the titles cannot leave a truncated file behind.
    output = u"%s\n--END--" % ('\n'.join(titles))
    # 'with' closes the handle even when the write itself raises.
    with open('%s/%s' % (config['path'], titlesfilename), 'w') as titlesfile:
        titlesfile.write(output.encode('utf-8'))

    # Single-argument form prints the same text under the print statement
    # and the print function.
    print('Titles saved at... %s' % (titlesfilename))
|
|
|
|
|
|
|
|
|
|
|
|
def saveImageFilenamesURL(config={}, images=[]):
    """ Save image list in a file, including filename, url and uploader

    Writes one tab-separated `filename<TAB>url<TAB>uploader` line per image,
    followed by a closing `--END--` line, to
    `<config['path']>/<prefix>-<config['date']>-images.txt`, where the prefix
    is whatever domain2prefix(config=config) returns.

    config -- dict; the keys read here are 'path' and 'date' (domain2prefix
              reads its own keys).
    images -- iterable of (filename, url, uploader) triples; only read.
    """

    imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date'])
    # Format every row first, so a malformed triple fails before the file
    # is created or truncated.
    rows = ['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]
    # 'with' closes the handle even when a write raises.
    with open('%s/%s' % (config['path'], imagesfilename), 'w') as imagesfile:
        imagesfile.write('\n'.join(rows))
        imagesfile.write('\n--END--')

    # Single-argument form prints the same text under the print statement
    # and the print function.
    print('Image filenames and URLs saved at... %s' % (imagesfilename))
|
|
|
|
|
|
|
|
|
|
|
|
def getImageFilenamesURL(config={}):
|
|
|
|
def getImageFilenamesURL(config={}):
|
|
|
|
""" Retrieve file list: filename, url, uploader """
|
|
|
|
""" Retrieve file list: filename, url, uploader """
|
|
|
|
|
|
|
|
|
|
|
|
print 'Retrieving image filenames'
|
|
|
|
print 'Retrieving image filenames'
|
|
|
|
r_next = r'(?<!&dir=prev)&offset=(?P<offset>\d+)&' # (?<! http://docs.python.org/library/re.html
|
|
|
|
r_next = r'(?<!&dir=prev)&offset=(?P<offset>\d+)&' # (?<! http://docs.python.org/library/re.html
|
|
|
|
images = []
|
|
|
|
images = []
|
|
|
@ -617,6 +621,7 @@ def getImageFilenamesURL(config={}):
|
|
|
|
|
|
|
|
|
|
|
|
def getImageFilenamesURLAPI(config={}):
|
|
|
|
def getImageFilenamesURLAPI(config={}):
|
|
|
|
""" Retrieve file list: filename, url, uploader """
|
|
|
|
""" Retrieve file list: filename, url, uploader """
|
|
|
|
|
|
|
|
|
|
|
|
print 'Retrieving image filenames'
|
|
|
|
print 'Retrieving image filenames'
|
|
|
|
headers = {'User-Agent': getUserAgent()}
|
|
|
|
headers = {'User-Agent': getUserAgent()}
|
|
|
|
aifrom = '!'
|
|
|
|
aifrom = '!'
|
|
|
@ -675,15 +680,18 @@ def getImageFilenamesURLAPI(config={}):
|
|
|
|
|
|
|
|
|
|
|
|
def undoHTMLEntities(text=''):
    """ Undo some HTML codes

    Replaces the five entities that stand for < > & " ' with the characters
    themselves and returns the resulting string. Any other entity is left
    untouched.
    """

    # i guess only < > & " ' need conversion http://www.w3schools.com/html/html_entities.asp
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&quot;', '"')
    text = text.replace('&#039;', '\'')
    # '&amp;' has to be decoded last: doing it earlier turns an escaped
    # entity such as '&amp;quot;' into '&quot;' and a later pass would then
    # decode it a second time.
    text = text.replace('&amp;', '&')

    return text
|
|
|
|
|
|
|
|
|
|
|
|
def generateImageDump(config={}, other={}, images=[], start=''):
|
|
|
|
def generateImageDump(config={}, other={}, images=[], start=''):
|
|
|
|
""" Save files and descriptions using a file list """
|
|
|
|
""" Save files and descriptions using a file list """
|
|
|
|
|
|
|
|
|
|
|
|
#fix use subdirectories md5
|
|
|
|
#fix use subdirectories md5
|
|
|
|
print 'Retrieving images from "%s"' % (start and start or 'start')
|
|
|
|
print 'Retrieving images from "%s"' % (start and start or 'start')
|
|
|
|
imagepath = '%s/images' % (config['path'])
|
|
|
|
imagepath = '%s/images' % (config['path'])
|
|
|
@ -732,6 +740,7 @@ def generateImageDump(config={}, other={}, images=[], start=''):
|
|
|
|
c += 1
|
|
|
|
c += 1
|
|
|
|
if c % 10 == 0:
|
|
|
|
if c % 10 == 0:
|
|
|
|
print ' Downloaded %d images' % (c)
|
|
|
|
print ' Downloaded %d images' % (c)
|
|
|
|
|
|
|
|
|
|
|
|
print 'Downloaded %d images' % (c)
|
|
|
|
print 'Downloaded %d images' % (c)
|
|
|
|
|
|
|
|
|
|
|
|
def saveLogs(config={}):
|
|
|
|
def saveLogs(config={}):
|
|
|
@ -756,9 +765,9 @@ def saveLogs(config={}):
|
|
|
|
|
|
|
|
|
|
|
|
def domain2prefix(config={}):
|
|
|
|
def domain2prefix(config={}):
|
|
|
|
""" Convert domain name to a valid prefix filename. """
|
|
|
|
""" Convert domain name to a valid prefix filename. """
|
|
|
|
|
|
|
|
|
|
|
|
# At this point, both api and index are supposed to be defined
|
|
|
|
# At this point, both api and index are supposed to be defined
|
|
|
|
domain = ''
|
|
|
|
domain = ''
|
|
|
|
|
|
|
|
|
|
|
|
if config['api']:
|
|
|
|
if config['api']:
|
|
|
|
domain = config['api']
|
|
|
|
domain = config['api']
|
|
|
|
elif config['index']:
|
|
|
|
elif config['index']:
|
|
|
@ -769,10 +778,12 @@ def domain2prefix(config={}):
|
|
|
|
domain = re.sub(r'/', '_', domain)
|
|
|
|
domain = re.sub(r'/', '_', domain)
|
|
|
|
domain = re.sub(r'\.', '', domain)
|
|
|
|
domain = re.sub(r'\.', '', domain)
|
|
|
|
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
|
|
|
|
domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
|
|
|
|
|
|
|
|
|
|
|
|
return domain
|
|
|
|
return domain
|
|
|
|
|
|
|
|
|
|
|
|
def loadConfig(config={}, configfilename=''):
|
|
|
|
def loadConfig(config={}, configfilename=''):
|
|
|
|
""" Load config file """
|
|
|
|
""" Load config file """
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
f = open('%s/%s' % (config['path'], configfilename), 'r')
|
|
|
|
f = open('%s/%s' % (config['path'], configfilename), 'r')
|
|
|
|
except:
|
|
|
|
except:
|
|
|
@ -780,10 +791,12 @@ def loadConfig(config={}, configfilename=''):
|
|
|
|
sys.exit()
|
|
|
|
sys.exit()
|
|
|
|
config = cPickle.load(f)
|
|
|
|
config = cPickle.load(f)
|
|
|
|
f.close()
|
|
|
|
f.close()
|
|
|
|
|
|
|
|
|
|
|
|
return config
|
|
|
|
return config
|
|
|
|
|
|
|
|
|
|
|
|
def saveConfig(config={}, configfilename=''):
    """ Save config file

    Pickles the `config` dict into `<config['path']>/<configfilename>`,
    overwriting any existing file of that name.

    config         -- dict to store; config['path'] is the target directory.
    configfilename -- name of the file inside that directory.
    """

    # 'with' closes the handle even when cPickle.dump raises part-way.
    with open('%s/%s' % (config['path'], configfilename), 'w') as f:
        cPickle.dump(config, f)
|
|
|
@ -987,6 +1000,7 @@ def getParameters(params=[]):
|
|
|
|
|
|
|
|
|
|
|
|
def checkAPI(api, config={}):
|
|
|
|
def checkAPI(api, config={}):
|
|
|
|
""" Checking API availability """
|
|
|
|
""" Checking API availability """
|
|
|
|
|
|
|
|
|
|
|
|
req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
|
|
|
|
req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
|
|
|
|
f = urllib2.urlopen(req)
|
|
|
|
f = urllib2.urlopen(req)
|
|
|
|
result = json.loads(f.read())
|
|
|
|
result = json.loads(f.read())
|
|
|
@ -999,6 +1013,7 @@ def checkAPI(api, config={}):
|
|
|
|
|
|
|
|
|
|
|
|
def checkIndexphp(indexphp, config={}):
|
|
|
|
def checkIndexphp(indexphp, config={}):
|
|
|
|
""" Checking index.php availability """
|
|
|
|
""" Checking index.php availability """
|
|
|
|
|
|
|
|
|
|
|
|
req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
|
|
|
|
req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
|
|
|
|
f = urllib2.urlopen(req)
|
|
|
|
f = urllib2.urlopen(req)
|
|
|
|
raw = f.read()
|
|
|
|
raw = f.read()
|
|
|
@ -1014,10 +1029,12 @@ def checkIndexphp(indexphp, config={}):
|
|
|
|
|
|
|
|
|
|
|
|
def removeIP(raw=''):
    """ Remove IP addresses, e.g. those left in HTML comments <!-- -->

    The substitution is applied to the whole of `raw`, not only to text
    inside comments: every run of four dot-separated numbers becomes
    0.0.0.0 and every run of eight colon-separated groups of up to four hex
    digits becomes 0:0:0:0:0:0:0:0. Returns the cleaned string.
    """

    # IPv4-looking dotted quads (the octet range is not validated)
    raw = re.sub(r'\d+\.\d+\.\d+\.\d+', '0.0.0.0', raw)
    #http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
    #weird cases as :: are not included
    raw = re.sub(r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}', '0:0:0:0:0:0:0:0', raw)

    return raw
|
|
|
|
|
|
|
|
|
|
|
|
def checkXMLIntegrity(config={}):
|
|
|
|
def checkXMLIntegrity(config={}):
|
|
|
@ -1187,7 +1204,8 @@ def resumePreviousDump(config={}, other={}):
|
|
|
|
pass
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
|
|
|
def saveSpecialVersion(config={}):
|
|
|
|
def saveSpecialVersion(config={}):
|
|
|
|
#save Special:Version as .html, to preserve extensions details
|
|
|
|
""" Save Special:Version as .html, to preserve extensions details """
|
|
|
|
|
|
|
|
|
|
|
|
if os.path.exists('%s/Special:Version.html' % (config['path'])):
|
|
|
|
if os.path.exists('%s/Special:Version.html' % (config['path'])):
|
|
|
|
print 'Special:Version.html exists, do not overwrite'
|
|
|
|
print 'Special:Version.html exists, do not overwrite'
|
|
|
|
else:
|
|
|
|
else:
|
|
|
@ -1203,7 +1221,8 @@ def saveSpecialVersion(config={}):
|
|
|
|
f.close()
|
|
|
|
f.close()
|
|
|
|
|
|
|
|
|
|
|
|
def saveIndexPHP(config={}):
|
|
|
|
def saveIndexPHP(config={}):
|
|
|
|
#save index.php as .html, to preserve license details available at the botom of the page
|
|
|
|
""" Save index.php as .html, to preserve license details available at the botom of the page """
|
|
|
|
|
|
|
|
|
|
|
|
if os.path.exists('%s/index.html' % (config['path'])):
|
|
|
|
if os.path.exists('%s/index.html' % (config['path'])):
|
|
|
|
print 'index.html exists, do not overwrite'
|
|
|
|
print 'index.html exists, do not overwrite'
|
|
|
|
else:
|
|
|
|
else:
|
|
|
@ -1220,6 +1239,7 @@ def saveIndexPHP(config={}):
|
|
|
|
|
|
|
|
|
|
|
|
def avoidWikimediaProjects(config={}):
|
|
|
|
def avoidWikimediaProjects(config={}):
|
|
|
|
""" Skip Wikimedia projects and redirect to the dumps website """
|
|
|
|
""" Skip Wikimedia projects and redirect to the dumps website """
|
|
|
|
|
|
|
|
|
|
|
|
#notice about wikipedia dumps
|
|
|
|
#notice about wikipedia dumps
|
|
|
|
if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api']+config['index']):
|
|
|
|
if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api']+config['index']):
|
|
|
|
print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
|
|
|
|
print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
|
|
|
@ -1230,6 +1250,7 @@ def avoidWikimediaProjects(config={}):
|
|
|
|
|
|
|
|
|
|
|
|
def main(params=[]):
|
|
|
|
def main(params=[]):
|
|
|
|
""" Main function """
|
|
|
|
""" Main function """
|
|
|
|
|
|
|
|
|
|
|
|
welcome()
|
|
|
|
welcome()
|
|
|
|
configfilename = 'config.txt'
|
|
|
|
configfilename = 'config.txt'
|
|
|
|
config, other = getParameters(params=params)
|
|
|
|
config, other = getParameters(params=params)
|
|
|
|