comments and newlines

pull/119/head
Emilio J. Rodríguez-Posada 10 years ago
parent 5eff4bd072
commit d395433513

@ -453,7 +453,8 @@ def cleanXML(xml=''):
return xml return xml
def generateXMLDump(config={}, titles=[], start=''): def generateXMLDump(config={}, titles=[], start=''):
""" """ """ Generates a XML dump for a list of titles """
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start') print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header = getXMLHeader(config=config) header = getXMLHeader(config=config)
footer = '</mediawiki>\n' #new line at the end footer = '</mediawiki>\n' #new line at the end
@ -514,26 +515,29 @@ def generateXMLDump(config={}, titles=[], start=''):
def saveTitles(config={}, titles=[]):
    """ Save title list in a file

    Writes one title per line, followed by a closing --END-- line, to
    <path>/<prefix>-<date>-titles.txt, encoded as UTF-8. The list is kept on
    disk so that it is available for a resume if needed.

    config: dict; config['path'] and config['date'] are read here and the
            whole dict is handed to domain2prefix() to build the file prefix
    titles: iterable of title strings; it is only read, never modified
    """

    titlesfilename = '%s-%s-titles.txt' % (domain2prefix(config=config), config['date'])
    # Build the payload before touching the file, so a failure while joining
    # the titles does not leave a truncated titles file behind.
    output = u"%s\n--END--" % ('\n'.join(titles))
    # 'with' closes the handle even if the write raises.
    with open('%s/%s' % (config['path'], titlesfilename), 'w') as titlesfile:
        titlesfile.write(output.encode('utf-8'))
    # Single-argument print(...) emits the same line as the former
    # "print 'Titles saved at...', titlesfilename" statement.
    print('Titles saved at... %s' % titlesfilename)
def saveImageFilenamesURL(config={}, images=[]):
    """ Save image list in a file, including filename, url and uploader

    Writes one "filename<TAB>url<TAB>uploader" line per image, followed by a
    closing --END-- line, to <path>/<prefix>-<date>-images.txt.

    config: dict; config['path'] and config['date'] are read here and the
            whole dict is handed to domain2prefix() to build the file prefix
    images: iterable of (filename, url, uploader) triples; it is only read
    """

    imagesfilename = '%s-%s-images.txt' % (domain2prefix(config=config), config['date'])
    # Format every entry before opening the file, so a malformed entry does
    # not leave a truncated image list behind.
    lines = ['%s\t%s\t%s' % (filename, url, uploader) for filename, url, uploader in images]
    # NOTE(review): unlike saveTitles, nothing is encoded to UTF-8 here --
    # confirm that the entries are already byte strings.
    # 'with' closes the handle even if one of the writes raises.
    with open('%s/%s' % (config['path'], imagesfilename), 'w') as imagesfile:
        imagesfile.write('\n'.join(lines))
        imagesfile.write('\n--END--')
    # Single-argument print(...) emits the same line as the former
    # "print 'Image filenames and URLs saved at...', imagesfilename" statement.
    print('Image filenames and URLs saved at... %s' % imagesfilename)
def getImageFilenamesURL(config={}): def getImageFilenamesURL(config={}):
""" Retrieve file list: filename, url, uploader """ """ Retrieve file list: filename, url, uploader """
print 'Retrieving image filenames' print 'Retrieving image filenames'
r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;' # (?<! http://docs.python.org/library/re.html r_next = r'(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;' # (?<! http://docs.python.org/library/re.html
images = [] images = []
@ -617,6 +621,7 @@ def getImageFilenamesURL(config={}):
def getImageFilenamesURLAPI(config={}): def getImageFilenamesURLAPI(config={}):
""" Retrieve file list: filename, url, uploader """ """ Retrieve file list: filename, url, uploader """
print 'Retrieving image filenames' print 'Retrieving image filenames'
headers = {'User-Agent': getUserAgent()} headers = {'User-Agent': getUserAgent()}
aifrom = '!' aifrom = '!'
@ -675,15 +680,18 @@ def getImageFilenamesURLAPI(config={}):
def undoHTMLEntities(text=''):
    """ Undo some HTML codes

    Replaces the entities &lt; &gt; &quot; &#039; and &amp; with the
    characters they stand for and returns the resulting string. Each entity
    is decoded exactly once.

    text: string that may contain HTML entities
    """

    # i guess only < > & " ' need conversion http://www.w3schools.com/html/html_entities.asp
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&quot;', '"')
    text = text.replace('&#039;', '\'')
    # &amp; must go last: decoding it earlier turns an escaped entity such as
    # '&amp;quot;' into '&quot;', which the later steps would then wrongly
    # decode a second time.
    text = text.replace('&amp;', '&')
    return text
def generateImageDump(config={}, other={}, images=[], start=''): def generateImageDump(config={}, other={}, images=[], start=''):
""" Save files and descriptions using a file list """ """ Save files and descriptions using a file list """
#fix use subdirectories md5 #fix use subdirectories md5
print 'Retrieving images from "%s"' % (start and start or 'start') print 'Retrieving images from "%s"' % (start and start or 'start')
imagepath = '%s/images' % (config['path']) imagepath = '%s/images' % (config['path'])
@ -732,6 +740,7 @@ def generateImageDump(config={}, other={}, images=[], start=''):
c += 1 c += 1
if c % 10 == 0: if c % 10 == 0:
print ' Downloaded %d images' % (c) print ' Downloaded %d images' % (c)
print 'Downloaded %d images' % (c) print 'Downloaded %d images' % (c)
def saveLogs(config={}): def saveLogs(config={}):
@ -756,9 +765,9 @@ def saveLogs(config={}):
def domain2prefix(config={}): def domain2prefix(config={}):
""" Convert domain name to a valid prefix filename. """ """ Convert domain name to a valid prefix filename. """
# At this point, both api and index are supposed to be defined # At this point, both api and index are supposed to be defined
domain = '' domain = ''
if config['api']: if config['api']:
domain = config['api'] domain = config['api']
elif config['index']: elif config['index']:
@ -769,10 +778,12 @@ def domain2prefix(config={}):
domain = re.sub(r'/', '_', domain) domain = re.sub(r'/', '_', domain)
domain = re.sub(r'\.', '', domain) domain = re.sub(r'\.', '', domain)
domain = re.sub(r'[^A-Za-z0-9]', '_', domain) domain = re.sub(r'[^A-Za-z0-9]', '_', domain)
return domain return domain
def loadConfig(config={}, configfilename=''): def loadConfig(config={}, configfilename=''):
""" Load config file """ """ Load config file """
try: try:
f = open('%s/%s' % (config['path'], configfilename), 'r') f = open('%s/%s' % (config['path'], configfilename), 'r')
except: except:
@ -780,10 +791,12 @@ def loadConfig(config={}, configfilename=''):
sys.exit() sys.exit()
config = cPickle.load(f) config = cPickle.load(f)
f.close() f.close()
return config return config
def saveConfig(config={}, configfilename=''): def saveConfig(config={}, configfilename=''):
""" Save config file """ """ Save config file """
f = open('%s/%s' % (config['path'], configfilename), 'w') f = open('%s/%s' % (config['path'], configfilename), 'w')
cPickle.dump(config, f) cPickle.dump(config, f)
f.close() f.close()
@ -987,6 +1000,7 @@ def getParameters(params=[]):
def checkAPI(api, config={}): def checkAPI(api, config={}):
""" Checking API availability """ """ Checking API availability """
req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent()}) req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent()})
f = urllib2.urlopen(req) f = urllib2.urlopen(req)
result = json.loads(f.read()) result = json.loads(f.read())
@ -999,6 +1013,7 @@ def checkAPI(api, config={}):
def checkIndexphp(indexphp, config={}): def checkIndexphp(indexphp, config={}):
""" Checking index.php availability """ """ Checking index.php availability """
req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()}) req = urllib2.Request(url=indexphp, data=urllib.urlencode({'title': 'Special:Version', }), headers={'User-Agent': getUserAgent()})
f = urllib2.urlopen(req) f = urllib2.urlopen(req)
raw = f.read() raw = f.read()
@ -1014,10 +1029,12 @@ def checkIndexphp(indexphp, config={}):
def removeIP(raw=''):
    """ Replace IP addresses in the text with zeroed placeholders

    Used to remove the IPs that show up in HTML comments <!-- -->; note that
    the substitutions run over the whole string, not only inside comments.

    raw: text to clean
    Returns the text with dotted-quad matches replaced by 0.0.0.0 and
    eight-group colon-separated hex matches replaced by 0:0:0:0:0:0:0:0
    """

    ipv4 = re.compile(r'\d+\.\d+\.\d+\.\d+')
    #http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
    #weird cases as :: are not included
    ipv6 = re.compile(r'(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}')

    # IPv4 first, then IPv6, same order as before
    without_ipv4 = ipv4.sub('0.0.0.0', raw)
    return ipv6.sub('0:0:0:0:0:0:0:0', without_ipv4)
def checkXMLIntegrity(config={}): def checkXMLIntegrity(config={}):
@ -1187,7 +1204,8 @@ def resumePreviousDump(config={}, other={}):
pass pass
def saveSpecialVersion(config={}): def saveSpecialVersion(config={}):
#save Special:Version as .html, to preserve extensions details """ Save Special:Version as .html, to preserve extensions details """
if os.path.exists('%s/Special:Version.html' % (config['path'])): if os.path.exists('%s/Special:Version.html' % (config['path'])):
print 'Special:Version.html exists, do not overwrite' print 'Special:Version.html exists, do not overwrite'
else: else:
@ -1203,7 +1221,8 @@ def saveSpecialVersion(config={}):
f.close() f.close()
def saveIndexPHP(config={}): def saveIndexPHP(config={}):
#save index.php as .html, to preserve license details available at the botom of the page """ Save index.php as .html, to preserve license details available at the botom of the page """
if os.path.exists('%s/index.html' % (config['path'])): if os.path.exists('%s/index.html' % (config['path'])):
print 'index.html exists, do not overwrite' print 'index.html exists, do not overwrite'
else: else:
@ -1220,6 +1239,7 @@ def saveIndexPHP(config={}):
def avoidWikimediaProjects(config={}): def avoidWikimediaProjects(config={}):
""" Skip Wikimedia projects and redirect to the dumps website """ """ Skip Wikimedia projects and redirect to the dumps website """
#notice about wikipedia dumps #notice about wikipedia dumps
if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api']+config['index']): if re.findall(r'(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org', config['api']+config['index']):
print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!' print 'PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!'
@ -1230,6 +1250,7 @@ def avoidWikimediaProjects(config={}):
def main(params=[]): def main(params=[]):
""" Main function """ """ Main function """
welcome() welcome()
configfilename = 'config.txt' configfilename = 'config.txt'
config, other = getParameters(params=params) config, other = getParameters(params=params)

Loading…
Cancel
Save