Initial port to argparse

pull/138/head
balr0g 10 years ago
parent 09be8b5f22
commit 50b011f90d

@ -22,7 +22,7 @@
import cookielib
import cPickle
import datetime
import getopt
import argparse
import json
import gzip
try:
@ -40,8 +40,8 @@ import urllib2
__VERSION__ = '0.2.1'
def printVersion():
    """ Print the value of the module-level __VERSION__ to standard output """
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement as the bare form.
    print(__VERSION__)
def getVersion():
    """ Return the value of the module-level __VERSION__ """
    return __VERSION__
# This class is from https://github.com/crustymonkey/py-sonic/blob/master/libsonic/connection.py#L50
class POSTHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
@ -191,7 +191,7 @@ def getPageTitlesAPI(config={}):
f = urllib2.urlopen(req)
except:
try:
print 'Server is slow... Waiting some seconds and retrying...'
print '(1) Server is slow... Waiting some seconds and retrying...'
time.sleep(10)
f = urllib2.urlopen(req)
except:
@ -387,7 +387,7 @@ def getXMLPageCore(headers={}, params={}, config={}):
f = urllib2.urlopen(req)
except:
try:
print 'Server is slow... Waiting some seconds and retrying...'
print '(2) Server is slow... Waiting some seconds and retrying...'
time.sleep(15)
f = urllib2.urlopen(req)
except:
@ -661,7 +661,7 @@ def getImageFilenamesURLAPI(config={}):
f = urllib2.urlopen(req)
except:
try:
print 'Server is slow... Waiting some seconds and retrying...'
print '(3) Server is slow... Waiting some seconds and retrying...'
time.sleep(10)
f = urllib2.urlopen(req)
except:
@ -834,14 +834,20 @@ def saveConfig(config={}, configfilename=''):
f.close()
def welcome():
message = ''
""" Opening message """
print "#"*73
print """# Welcome to DumpGenerator 0.2 by WikiTeam (GPL v3) #
message += "#"*73
message += '\n'
message += """# Welcome to DumpGenerator 0.2 by WikiTeam (GPL v3) #
# More info at: https://github.com/WikiTeam/wikiteam #"""
print "#"*73
print ''
print "#"*73
print """# Copyright (C) 2011-2014 WikiTeam #
message += "\n"
message += "#"*73
message += "\n"
message += ''
message += "\n"
message += "#"*73
message += "\n"
message += """# Copyright (C) 2011-2014 WikiTeam #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
@ -854,8 +860,12 @@ def welcome():
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <http://www.gnu.org/licenses/>. #"""
print "#"*73
print ''
message += "\n"
message += "#"*73
message += "\n"
message += ''
return message
def bye():
""" Closing message """
@ -864,142 +874,107 @@ def bye():
print "If this is a public wiki, please, consider publishing this dump. Do it yourself as explained in https://github.com/WikiTeam/wikiteam/wiki/New-Tutorial#Publishing_the_dump or contact us at https://github.com/WikiTeam/wikiteam"
print "Good luck! Bye!"
def usage():
""" """
print """Error. You forget mandatory parameters:
--api or --index: URL to api.php or to index.php, one of them. Examples: --api=http://archiveteam.org/api.php or --index=http://archiveteam.org/index.php
And one of these at least:
--xml: It generates a XML dump. It retrieves full history of all pages (if you want only the current version use --xml --curonly)
If you want filter by namespace, use the parameter --namespaces=0,1,2,3...
--images: It generates an image dump
You can resume previous incomplete dumps:
--resume: It resumes previous incomplete dump. When using --resume, --path is mandatory (path to directory where incomplete dump is).
def getParameters(params=[]):
if not params:
params = sys.argv
You can exclude namespaces:
--exnamespaces: Write the number of the namespaces you want to exclude, split by commas.
parser = argparse.ArgumentParser(description=welcome())
parser.add_argument('-v', '--version', action='version', version=(params[0] + " version " + getVersion()))
parser.add_argument('--cookies', metavar="cookies.txt", help="Path to a cookies.txt file.")
parser.add_argument('--delay', metavar=5, default=0, help="adds a delay (in seconds)")
groupAPIOrIndex = parser.add_mutually_exclusive_group(required=True)
groupAPIOrIndex.add_argument('--api', help="URL to api.php.")
groupAPIOrIndex.add_argument('--index', help="URL to index.php.")
groupXMLOrImages = parser.add_argument_group()
groupXMLOrImages.add_argument('--xml', action='store_true', help="Generates an XML dump. Retrieves full history of all pages (if you want only the current version use --xml --curonly)")
parser.add_argument('--curonly', action='store_true', help='Store only the current version of pages.')
You can use authenticaton cookies from a Mozilla cookies.txt file:
--cookies: Path to a cookies.txt file. Example: --cookies=$HOME/.netscape/cookies.txt
groupXMLOrImages.add_argument('--images', action='store_true', help="Generates an image dump")
parser.add_argument('--path', help='Path to store wiki dump at.')
parser.add_argument('--resume', action='store_true', help='Resumes previous incomplete dump. Requires --path.')
parser.add_argument('--force', action='store_true')
parser.add_argument('--namespaces', metavar="1,2,3", help='Comma-separated value of namespaces to include (all by default)')
parser.add_argument('--exnamespaces', metavar="1,2,3", help='Comma-separated value of namespaces to exclude')
args = parser.parse_args()
# check API URL
if args.api and (not args.api.startswith('http://') and not args.api.startswith('https://')):
print 'api.php must start with http:// or https://\n'
parser.print_usage()
sys.exit(1)
# check index URL
if args.index and (not args.index.startswith('http://') and not args.index.startswith('https://')):
print 'index.php must start with http:// or https://\n'
parser.print_usage()
sys.exit(1)
namespaces = ['all']
exnamespaces = []
# Process namespace inclusions
if args.namespaces:
if re.search(r'[^\d, \-]', args.namespaces) and args.namespaces.lower() != 'all': #fix, why - ? and... --namespaces= all with a space works?
print "Invalid namespace values.\nValid format is integer(s) separated by commas"
sys.exit()
else:
ns = re.sub(' ', '', args.namespaces)
if ns.lower() == 'all':
namespaces = ['all']
else:
namespaces = [int(i) for i in ns.split(',')]
You can be nice with servers using a delay:
--delay: It adds a delay (in seconds, adding 5 seconds between requests: --delay=5)
# Process namespace exclusions
if args.exnamespaces:
if re.search(r'[^\d, \-]', args.exnamespaces):
print "Invalid namespace values.\nValid format is integer(s) separated by commas"
sys.exit(1)
else:
ns = re.sub(' ', '', args.exnamespaces)
if ns.lower() == 'all':
print 'You cannot exclude all namespaces.'
sys.exit(1)
else:
exnamespaces = [int(i) for i in ns.split(',')]
Write --help for help."""
# --curonly requires --xml
if args.curonly and not args.xml:
print "--curonly requires --xml\n"
parser.print_usage()
sys.exit(1)
#user chose --api, but --index it is necessary for special:export: we generate it
if args.api and not args.index:
index = args.api.split('api.php')[0] + 'index.php'
# WARNING: remove index.php here for misconfigured sites like editthis.info, or provide --index directly
print 'You didn\'t provide a path for index.php, using ', index
else:
index = args.index
def getParameters(params=[]):
if not params:
params = sys.argv[1:]
config = {
'curonly': False,
'curonly': args.curonly,
'date': datetime.datetime.now().strftime('%Y%m%d'),
'api': '',
'index': '',
'images': False,
'api': args.api or '',
'index': index,
'images': args.images,
'logs': False,
'xml': False,
'namespaces': ['all'],
'exnamespaces': [],
'path': '',
'cookies': '',
'delay': 0,
'xml': args.xml,
'namespaces': namespaces,
'exnamespaces': exnamespaces,
'path': args.path or '',
'cookies': args.cookies or '',
'delay': args.delay
}
other = {
'resume': False,
'resume': args.resume,
'filenamelimit': 100, #do not change
'force': False,
'force': args.force,
}
#console params
try:
opts, args = getopt.getopt(params, "", ["h", "help", "path=", "api=", "index=", "images", "logs", "xml", "curonly", "resume", "cookies=", "delay=", "namespaces=", "exnamespaces=", "force", "v", "version"])
except getopt.GetoptError, err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
for o, a in opts:
if o in ("-h","--help"):
usage()
sys.exit()
elif o in ("-v","--version"):
printVersion()
elif o in ("--path"):
config["path"] = a
while len(config["path"])>0:
if config["path"][-1] == '/': #dará problemas con rutas windows?
config["path"] = config["path"][:-1]
else:
break
elif o in ("--api"):
if not a.startswith('http://') and not a.startswith('https://'):
print 'api.php must start with http:// or https://'
sys.exit()
config['api'] = a
elif o in ("--index"):
if not a.startswith('http://') and not a.startswith('https://'):
print 'index.php must start with http:// or https://'
sys.exit()
config["index"] = a
elif o in ("--images"):
config["images"] = True
elif o in ("--logs"):
config["logs"] = True
elif o in ("--xml"):
config["xml"] = True
elif o in ("--curonly"):
if not config["xml"]:
print "If you select --curonly, you must use --xml too"
sys.exit()
config["curonly"] = True
elif o in ("--resume"):
other["resume"] = True
elif o in ("--cookies"):
config["cookies"] = a
elif o in ("--delay"):
config["delay"] = int(a)
elif o in ("--namespaces"):
if re.search(r'[^\d, \-]', a) and a.lower() != 'all': #fix, why - ? and... --namespaces= all with a space works?
print "Invalid namespaces values.\nValid format is integer(s) splitted by commas"
sys.exit()
a = re.sub(' ', '', a)
if a.lower() == 'all':
config["namespaces"] = ['all']
else:
config["namespaces"] = [int(i) for i in a.split(',')]
elif o in ("--exnamespaces"):
if re.search(r'[^\d, \-]', a):
print "Invalid exnamespaces values.\nValid format is integer(s) splitted by commas"
sys.exit()
a = re.sub(' ', '', a)
if a.lower() == 'all':
print 'You have excluded all namespaces. Error.'
sys.exit()
else:
config["exnamespaces"] = [int(i) for i in a.split(',')]
elif o in ("--force"):
other["force"] = True
else:
assert False, "unhandled option"
#missing mandatory params
#(config['index'] and not re.search('/index\.php', config['index'])) or \ # in EditThis there is no index.php, it is empty editthis.info/mywiki/?title=...
if (not config['api'] and not config['index']) or \
(config['api'] and not re.search('/api\.php', config['api'])) or \
not (config["xml"] or config["images"] or config["logs"]) or \
(other['resume'] and not config['path']):
usage()
sys.exit()
#override redirect handler to properly handle POSTs with redirect
opener = urllib2.build_opener(POSTHTTPRedirectHandler)
urllib2.install_opener(opener)
#user chose --api, but --index it is necessary for special:export: we generate it
if config['api'] and not config['index']:
config['index'] = config['api'].split('api.php')[0] + 'index.php'
# WARNING: remove index.php here for misconfigured sites like editthis.info, or provide --index directly
print 'You didn\'t provide a path for index.php, we try this one:', config['index']
if config['cookies']:
cj = cookielib.MozillaCookieJar()
@ -1007,7 +982,7 @@ def getParameters(params=[]):
opener = urllib2.build_opener(POSTHTTPRedirectHandler, urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)
print 'Using cookies from %s' % config['cookies']
if config['api']:
#check api.php
if checkAPI(config['api'], config):
@ -1024,12 +999,13 @@ def getParameters(params=[]):
print 'Error in index.php, please, provide a correct path to index.php'
sys.exit()
#calculating path, if not defined by user with --path=
if not config['path']:
config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config), config['date'])
return config, other
return config, other
def checkAPI(api, config={}):
""" Checking API availability """
req = urllib2.Request(url=api, data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'format': 'json'}), headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
@ -1303,7 +1279,7 @@ def saveSiteInfo(config={}):
f.write(json.dumps(result, indent=4, sort_keys=True))
f.close()
def avoidWikimediaProjects(config={}):
def avoidWikimediaProjects(config={}, other={}):
""" Skip Wikimedia projects and redirect to the dumps website """
#notice about wikipedia dumps
@ -1317,10 +1293,10 @@ def avoidWikimediaProjects(config={}):
def main(params=[]):
""" Main function """
welcome()
print welcome()
configfilename = 'config.txt'
config, other = getParameters(params=params)
avoidWikimediaProjects(config=config)
avoidWikimediaProjects(config=config, other=other)
print 'Analysing %s' % (config['api'] and config['api'] or config['index'])
#creating path or resuming if desired

Loading…
Cancel
Save