diff --git a/dumpgenerator.py b/dumpgenerator.py
index ecc1ae5..bb420d8 100755
--- a/dumpgenerator.py
+++ b/dumpgenerator.py
@@ -223,7 +223,7 @@ def getNamespacesAPI(config={}, session=None):
     return namespaces, namespacenames
 
 
-def mwGetPageTitlesAPI(config={}, session=None):
+def getPageTitlesAPI(config={}, session=None):
     """ Uses the API to get the list of page titles """
     titles = []
     namespaces, namespacenames = getNamespacesAPI(
@@ -292,7 +292,7 @@ def mwGetPageTitlesAPI(config={}, session=None):
             delay(config=config, session=session)
         print '    %d titles retrieved in the namespace %d' % (c, namespace)
 
-def mwGetPageTitlesScraper(config={}, session=None):
+def getPageTitlesScraper(config={}, session=None):
     """ Scrape the list of page titles from Special:Allpages """
     titles = []
     namespaces, namespacenames = getNamespacesScraper(
@@ -376,16 +376,7 @@ def mwGetPageTitlesScraper(config={}, session=None):
     return titles
 
 
-def wsGetPageTitles(config={}, session=None):
-    """ Get list of page titles """
-
-    titles = []
-
-
-
-    return titles
-
-def mwGetPageTitles(config={}, session=None):
+def getPageTitles(config={}, session=None):
     """ Get list of page titles """
     # http://en.wikipedia.org/wiki/Special:AllPages
     # http://archiveteam.org/index.php?title=Special:AllPages
@@ -399,11 +390,11 @@ def mwGetPageTitles(config={}, session=None):
         test = getJSON(r)
         if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
                 and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
-            titles = mwGetPageTitlesScraper(config=config, session=session)
+            titles = getPageTitlesScraper(config=config, session=session)
         else:
-            titles = mwGetPageTitlesAPI(config=config, session=session)
+            titles = getPageTitlesAPI(config=config, session=session)
     elif 'index' in config and config['index']:
-        titles = mwGetPageTitlesScraper(config=config, session=session)
+        titles = getPageTitlesScraper(config=config, session=session)
 
     titlesfilename = '%s-%s-titles.txt' % (
         domain2prefix(config=config), config['date'])
@@ -1268,7 +1259,7 @@ def getParameters(params=[]):
         '--retries',
         metavar=5,
         default=5,
-        help="maximum number of retries")
+        help="Maximum number of retries for ")
     parser.add_argument('--path', help='path to store wiki dump at')
     parser.add_argument(
         '--resume',
@@ -1276,11 +1267,11 @@ def getParameters(params=[]):
         action='store_true',
         help='resumes previous incomplete dump (requires --path)')
     parser.add_argument('--force', action='store_true', help='')
     parser.add_argument(
-        '--user', help='username if authentication is required')
+        '--user', help='Username if authentication is required.')
     parser.add_argument(
         '--pass',
         dest='password',
-        help='password if authentication is required')
+        help='Password if authentication is required.')
     # URL params
     groupWikiOrAPIOrIndex = parser.add_argument_group()
@@ -1308,8 +1299,6 @@ def getParameters(params=[]):
         help='store only the current version of pages')
     groupDownload.add_argument(
         '--images', action='store_true', help="generates an image dump")
-    groupDownload.add_argument(
-        '--pages', action='store_true', help="generates a page dump")
     groupDownload.add_argument(
         '--namespaces',
         metavar="1,2,3",
@@ -1371,79 +1360,80 @@ def getParameters(params=[]):
         print 'ERROR: URLs must start with http:// or https://\n'
         parser.print_help()
         sys.exit(1)
-
-    if getWikiEngine(args.wiki) == 'wikispaces':
-        pass
-    else: # presume is a mediawiki
-        # Get API and index and verify
-        api = args.api and args.api or ''
-        index = args.index and args.index or ''
-        if api == '' or index == '':
-            if args.wiki:
+
+    # Get API and index and verify
+    api = args.api and args.api or ''
+    index = args.index and args.index or ''
+    if api == '' or index == '':
+        if args.wiki:
+            if getWikiEngine(args.wiki) == 'MediaWiki':
                 api2, index2 = mwGetAPIAndIndex(args.wiki)
                 if not api:
                     api = api2
                 if not index:
                     index = index2
             else:
-                if api == '':
-                    pass
-                elif index == '':
-                    index = '/'.join(api.split('/')[:-1]) + '/index.php'
+                print 'ERROR: Unsupported wiki. Wiki engines supported are: MediaWiki'
+                sys.exit(1)
+        else:
+            if api == '':
+                pass
+            elif index == '':
+                index = '/'.join(api.split('/')[:-1]) + '/index.php'
 
-        # print api
-        # print index
-        index2 = None
+    # print api
+    # print index
+    index2 = None
 
-        if api:
-            retry = 0
-            maxretries = args.retries
-            retrydelay = 20
-            while retry < maxretries:
-                try:
-                    check = checkAPI(api=api, session=session)
-                    break
-                except requests.exceptions.ConnectionError as e:
-                    print 'Connection error: %s'%(str(e))
-                    retry += 1
-                    print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
-                    time.sleep(retrydelay)
-        if api and check:
-            index2 = check[1]
-            api = check[2]
-            print 'API is OK: ' + api
+    if api:
+        retry = 0
+        maxretries = args.retries
+        retrydelay = 20
+        while retry < maxretries:
+            try:
+                check = checkAPI(api=api, session=session)
+                break
+            except requests.exceptions.ConnectionError as e:
+                print 'Connection error: %s'%(str(e))
+                retry += 1
+                print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
+                time.sleep(retrydelay)
+    if api and check:
+        index2 = check[1]
+        api = check[2]
+        print 'API is OK: ' + api
+    else:
+        if index and not args.wiki:
+            print 'API not available. Trying with index.php only.'
         else:
-            if index and not args.wiki:
-                print 'API not available. Trying with index.php only.'
             else:
-                print 'Error in API. Please, provide a correct path to API'
-                sys.exit(1)
+            print 'Error in API. Please, provide a correct path to API'
+            sys.exit(1)
+    if index and checkIndex(
+            index=index,
+            cookies=args.cookies,
+            session=session):
+        print 'index.php is OK'
+    else:
+        index = index2
+        if index and index.startswith('//'):
+            index = args.wiki.split('//')[0] + index
         if index and checkIndex(
                 index=index,
                 cookies=args.cookies,
                 session=session):
             print 'index.php is OK'
         else:
-            index = index2
-            if index and index.startswith('//'):
-                index = args.wiki.split('//')[0] + index
+            index = '/'.join(index.split('/')[:-1])
             if index and checkIndex(
                     index=index,
                     cookies=args.cookies,
                     session=session):
                 print 'index.php is OK'
             else:
-                index = '/'.join(index.split('/')[:-1])
-                if index and checkIndex(
-                        index=index,
-                        cookies=args.cookies,
-                        session=session):
-                    print 'index.php is OK'
-                else:
-                    print 'Error in index.php, please, provide a correct path to index.php'
-                    sys.exit(1)
-
+                print 'Error in index.php, please, provide a correct path to index.php'
+                sys.exit(1)
+
     # check user and pass (one requires both)
     if (args.user and not args.password) or (args.password and not args.user):
         print 'ERROR: Both --user and --pass are required for authentication.'
@@ -1487,15 +1477,13 @@ def getParameters(params=[]):
         sys.exit(1)
 
     config = {
-        'wikiengine': getWikiEngine(args.wiki),
         'curonly': args.curonly,
         'date': datetime.datetime.now().strftime('%Y%m%d'),
         'api': api,
         'index': index,
         'images': args.images,
-        'pages': args.pages,
         'logs': False,
-        'xml': args.xml, #this should be 'pages'? (and modify in all the script). Xml is mediawiki-centric, other wikis dont export in XML
+        'xml': args.xml,
         'namespaces': namespaces,
         'exnamespaces': exnamespaces,
         'path': args.path and os.path.normpath(args.path) or '',
@@ -1651,33 +1639,10 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
 
 
 def createNewDump(config={}, other={}):
-    if config['wikiengine'] == 'mediawiki':
-        mwCreateNewDump(config=config, other=other)
-    elif config['wikiengine'] == 'wikispaces':
-        wsCreateNewDump(config=config, other=other)
-
-def wsCreateNewDump(config={}, other={}):
-    print 'Trying generating a new dump into a new directory...'
-    if config['pages']:
-        pages = wsGetPageTitles(config=config, session=other['session'])
-        wsSavePageTitles(config=config, pages=pages)
-        generatePageDump(config=config, pages=pages, session=other['session'])
-    if config['images']:
-        images = wsGetImageNames(config=config, session=other['session'])
-        wsSaveImageNames(config=config, images=images)
-        generateImageDump(
-            config=config,
-            other=other,
-            images=images,
-            session=other['session'])
-    if config['logs']:
-        wsSaveLogs(config=config, session=other['session'])
-
-def mwCreateNewDump(config={}, other={}):
     images = []
     print 'Trying generating a new dump into a new directory...'
     if config['xml']:
-        mwGetPageTitles(config=config, session=other['session'])
+        getPageTitles(config=config, session=other['session'])
         titles=readTitles(config)
         generateXMLDump(config=config, titles=titles, session=other['session'])
         checkXMLIntegrity(
@@ -1695,16 +1660,8 @@ def mwCreateNewDump(config={}, other={}):
     if config['logs']:
         saveLogs(config=config, session=other['session'])
 
 
-def resumePreviousDump(config={}, other={}):
-    if config['wikiengine'] == 'mediawiki':
-        mwResumePreviousDump(config=config, other=other)
-    elif config['wikiengine'] == 'wikispaces':
-        wsResumePreviousDump(config=config, other=other)
-
-def wsResumePreviousDump(config={}, other={}):
-    pass
-def mwResumePreviousDump(config={}, other={}):
+def resumePreviousDump(config={}, other={}):
     images = []
     print 'Resuming previous dump process...'
     if config['xml']:
@@ -1727,7 +1684,7 @@ def mwResumePreviousDump(config={}, other={}):
             print 'Title list is incomplete. Reloading...'
            # do not resume, reload, to avoid inconsistences, deleted pages or
            # so
-            mwGetPageTitles(config=config, session=other['session'])
+            getPageTitles(config=config, session=other['session'])
 
     # checking xml dump
     xmliscomplete = False
@@ -1932,88 +1889,85 @@ def avoidWikimediaProjects(config={}, other={}):
 
 def getWikiEngine(url=''):
     """ Returns the wiki engine of a URL, if known """
-
-    wikiengine = 'unknown'
-    if url:
-        session = requests.Session()
-        session.headers.update({'User-Agent': getUserAgent()})
-        r = session.post(url=url)
-        if r.status_code == 405 or r.text == '':
-            r = session.get(url=url)
-        result = r.text
-    else:
-        return wikiengine.lower()
-
+
+    session = requests.Session()
+    session.headers.update({'User-Agent': getUserAgent()})
+    r = session.post(url=url)
+    if r.status_code == 405 or r.text == '':
+        r = session.get(url=url)
+    result = r.text
+
+    wikiengine = 'Unknown'
     if re.search(
             ur'(?im)()', result):
-        wikiengine = 'moinmoin'
+        wikiengine = 'MoinMoin'
     elif re.search(ur'(?im)(twikiCurrentTopicLink|twikiCurrentWebHomeLink|twikiLink)', result):
-        wikiengine = 'twiki'
+        wikiengine = 'TWiki'
     elif re.search(ur'(?im)()', result):
-        wikiengine = 'pmwiki'
+        wikiengine = 'PmWiki'
     elif re.search(ur'(?im)(|)', result):
-        wikiengine = 'wagn'
+        wikiengine = 'Wagn'
     elif re.search(ur'(?im)(\s*()?JSPWiki|xmlns:jspwiki="http://www\.jspwiki\.org")', result):
-        wikiengine = 'jspwiki'
+        wikiengine = 'JSPWiki'
     elif re.search(ur'(?im)(Powered by:?\s*()?\s*|\bKwikiNavigation\b)', result):
-        wikiengine = 'kwiki'
+        wikiengine = 'Kwiki'
    elif re.search(ur'(?im)(Powered by )', result):
-        wikiengine = 'zwiki'
+        wikiengine = 'Zwiki'
     # WakkaWiki forks
     elif re.search(ur'(?im)()', result):
-        wikiengine = 'wikkawiki'  # formerly WikkaWakkaWiki
+        wikiengine = 'WikkaWiki'  # formerly WikkaWakkaWiki
     elif re.search(ur'(?im)(CitiWiki)', result):
-        wikiengine = 'citiwiki'
+        wikiengine = 'CitiWiki'
     elif re.search(ur'(?im)(Powered by |wikidot-privacy-button-hovertip|javascript:WIKIDOT\.page)', result):
-        wikiengine = 'wikidot'
+        wikiengine = 'Wikidot'
     elif re.search(ur'(?im)(IS_WETPAINT_USER|wetpaintLoad|WPC-bodyContentContainer)', result):
-        wikiengine = 'wetpaint'
+        wikiengine = 'Wetpaint'
     elif re.search(ur'(?im)(