restoring dumpgenerator.py code to f43b7389a0 last stable version. I will rewrite code in wikiteam/ subdirectory

pull/287/head
emijrp 8 years ago
parent 2783e1cecb
commit 3f697dbb5b

@@ -223,7 +223,7 @@ def getNamespacesAPI(config={}, session=None):
     return namespaces, namespacenames
 
 
-def mwGetPageTitlesAPI(config={}, session=None):
+def getPageTitlesAPI(config={}, session=None):
     """ Uses the API to get the list of page titles """
     titles = []
    namespaces, namespacenames = getNamespacesAPI(
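
For orientation: getPageTitlesAPI pages through MediaWiki's list=allpages module. A minimal, self-contained sketch of that loop, assuming a standard api.php endpoint (the URL is illustrative) and modern 'continue'-style pagination, which very old MediaWiki releases replace with query-continue:

```python
import requests

def fetch_all_titles(api='https://archiveteam.org/api.php', namespace=0):
    session = requests.Session()
    titles = []
    params = {'action': 'query', 'list': 'allpages',
              'apnamespace': namespace, 'aplimit': 500, 'format': 'json'}
    while True:
        data = session.get(api, params=params).json()
        titles += [page['title'] for page in data['query']['allpages']]
        if 'continue' not in data:        # last batch reached
            break
        params.update(data['continue'])   # carries apcontinue into the next request
    return titles
```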
@@ -292,7 +292,7 @@ def mwGetPageTitlesAPI(config={}, session=None):
         delay(config=config, session=session)
     print ' %d titles retrieved in the namespace %d' % (c, namespace)
 
-def mwGetPageTitlesScraper(config={}, session=None):
+def getPageTitlesScraper(config={}, session=None):
     """ Scrape the list of page titles from Special:Allpages """
     titles = []
     namespaces, namespacenames = getNamespacesScraper(
@@ -376,16 +376,7 @@ def mwGetPageTitlesScraper(config={}, session=None):
     return titles
 
 
-def wsGetPageTitles(config={}, session=None):
-    """ Get list of page titles """
-    titles = []
-    return titles
-
-
-def mwGetPageTitles(config={}, session=None):
+def getPageTitles(config={}, session=None):
     """ Get list of page titles """
     # http://en.wikipedia.org/wiki/Special:AllPages
     # http://archiveteam.org/index.php?title=Special:AllPages
@@ -399,11 +390,11 @@ def mwGetPageTitles(config={}, session=None):
         test = getJSON(r)
         if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
                 and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
-            titles = mwGetPageTitlesScraper(config=config, session=session)
+            titles = getPageTitlesScraper(config=config, session=session)
         else:
-            titles = mwGetPageTitlesAPI(config=config, session=session)
+            titles = getPageTitlesAPI(config=config, session=session)
     elif 'index' in config and config['index']:
-        titles = mwGetPageTitlesScraper(config=config, session=session)
+        titles = getPageTitlesScraper(config=config, session=session)
 
     titlesfilename = '%s-%s-titles.txt' % (
         domain2prefix(config=config), config['date'])
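
The branch restored here decides between the API and the Special:Allpages scraper by probing for the disabled-module warning. A sketch of just that decision, assuming test is the parsed JSON body that getJSON returns:

```python
# A wiki with the module switched off answers with a warning object like:
#   {"warnings": {"allpages": {"*": 'The "allpages" module has been disabled.'}}}
def choose_title_source(test):
    warning = test.get('warnings', {}).get('allpages', {}).get('*', '')
    if warning == 'The "allpages" module has been disabled.':
        return 'scraper'  # fall back to scraping Special:Allpages HTML
    return 'api'
```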
@@ -1268,7 +1259,7 @@ def getParameters(params=[]):
         '--retries',
         metavar=5,
         default=5,
-        help="maximum number of retries")
+        help="Maximum number of retries for ")
     parser.add_argument('--path', help='path to store wiki dump at')
     parser.add_argument(
         '--resume',
@ -1276,11 +1267,11 @@ def getParameters(params=[]):
help='resumes previous incomplete dump (requires --path)') help='resumes previous incomplete dump (requires --path)')
parser.add_argument('--force', action='store_true', help='') parser.add_argument('--force', action='store_true', help='')
parser.add_argument( parser.add_argument(
'--user', help='username if authentication is required') '--user', help='Username if authentication is required.')
parser.add_argument( parser.add_argument(
'--pass', '--pass',
dest='password', dest='password',
help='password if authentication is required') help='Password if authentication is required.')
# URL params # URL params
groupWikiOrAPIOrIndex = parser.add_argument_group() groupWikiOrAPIOrIndex = parser.add_argument_group()
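
A stand-alone sketch of the option wiring touched by these two hunks; the dest='password' rename is needed because pass is a reserved word in Python, so args.pass would be a syntax error:

```python
import argparse

parser = argparse.ArgumentParser(prog='dumpgenerator')
parser.add_argument('--retries', metavar=5, default=5,
                    help='Maximum number of retries')
parser.add_argument('--resume', action='store_true',
                    help='resumes previous incomplete dump (requires --path)')
parser.add_argument('--user', help='Username if authentication is required.')
parser.add_argument('--pass', dest='password',
                    help='Password if authentication is required.')

args = parser.parse_args(['--user', 'alice', '--pass', 's3cret'])
print(args.password)  # 's3cret'
```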
@ -1308,8 +1299,6 @@ def getParameters(params=[]):
help='store only the current version of pages') help='store only the current version of pages')
groupDownload.add_argument( groupDownload.add_argument(
'--images', action='store_true', help="generates an image dump") '--images', action='store_true', help="generates an image dump")
groupDownload.add_argument(
'--pages', action='store_true', help="generates a page dump")
groupDownload.add_argument( groupDownload.add_argument(
'--namespaces', '--namespaces',
metavar="1,2,3", metavar="1,2,3",
@@ -1371,79 +1360,80 @@ def getParameters(params=[]):
         print 'ERROR: URLs must start with http:// or https://\n'
         parser.print_help()
         sys.exit(1)
-    if getWikiEngine(args.wiki) == 'wikispaces':
-        pass
-    else: # presume is a mediawiki
-        # Get API and index and verify
-        api = args.api and args.api or ''
-        index = args.index and args.index or ''
-        if api == '' or index == '':
-            if args.wiki:
-                api2, index2 = mwGetAPIAndIndex(args.wiki)
-                if not api:
-                    api = api2
-                if not index:
-                    index = index2
-            else:
-                if api == '':
-                    pass
-                elif index == '':
-                    index = '/'.join(api.split('/')[:-1]) + '/index.php'
-        # print api
-        # print index
-
-        index2 = None
-        if api:
-            retry = 0
-            maxretries = args.retries
-            retrydelay = 20
-            while retry < maxretries:
-                try:
-                    check = checkAPI(api=api, session=session)
-                    break
-                except requests.exceptions.ConnectionError as e:
-                    print 'Connection error: %s'%(str(e))
-                    retry += 1
-                    print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
-                    time.sleep(retrydelay)
-        if api and check:
-            index2 = check[1]
-            api = check[2]
-            print 'API is OK: ' + api
-        else:
-            if index and not args.wiki:
-                print 'API not available. Trying with index.php only.'
-            else:
-                print 'Error in API. Please, provide a correct path to API'
-                sys.exit(1)
-
-        if index and checkIndex(
-                index=index,
-                cookies=args.cookies,
-                session=session):
-            print 'index.php is OK'
-        else:
-            index = index2
-            if index and index.startswith('//'):
-                index = args.wiki.split('//')[0] + index
-            if index and checkIndex(
-                    index=index,
-                    cookies=args.cookies,
-                    session=session):
-                print 'index.php is OK'
-            else:
-                index = '/'.join(index.split('/')[:-1])
-                if index and checkIndex(
-                        index=index,
-                        cookies=args.cookies,
-                        session=session):
-                    print 'index.php is OK'
-                else:
-                    print 'Error in index.php, please, provide a correct path to index.php'
-                    sys.exit(1)
+    # Get API and index and verify
+    api = args.api and args.api or ''
+    index = args.index and args.index or ''
+    if api == '' or index == '':
+        if args.wiki:
+            if getWikiEngine(args.wiki) == 'MediaWiki':
+                api2, index2 = mwGetAPIAndIndex(args.wiki)
+                if not api:
+                    api = api2
+                if not index:
+                    index = index2
+            else:
+                print 'ERROR: Unsupported wiki. Wiki engines supported are: MediaWiki'
+                sys.exit(1)
+        else:
+            if api == '':
+                pass
+            elif index == '':
+                index = '/'.join(api.split('/')[:-1]) + '/index.php'
+    # print api
+    # print index
+
+    index2 = None
+    if api:
+        retry = 0
+        maxretries = args.retries
+        retrydelay = 20
+        while retry < maxretries:
+            try:
+                check = checkAPI(api=api, session=session)
+                break
+            except requests.exceptions.ConnectionError as e:
+                print 'Connection error: %s'%(str(e))
+                retry += 1
+                print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
+                time.sleep(retrydelay)
+    if api and check:
+        index2 = check[1]
+        api = check[2]
+        print 'API is OK: ' + api
+    else:
+        if index and not args.wiki:
+            print 'API not available. Trying with index.php only.'
+        else:
+            print 'Error in API. Please, provide a correct path to API'
+            sys.exit(1)
+
+    if index and checkIndex(
+            index=index,
+            cookies=args.cookies,
+            session=session):
+        print 'index.php is OK'
+    else:
+        index = index2
+        if index and index.startswith('//'):
+            index = args.wiki.split('//')[0] + index
+        if index and checkIndex(
+                index=index,
+                cookies=args.cookies,
+                session=session):
+            print 'index.php is OK'
+        else:
+            index = '/'.join(index.split('/')[:-1])
+            if index and checkIndex(
+                    index=index,
+                    cookies=args.cookies,
+                    session=session):
+                print 'index.php is OK'
+            else:
+                print 'Error in index.php, please, provide a correct path to index.php'
+                sys.exit(1)
 
     # check user and pass (one requires both)
     if (args.user and not args.password) or (args.password and not args.user):
         print 'ERROR: Both --user and --pass are required for authentication.'
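
Two ideas in this hunk are worth isolating: index.php is guessed to live next to api.php, and checkAPI is retried only on connection errors. A condensed sketch under those assumptions (check here is a stand-in for the script's real helper; the URL is illustrative):

```python
import time
import requests

def guess_index(api):
    # api.php and index.php normally sit in the same directory, e.g.
    # guess_index('https://wiki.example.org/w/api.php')
    #   -> 'https://wiki.example.org/w/index.php'
    return '/'.join(api.split('/')[:-1]) + '/index.php'

def call_with_retries(check, maxretries=5, retrydelay=20):
    # Mirrors the loop above: retry only on connection errors, then give up.
    for _ in range(maxretries):
        try:
            return check()
        except requests.exceptions.ConnectionError as e:
            print('Connection error: %s' % str(e))
            time.sleep(retrydelay)
    return None
```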
@@ -1487,15 +1477,13 @@ def getParameters(params=[]):
         sys.exit(1)
 
     config = {
-        'wikiengine': getWikiEngine(args.wiki),
         'curonly': args.curonly,
         'date': datetime.datetime.now().strftime('%Y%m%d'),
         'api': api,
         'index': index,
         'images': args.images,
-        'pages': args.pages,
         'logs': False,
-        'xml': args.xml, #this should be 'pages'? (and modify in all the script). Xml is mediawiki-centric, other wikis dont export in XML
+        'xml': args.xml,
         'namespaces': namespaces,
         'exnamespaces': exnamespaces,
         'path': args.path and os.path.normpath(args.path) or '',
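
With the wikispaces keys ('wikiengine', 'pages') dropped, the config dict is back to its pre-refactor shape. An illustrative result (all values depend on the command line; the URLs and date are made up):

```python
config = {
    'curonly': False,
    'date': '20160101',   # datetime.datetime.now().strftime('%Y%m%d')
    'api': 'https://wiki.example.org/w/api.php',
    'index': 'https://wiki.example.org/w/index.php',
    'images': True,
    'logs': False,
    'xml': True,
    'namespaces': ['all'],
    'exnamespaces': [],
    'path': '',
}
```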
@@ -1651,33 +1639,10 @@ def checkXMLIntegrity(config={}, titles=[], session=None):
 
 
 def createNewDump(config={}, other={}):
-    if config['wikiengine'] == 'mediawiki':
-        mwCreateNewDump(config=config, other=other)
-    elif config['wikiengine'] == 'wikispaces':
-        wsCreateNewDump(config=config, other=other)
-
-def wsCreateNewDump(config={}, other={}):
-    print 'Trying generating a new dump into a new directory...'
-    if config['pages']:
-        pages = wsGetPageTitles(config=config, session=other['session'])
-        wsSavePageTitles(config=config, pages=pages)
-        generatePageDump(config=config, pages=pages, session=other['session'])
-    if config['images']:
-        images = wsGetImageNames(config=config, session=other['session'])
-        wsSaveImageNames(config=config, images=images)
-        generateImageDump(
-            config=config,
-            other=other,
-            images=images,
-            session=other['session'])
-    if config['logs']:
-        wsSaveLogs(config=config, session=other['session'])
-
-def mwCreateNewDump(config={}, other={}):
     images = []
     print 'Trying generating a new dump into a new directory...'
     if config['xml']:
-        mwGetPageTitles(config=config, session=other['session'])
+        getPageTitles(config=config, session=other['session'])
         titles=readTitles(config)
         generateXMLDump(config=config, titles=titles, session=other['session'])
         checkXMLIntegrity(
@ -1695,16 +1660,8 @@ def mwCreateNewDump(config={}, other={}):
if config['logs']: if config['logs']:
saveLogs(config=config, session=other['session']) saveLogs(config=config, session=other['session'])
def resumePreviousDump(config={}, other={}):
if config['wikiengine'] == 'mediawiki':
mwResumePreviousDump(config=config, other=other)
elif config['wikiengine'] == 'wikispaces':
wsResumePreviousDump(config=config, other=other)
def wsResumePreviousDump(config={}, other={}):
pass
def mwResumePreviousDump(config={}, other={}): def resumePreviousDump(config={}, other={}):
images = [] images = []
print 'Resuming previous dump process...' print 'Resuming previous dump process...'
if config['xml']: if config['xml']:
@ -1727,7 +1684,7 @@ def mwResumePreviousDump(config={}, other={}):
print 'Title list is incomplete. Reloading...' print 'Title list is incomplete. Reloading...'
# do not resume, reload, to avoid inconsistences, deleted pages or # do not resume, reload, to avoid inconsistences, deleted pages or
# so # so
mwGetPageTitles(config=config, session=other['session']) getPageTitles(config=config, session=other['session'])
# checking xml dump # checking xml dump
xmliscomplete = False xmliscomplete = False
@@ -1932,88 +1889,85 @@ def avoidWikimediaProjects(config={}, other={}):
 
 def getWikiEngine(url=''):
     """ Returns the wiki engine of a URL, if known """
 
-    wikiengine = 'unknown'
-    if url:
-        session = requests.Session()
-        session.headers.update({'User-Agent': getUserAgent()})
-        r = session.post(url=url)
-        if r.status_code == 405 or r.text == '':
-            r = session.get(url=url)
-        result = r.text
-    else:
-        return wikiengine.lower()
-
+    session = requests.Session()
+    session.headers.update({'User-Agent': getUserAgent()})
+    r = session.post(url=url)
+    if r.status_code == 405 or r.text == '':
+        r = session.get(url=url)
+    result = r.text
+
+    wikiengine = 'Unknown'
     if re.search(
             ur'(?im)(<meta name="generator" content="DokuWiki)|dokuwiki__site',
             result):
-        wikiengine = 'dokuwiki'
+        wikiengine = 'DokuWiki'
     elif re.search(ur'(?im)(alt="Powered by MediaWiki"|<meta name="generator" content="MediaWiki)', result):
-        wikiengine = 'mediawiki'
+        wikiengine = 'MediaWiki'
     elif re.search(ur'(?im)(>MoinMoin Powered</a>|<option value="LocalSiteMap">)', result):
-        wikiengine = 'moinmoin'
+        wikiengine = 'MoinMoin'
     elif re.search(ur'(?im)(twikiCurrentTopicLink|twikiCurrentWebHomeLink|twikiLink)', result):
-        wikiengine = 'twiki'
+        wikiengine = 'TWiki'
     elif re.search(ur'(?im)(<!--PageHeaderFmt-->)', result):
-        wikiengine = 'pmwiki'
+        wikiengine = 'PmWiki'
     elif re.search(ur'(?im)(<meta name="generator" content="PhpWiki|<meta name="PHPWIKI_VERSION)', result):
-        wikiengine = 'phpwiki'
+        wikiengine = 'PhpWiki'
     elif re.search(ur'(?im)(<meta name="generator" content="Tiki Wiki|Powered by <a href="http://(www\.)?tiki\.org"| id="tiki-(top|main)")', result):
-        wikiengine = 'tikiwiki'
+        wikiengine = 'TikiWiki'
     elif re.search(ur'(?im)(foswikiNoJs|<meta name="foswiki\.|foswikiTable|foswikiContentFooter)', result):
-        wikiengine = 'foswiki'
+        wikiengine = 'FosWiki'
     elif re.search(ur'(?im)(<meta http-equiv="powered by" content="MojoMojo)', result):
-        wikiengine = 'mojomojo'
+        wikiengine = 'MojoMojo'
     elif re.search(ur'(?im)(id="xwiki(content|nav_footer|platformversion|docinfo|maincontainer|data)|/resources/js/xwiki/xwiki|XWiki\.webapppath)', result):
-        wikiengine = 'xwiki'
+        wikiengine = 'XWiki'
     elif re.search(ur'(?im)(<meta id="confluence-(base-url|context-path)")', result):
-        wikiengine = 'confluence'
+        wikiengine = 'Confluence'
     elif re.search(ur'(?im)(<meta name="generator" content="Banana Dance)', result):
-        wikiengine = 'bananadance'
+        wikiengine = 'Banana Dance'
     elif re.search(ur'(?im)(Wheeled by <a class="external-link" href="http://www\.wagn\.org">|<body id="wagn">)', result):
-        wikiengine = 'wagn'
+        wikiengine = 'Wagn'
     elif re.search(ur'(?im)(<meta name="generator" content="MindTouch)', result):
-        wikiengine = 'mindtouch'  # formerly DekiWiki
+        wikiengine = 'MindTouch'  # formerly DekiWiki
     elif re.search(ur'(?im)(<div class="wikiversion">\s*(<p>)?JSPWiki|xmlns:jspwiki="http://www\.jspwiki\.org")', result):
-        wikiengine = 'jspwiki'
+        wikiengine = 'JSPWiki'
     elif re.search(ur'(?im)(Powered by:?\s*(<br ?/>)?\s*<a href="http://kwiki\.org">|\bKwikiNavigation\b)', result):
-        wikiengine = 'kwiki'
+        wikiengine = 'Kwiki'
     elif re.search(ur'(?im)(Powered by <a href="http://www\.anwiki\.com")', result):
-        wikiengine = 'anwiki'
+        wikiengine = 'Anwiki'
     elif re.search(ur'(?im)(<meta name="generator" content="Aneuch|is powered by <em>Aneuch</em>|<!-- start of Aneuch markup -->)', result):
-        wikiengine = 'aneuch'
+        wikiengine = 'Aneuch'
     elif re.search(ur'(?im)(<meta name="generator" content="bitweaver)', result):
         wikiengine = 'bitweaver'
     elif re.search(ur'(?im)(powered by <a href="[^"]*\bzwiki.org(/[^"]*)?">)', result):
-        wikiengine = 'zwiki'
+        wikiengine = 'Zwiki'
     # WakkaWiki forks
     elif re.search(ur'(?im)(<meta name="generator" content="WikkaWiki|<a class="ext" href="(http://wikka\.jsnx\.com/|http://wikkawiki\.org/)">)', result):
-        wikiengine = 'wikkawiki'  # formerly WikkaWakkaWiki
+        wikiengine = 'WikkaWiki'  # formerly WikkaWakkaWiki
     elif re.search(ur'(?im)(<meta name="generator" content="CoMa Wiki)', result):
-        wikiengine = 'comawiki'
+        wikiengine = 'CoMaWiki'
     elif re.search(ur'(?im)(Fonctionne avec <a href="http://www\.wikini\.net)', result):
-        wikiengine = 'wikini'
+        wikiengine = 'WikiNi'
     elif re.search(ur'(?im)(Powered by <a href="[^"]*CitiWiki">CitiWiki</a>)', result):
-        wikiengine = 'citiwiki'
+        wikiengine = 'CitiWiki'
     elif re.search(ur'(?im)(Powered by <a href="http://wackowiki\.com/|title="WackoWiki")', result):
-        wikiengine = 'wackowiki'
+        wikiengine = 'WackoWiki'
     elif re.search(ur'(?im)(Powered by <a href="http://www\.wakkawiki\.com)', result):
         # This may not work for heavily modded/themed installations, e.g.
         # http://operawiki.info/
-        wikiengine = 'wakkawiki'
+        wikiengine = 'WakkaWiki'
     # Custom wikis used by wiki farms
     elif re.search(ur'(?im)(var wikispaces_page|<div class="WikispacesContent)', result):
-        wikiengine = 'wikispaces'
+        wikiengine = 'Wikispaces'
     elif re.search(ur'(?im)(Powered by <a href="http://www\.wikidot\.com">|wikidot-privacy-button-hovertip|javascript:WIKIDOT\.page)', result):
-        wikiengine = 'wikidot'
+        wikiengine = 'Wikidot'
     elif re.search(ur'(?im)(IS_WETPAINT_USER|wetpaintLoad|WPC-bodyContentContainer)', result):
-        wikiengine = 'wetpaint'
+        wikiengine = 'Wetpaint'
     elif re.search(ur'(?im)(<div id="footer-pbwiki">|ws-nav-search|PBinfo *= *{)', result):
         # formerly PBwiki
-        wikiengine = 'pbworks'
+        wikiengine = 'PBworks'
     # if wikiengine == 'Unknown': print result
 
-    return wikiengine.lower()
+    return wikiengine
 
 
 def mwGetAPIAndIndex(url=''):
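
The restored detector is a first-match-wins cascade of regular expressions over the fetched HTML, now returning engine names with their proper capitalization. The same technique compressed to a runnable sketch (patterns abbreviated from the list above; Python 3 syntax, unlike the script's Python 2):

```python
import re
import requests

# Ordered (pattern, engine) pairs; the script checks roughly 30 engines.
ENGINE_PATTERNS = [
    (r'(?im)<meta name="generator" content="DokuWiki|dokuwiki__site', 'DokuWiki'),
    (r'(?im)alt="Powered by MediaWiki"|<meta name="generator" content="MediaWiki', 'MediaWiki'),
    (r'(?im)>MoinMoin Powered</a>|<option value="LocalSiteMap">', 'MoinMoin'),
    (r'(?im)var wikispaces_page|<div class="WikispacesContent', 'Wikispaces'),
]

def get_wiki_engine(url):
    r = requests.post(url)
    if r.status_code == 405 or r.text == '':
        r = requests.get(url)  # some servers reject the initial POST
    for pattern, name in ENGINE_PATTERNS:
        if re.search(pattern, r.text):
            return name
    return 'Unknown'
```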
@@ -2075,7 +2029,7 @@ def main(params=[]):
     avoidWikimediaProjects(config=config, other=other)
 
     print welcome()
-    print 'Analysing %s' % (config['api'] or config['index'])
+    print 'Analysing %s' % (config['api'] and config['api'] or config['index'])
 
     # creating path or resuming if desired
     c = 2
@@ -2124,4 +2078,3 @@ def main(params=[]):
 
 if __name__ == "__main__":
     main()
-
