Restoring dumpgenerator.py to f43b7389a0, the last stable version. I will rewrite the code in the wikiteam/ subdirectory

pull/287/head
emijrp 8 years ago
parent 2783e1cecb
commit 3f697dbb5b

@@ -223,7 +223,7 @@ def getNamespacesAPI(config={}, session=None):
     return namespaces, namespacenames


-def mwGetPageTitlesAPI(config={}, session=None):
+def getPageTitlesAPI(config={}, session=None):
     """ Uses the API to get the list of page titles """
     titles = []
     namespaces, namespacenames = getNamespacesAPI(
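
Note on the function being renamed here: getPageTitlesAPI walks the title list through the API, namespace by namespace, following the continuation cursor until the wiki is exhausted. A minimal sketch of that pagination, assuming a modern MediaWiki api.php endpoint (the helper name and the 500-title limit are illustrative, not part of this commit):

    import requests

    def list_titles(api_url, namespace=0):
        """Yield page titles in one namespace via list=allpages,
        following the 'continue' cursor until the API is exhausted."""
        session = requests.Session()
        params = {'action': 'query', 'list': 'allpages',
                  'apnamespace': namespace, 'aplimit': 500, 'format': 'json'}
        while True:
            data = session.get(api_url, params=params).json()
            for page in data['query']['allpages']:
                yield page['title']
            if 'continue' not in data:
                break
            params.update(data['continue'])  # carry the cursor into the next request

(Older MediaWiki releases report the cursor differently, via apfrom/query-continue; the Python 2-era script itself loops on apfrom.)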
@@ -292,7 +292,7 @@ def mwGetPageTitlesAPI(config={}, session=None):
             delay(config=config, session=session)
         print ' %d titles retrieved in the namespace %d' % (c, namespace)

-def mwGetPageTitlesScraper(config={}, session=None):
+def getPageTitlesScraper(config={}, session=None):
     """ Scrape the list of page titles from Special:Allpages """
     titles = []
     namespaces, namespacenames = getNamespacesScraper(
@@ -376,16 +376,7 @@ def mwGetPageTitlesScraper(config={}, session=None):
     return titles


-def wsGetPageTitles(config={}, session=None):
-    """ Get list of page titles """
-    titles = []
-    return titles
-
-def mwGetPageTitles(config={}, session=None):
+def getPageTitles(config={}, session=None):
     """ Get list of page titles """
     # http://en.wikipedia.org/wiki/Special:AllPages
     # http://archiveteam.org/index.php?title=Special:AllPages
@@ -399,11 +390,11 @@ def mwGetPageTitles(config={}, session=None):
         test = getJSON(r)
         if ('warnings' in test and 'allpages' in test['warnings'] and '*' in test['warnings']['allpages']
                 and test['warnings']['allpages']['*'] == 'The "allpages" module has been disabled.'):
-            titles = mwGetPageTitlesScraper(config=config, session=session)
+            titles = getPageTitlesScraper(config=config, session=session)
         else:
-            titles = mwGetPageTitlesAPI(config=config, session=session)
+            titles = getPageTitlesAPI(config=config, session=session)
     elif 'index' in config and config['index']:
-        titles = mwGetPageTitlesScraper(config=config, session=session)
+        titles = getPageTitlesScraper(config=config, session=session)

     titlesfilename = '%s-%s-titles.txt' % (
         domain2prefix(config=config), config['date'])
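
The hunk above restores the dispatch between the two title-listing paths: one probe request is made with list=allpages, and only if the wiki answers with the module-disabled warning does the script fall back to scraping Special:Allpages. A sketch of that probe, reusing the exact warning string from the code above (the helper name is illustrative):

    def allpages_disabled(session, api_url):
        """True if the wiki reports the allpages API module as disabled."""
        r = session.post(api_url, data={'action': 'query', 'list': 'allpages',
                                        'format': 'json'})
        warning = r.json().get('warnings', {}).get('allpages', {}).get('*', '')
        return warning == 'The "allpages" module has been disabled.'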
@@ -1268,7 +1259,7 @@ def getParameters(params=[]):
         '--retries',
         metavar=5,
         default=5,
-        help="maximum number of retries")
+        help="Maximum number of retries for ")
     parser.add_argument('--path', help='path to store wiki dump at')
     parser.add_argument(
         '--resume',
@@ -1276,11 +1267,11 @@ def getParameters(params=[]):
         help='resumes previous incomplete dump (requires --path)')
     parser.add_argument('--force', action='store_true', help='')
     parser.add_argument(
-        '--user', help='username if authentication is required')
+        '--user', help='Username if authentication is required.')
     parser.add_argument(
         '--pass',
         dest='password',
-        help='password if authentication is required')
+        help='Password if authentication is required.')

     # URL params
     groupWikiOrAPIOrIndex = parser.add_argument_group()
@@ -1308,8 +1299,6 @@ def getParameters(params=[]):
         help='store only the current version of pages')
     groupDownload.add_argument(
         '--images', action='store_true', help="generates an image dump")
-    groupDownload.add_argument(
-        '--pages', action='store_true', help="generates a page dump")
     groupDownload.add_argument(
         '--namespaces',
         metavar="1,2,3",
@@ -1371,79 +1360,80 @@ def getParameters(params=[]):
         print 'ERROR: URLs must start with http:// or https://\n'
         parser.print_help()
         sys.exit(1)

-    if getWikiEngine(args.wiki) == 'wikispaces':
-        pass
-    else: # presume is a mediawiki
-        # Get API and index and verify
-        api = args.api and args.api or ''
-        index = args.index and args.index or ''
-        if api == '' or index == '':
-            if args.wiki:
+    # Get API and index and verify
+    api = args.api and args.api or ''
+    index = args.index and args.index or ''
+    if api == '' or index == '':
+        if args.wiki:
             if getWikiEngine(args.wiki) == 'MediaWiki':
                 api2, index2 = mwGetAPIAndIndex(args.wiki)
                 if not api:
                     api = api2
                 if not index:
                     index = index2
             else:
-                if api == '':
-                    pass
-                elif index == '':
-                    index = '/'.join(api.split('/')[:-1]) + '/index.php'
+                print 'ERROR: Unsupported wiki. Wiki engines supported are: MediaWiki'
+                sys.exit(1)
+        else:
+            if api == '':
+                pass
+            elif index == '':
+                index = '/'.join(api.split('/')[:-1]) + '/index.php'

-        # print api
-        # print index
-        index2 = None
+    # print api
+    # print index
+    index2 = None

-        if api:
-            retry = 0
-            maxretries = args.retries
-            retrydelay = 20
-            while retry < maxretries:
-                try:
-                    check = checkAPI(api=api, session=session)
-                    break
-                except requests.exceptions.ConnectionError as e:
-                    print 'Connection error: %s'%(str(e))
-                    retry += 1
-                    print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
-                    time.sleep(retrydelay)
-        if api and check:
-            index2 = check[1]
-            api = check[2]
-            print 'API is OK: ' + api
+    if api:
+        retry = 0
+        maxretries = args.retries
+        retrydelay = 20
+        while retry < maxretries:
+            try:
+                check = checkAPI(api=api, session=session)
+                break
+            except requests.exceptions.ConnectionError as e:
+                print 'Connection error: %s'%(str(e))
+                retry += 1
+                print "Start retry attempt %d in %d seconds."%(retry+1, retrydelay)
+                time.sleep(retrydelay)
+    if api and check:
+        index2 = check[1]
+        api = check[2]
+        print 'API is OK: ' + api
-        else:
-            if index and not args.wiki:
-                print 'API not available. Trying with index.php only.'
+    else:
+        if index and not args.wiki:
+            print 'API not available. Trying with index.php only.'
         else:
-            print 'Error in API. Please, provide a correct path to API'
-            sys.exit(1)
+            print 'Error in API. Please, provide a correct path to API'
+            sys.exit(1)

-        if index and checkIndex(
-                index=index,
-                cookies=args.cookies,
-                session=session):
-            print 'index.php is OK'
-        else:
-            index = index2
-            if index and index.startswith('//'):
-                index = args.wiki.split('//')[0] + index
+    if index and checkIndex(
+            index=index,
+            cookies=args.cookies,
+            session=session):
+        print 'index.php is OK'
+    else:
+        index = index2
+        if index and index.startswith('//'):
+            index = args.wiki.split('//')[0] + index
-            index = '/'.join(index.split('/')[:-1])
-            if index and checkIndex(
-                    index=index,
-                    cookies=args.cookies,
-                    session=session):
-                print 'index.php is OK'
-            else:
+        index = '/'.join(index.split('/')[:-1])
+        if index and checkIndex(
+                index=index,
+                cookies=args.cookies,
+                session=session):
+            print 'index.php is OK'
+        else:
-                print 'Error in index.php, please, provide a correct path to index.php'
-                sys.exit(1)
+            print 'Error in index.php, please, provide a correct path to index.php'
+            sys.exit(1)

     # check user and pass (one requires both)
     if (args.user and not args.password) or (args.password and not args.user):
         print 'ERROR: Both --user and --pass are required for authentication.'
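
The restored block above retries checkAPI on connection errors with a fixed 20-second delay, up to --retries attempts. The same pattern as a standalone helper (a sketch only; the script inlines the loop exactly as shown above):

    import time
    import requests

    def with_retries(func, maxretries=5, retrydelay=20):
        """Call func(), retrying on connection errors with a fixed delay."""
        for attempt in range(maxretries):
            try:
                return func()
            except requests.exceptions.ConnectionError as e:
                print('Connection error: %s' % str(e))
                print('Start retry attempt %d in %d seconds.' % (attempt + 1, retrydelay))
                time.sleep(retrydelay)
        return None  # retries exhausted; the caller decides whether to abort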
@@ -1487,15 +1477,13 @@ def getParameters(params=[]):
         sys.exit(1)

     config = {
-        'wikiengine': getWikiEngine(args.wiki),
         'curonly': args.curonly,
         'date': datetime.datetime.now().strftime('%Y%m%d'),
         'api': api,
         'index': index,
         'images': args.images,
-        'pages': args.pages,
         'logs': False,
-        'xml': args.xml, #this should be 'pages'? (and modify in all the script). Xml is mediawiki-centric, other wikis dont export in XML
+        'xml': args.xml,
         'namespaces': namespaces,
         'exnamespaces': exnamespaces,
         'path': args.path and os.path.normpath(args.path) or '',
@@ -1651,33 +1639,10 @@ def checkXMLIntegrity(config={}, titles=[], session=None):

 def createNewDump(config={}, other={}):
-    if config['wikiengine'] == 'mediawiki':
-        mwCreateNewDump(config=config, other=other)
-    elif config['wikiengine'] == 'wikispaces':
-        wsCreateNewDump(config=config, other=other)
-
-def wsCreateNewDump(config={}, other={}):
-    print 'Trying generating a new dump into a new directory...'
-    if config['pages']:
-        pages = wsGetPageTitles(config=config, session=other['session'])
-        wsSavePageTitles(config=config, pages=pages)
-        generatePageDump(config=config, pages=pages, session=other['session'])
-    if config['images']:
-        images = wsGetImageNames(config=config, session=other['session'])
-        wsSaveImageNames(config=config, images=images)
-        generateImageDump(
-            config=config,
-            other=other,
-            images=images,
-            session=other['session'])
-    if config['logs']:
-        wsSaveLogs(config=config, session=other['session'])
-
-def mwCreateNewDump(config={}, other={}):
     images = []
     print 'Trying generating a new dump into a new directory...'
     if config['xml']:
-        mwGetPageTitles(config=config, session=other['session'])
+        getPageTitles(config=config, session=other['session'])
         titles=readTitles(config)
         generateXMLDump(config=config, titles=titles, session=other['session'])
         checkXMLIntegrity(
@@ -1695,16 +1660,8 @@ def mwCreateNewDump(config={}, other={}):
     if config['logs']:
         saveLogs(config=config, session=other['session'])

-def resumePreviousDump(config={}, other={}):
-    if config['wikiengine'] == 'mediawiki':
-        mwResumePreviousDump(config=config, other=other)
-    elif config['wikiengine'] == 'wikispaces':
-        wsResumePreviousDump(config=config, other=other)
-
-def wsResumePreviousDump(config={}, other={}):
-    pass
-
-def mwResumePreviousDump(config={}, other={}):
+def resumePreviousDump(config={}, other={}):
     images = []
     print 'Resuming previous dump process...'
     if config['xml']:
@@ -1727,7 +1684,7 @@ def mwResumePreviousDump(config={}, other={}):
             print 'Title list is incomplete. Reloading...'
             # do not resume, reload, to avoid inconsistences, deleted pages or
            # so
-            mwGetPageTitles(config=config, session=other['session'])
+            getPageTitles(config=config, session=other['session'])

        # checking xml dump
        xmliscomplete = False
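
resumePreviousDump deliberately reloads the title list rather than appending to a partial one, to avoid inconsistencies from pages deleted or created in the meantime; completeness is judged by a sentinel line at the end of the titles file. A sketch of that check, assuming the '--END--' marker the script appends when a title list finishes (the marker is an assumption from the script's conventions, not visible in this hunk):

    def titles_complete(titlesfile):
        """True if the titles file ends with the completion sentinel."""
        last = ''
        with open(titlesfile) as f:
            for line in f:
                if line.strip():
                    last = line.strip()
        return last == '--END--'  # assumed sentinel; absent means a cut-short list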
@@ -1932,88 +1889,85 @@ def avoidWikimediaProjects(config={}, other={}):

 def getWikiEngine(url=''):
     """ Returns the wiki engine of a URL, if known """

-    wikiengine = 'unknown'
-    if url:
-        session = requests.Session()
-        session.headers.update({'User-Agent': getUserAgent()})
-        r = session.post(url=url)
-        if r.status_code == 405 or r.text == '':
-            r = session.get(url=url)
-        result = r.text
-    else:
-        return wikiengine.lower()
+    session = requests.Session()
+    session.headers.update({'User-Agent': getUserAgent()})
+    r = session.post(url=url)
+    if r.status_code == 405 or r.text == '':
+        r = session.get(url=url)
+    result = r.text

+    wikiengine = 'Unknown'
     if re.search(
             ur'(?im)(<meta name="generator" content="DokuWiki)|dokuwiki__site',
             result):
-        wikiengine = 'dokuwiki'
+        wikiengine = 'DokuWiki'
     elif re.search(ur'(?im)(alt="Powered by MediaWiki"|<meta name="generator" content="MediaWiki)', result):
-        wikiengine = 'mediawiki'
+        wikiengine = 'MediaWiki'
     elif re.search(ur'(?im)(>MoinMoin Powered</a>|<option value="LocalSiteMap">)', result):
-        wikiengine = 'moinmoin'
+        wikiengine = 'MoinMoin'
     elif re.search(ur'(?im)(twikiCurrentTopicLink|twikiCurrentWebHomeLink|twikiLink)', result):
-        wikiengine = 'twiki'
+        wikiengine = 'TWiki'
     elif re.search(ur'(?im)(<!--PageHeaderFmt-->)', result):
-        wikiengine = 'pmwiki'
+        wikiengine = 'PmWiki'
     elif re.search(ur'(?im)(<meta name="generator" content="PhpWiki|<meta name="PHPWIKI_VERSION)', result):
-        wikiengine = 'phpwiki'
+        wikiengine = 'PhpWiki'
     elif re.search(ur'(?im)(<meta name="generator" content="Tiki Wiki|Powered by <a href="http://(www\.)?tiki\.org"| id="tiki-(top|main)")', result):
-        wikiengine = 'tikiwiki'
+        wikiengine = 'TikiWiki'
     elif re.search(ur'(?im)(foswikiNoJs|<meta name="foswiki\.|foswikiTable|foswikiContentFooter)', result):
-        wikiengine = 'foswiki'
+        wikiengine = 'FosWiki'
     elif re.search(ur'(?im)(<meta http-equiv="powered by" content="MojoMojo)', result):
-        wikiengine = 'mojomojo'
+        wikiengine = 'MojoMojo'
     elif re.search(ur'(?im)(id="xwiki(content|nav_footer|platformversion|docinfo|maincontainer|data)|/resources/js/xwiki/xwiki|XWiki\.webapppath)', result):
-        wikiengine = 'xwiki'
+        wikiengine = 'XWiki'
     elif re.search(ur'(?im)(<meta id="confluence-(base-url|context-path)")', result):
-        wikiengine = 'confluence'
+        wikiengine = 'Confluence'
     elif re.search(ur'(?im)(<meta name="generator" content="Banana Dance)', result):
-        wikiengine = 'bananadance'
+        wikiengine = 'Banana Dance'
     elif re.search(ur'(?im)(Wheeled by <a class="external-link" href="http://www\.wagn\.org">|<body id="wagn">)', result):
-        wikiengine = 'wagn'
+        wikiengine = 'Wagn'
     elif re.search(ur'(?im)(<meta name="generator" content="MindTouch)', result):
-        wikiengine = 'mindtouch' # formerly DekiWiki
+        wikiengine = 'MindTouch' # formerly DekiWiki
     elif re.search(ur'(?im)(<div class="wikiversion">\s*(<p>)?JSPWiki|xmlns:jspwiki="http://www\.jspwiki\.org")', result):
-        wikiengine = 'jspwiki'
+        wikiengine = 'JSPWiki'
     elif re.search(ur'(?im)(Powered by:?\s*(<br ?/>)?\s*<a href="http://kwiki\.org">|\bKwikiNavigation\b)', result):
-        wikiengine = 'kwiki'
+        wikiengine = 'Kwiki'
     elif re.search(ur'(?im)(Powered by <a href="http://www\.anwiki\.com")', result):
-        wikiengine = 'anwiki'
+        wikiengine = 'Anwiki'
     elif re.search(ur'(?im)(<meta name="generator" content="Aneuch|is powered by <em>Aneuch</em>|<!-- start of Aneuch markup -->)', result):
-        wikiengine = 'aneuch'
+        wikiengine = 'Aneuch'
     elif re.search(ur'(?im)(<meta name="generator" content="bitweaver)', result):
         wikiengine = 'bitweaver'
     elif re.search(ur'(?im)(powered by <a href="[^"]*\bzwiki.org(/[^"]*)?">)', result):
-        wikiengine = 'zwiki'
+        wikiengine = 'Zwiki'
     # WakkaWiki forks
     elif re.search(ur'(?im)(<meta name="generator" content="WikkaWiki|<a class="ext" href="(http://wikka\.jsnx\.com/|http://wikkawiki\.org/)">)', result):
-        wikiengine = 'wikkawiki' # formerly WikkaWakkaWiki
+        wikiengine = 'WikkaWiki' # formerly WikkaWakkaWiki
     elif re.search(ur'(?im)(<meta name="generator" content="CoMa Wiki)', result):
-        wikiengine = 'comawiki'
+        wikiengine = 'CoMaWiki'
     elif re.search(ur'(?im)(Fonctionne avec <a href="http://www\.wikini\.net)', result):
-        wikiengine = 'wikini'
+        wikiengine = 'WikiNi'
     elif re.search(ur'(?im)(Powered by <a href="[^"]*CitiWiki">CitiWiki</a>)', result):
-        wikiengine = 'citiwiki'
+        wikiengine = 'CitiWiki'
     elif re.search(ur'(?im)(Powered by <a href="http://wackowiki\.com/|title="WackoWiki")', result):
-        wikiengine = 'wackowiki'
+        wikiengine = 'WackoWiki'
     elif re.search(ur'(?im)(Powered by <a href="http://www\.wakkawiki\.com)', result):
         # This may not work for heavily modded/themed installations, e.g.
         # http://operawiki.info/
-        wikiengine = 'wakkawiki'
+        wikiengine = 'WakkaWiki'
     # Custom wikis used by wiki farms
     elif re.search(ur'(?im)(var wikispaces_page|<div class="WikispacesContent)', result):
-        wikiengine = 'wikispaces'
+        wikiengine = 'Wikispaces'
     elif re.search(ur'(?im)(Powered by <a href="http://www\.wikidot\.com">|wikidot-privacy-button-hovertip|javascript:WIKIDOT\.page)', result):
-        wikiengine = 'wikidot'
+        wikiengine = 'Wikidot'
     elif re.search(ur'(?im)(IS_WETPAINT_USER|wetpaintLoad|WPC-bodyContentContainer)', result):
-        wikiengine = 'wetpaint'
+        wikiengine = 'Wetpaint'
     elif re.search(ur'(?im)(<div id="footer-pbwiki">|ws-nav-search|PBinfo *= *{)', result):
         # formerly PBwiki
-        wikiengine = 'pbworks'
+        wikiengine = 'PBworks'
     # if wikiengine == 'Unknown': print result

-    return wikiengine.lower()
+    return wikiengine

 def mwGetAPIAndIndex(url=''):
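
getWikiEngine, restored above to CamelCase engine names, fingerprints a wiki in two steps: fetch the page once (POST first, falling back to GET on a 405 or an empty body), then run the HTML through an ordered chain of engine-specific regexes such as generator meta tags. A single-engine version of the same idea, using the MediaWiki pattern from the code above (the function name, User-Agent, and URL are illustrative):

    import re
    import requests

    def looks_like_mediawiki(url):
        """Fetch a page and look for MediaWiki's generator tag or badge."""
        r = requests.get(url, headers={'User-Agent': 'wiki-probe'})
        return bool(re.search(
            r'(?im)(alt="Powered by MediaWiki"|<meta name="generator" content="MediaWiki)',
            r.text))

    # looks_like_mediawiki('https://wiki.example.org/')  ->  True or False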
@@ -2075,7 +2029,7 @@ def main(params=[]):
     avoidWikimediaProjects(config=config, other=other)

     print welcome()
-    print 'Analysing %s' % (config['api'] or config['index'])
+    print 'Analysing %s' % (config['api'] and config['api'] or config['index'])

     # creating path or resuming if desired
     c = 2
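
The restored line uses the old Python `X and X or Y` idiom. When the same value is both the test and the result, it is equivalent to the plain `X or Y` it replaces, so this is a stylistic revert; the idiom only misbehaves when the "true" branch can itself be falsy:

    api = ''
    index = 'http://example.org/w/index.php'  # illustrative values
    assert (api and api or index) == (api or index) == index
    # In general, `cond and a or b` silently yields b when a is falsy;
    # `a if cond else b` avoids that pitfall.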
@@ -2124,4 +2078,3 @@ def main(params=[]):

 if __name__ == "__main__":
     main()
