
adding wiki engine detector

Emilio J. Rodríguez-Posada 2014-07-03 18:17:02 +02:00
parent eb97cf1adf
commit 9553e3550c
2 changed files with 52 additions and 10 deletions

dumpgenerator.py

@@ -881,13 +881,15 @@ def getParameters(params=[]):
     parser = argparse.ArgumentParser(description='')
-    parser.add_argument('-v', '--version', action='version', version=(params[0] + " version " + getVersion()))
+    parser.add_argument('-v', '--version', action='version', version=getVersion())
     parser.add_argument('--cookies', metavar="cookies.txt", help="path to a cookies.txt file")
     parser.add_argument('--delay', metavar=5, default=0, help="adds a delay (in seconds)")
+    parser.add_argument('--get-wiki-engine', action='store_true', help="returns the wiki engine")
-    groupAPIOrIndex = parser.add_mutually_exclusive_group(required=True)
-    groupAPIOrIndex.add_argument('--api', help="URL to api.php")
-    groupAPIOrIndex.add_argument('--index', help="URL to index.php")
+    groupWikiOrAPIOrIndex = parser.add_mutually_exclusive_group(required=True)
+    groupWikiOrAPIOrIndex.add_argument('wiki', default='', nargs='?', help="URL to wiki")
+    groupWikiOrAPIOrIndex.add_argument('--api', help="URL to api.php")
+    groupWikiOrAPIOrIndex.add_argument('--index', help="URL to index.php")
     groupXMLOrImages = parser.add_argument_group()
     groupXMLOrImages.add_argument('--xml', action='store_true', help="generates a full history XML dump (--xml --curonly for current revisions only)")
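
The new groupWikiOrAPIOrIndex lets a bare wiki URL be passed as a positional argument while keeping --api and --index as mutually exclusive alternatives. A minimal standalone sketch (parser name and URL are illustrative, not part of the commit) of why nargs='?' matters here:

    import argparse

    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    # nargs='?' makes the positional optional; argparse rejects a required
    # (i.e. plain) positional inside a mutually exclusive group
    group.add_argument('wiki', default='', nargs='?', help="URL to wiki")
    group.add_argument('--api', help="URL to api.php")
    group.add_argument('--index', help="URL to index.php")

    print parser.parse_args(['http://example.org']).wiki
    # parse_args([]) still errors: one of wiki, --api, --index is required
    # parse_args(['http://example.org', '--api', '...']) errors: mutually exclusive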
@@ -902,16 +904,24 @@ def getParameters(params=[]):
     parser.add_argument('--exnamespaces', metavar="1,2,3", help='comma-separated value of namespaces to exclude')
     args = parser.parse_args()
     #print args
+    # Execute excluding args
+    if args.get_wiki_engine and args.wiki and (args.wiki.startswith('http://') or args.wiki.startswith('https://')):
+        print getWikiEngine(url=args.wiki)
+        sys.exit()
+    # End execute excluding args
     # check API URL
     if args.api and (not args.api.startswith('http://') and not args.api.startswith('https://')):
-        print 'api.php must start with http:// or https://\n'
+        print args.api
+        print 'ERROR: URL to api.php must start with http:// or https://\n'
         parser.print_usage()
         sys.exit(1)
     # check index URL
     if args.index and (not args.index.startswith('http://') and not args.index.startswith('https://')):
-        print 'index.php must start with http:// or https://\n'
+        print 'ERROR: URL to index.php must start with http:// or https://\n'
         parser.print_usage()
         sys.exit(1)
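
The early-exit block above means --get-wiki-engine prints the detected engine and quits before any dump configuration happens, which is also why print welcome() is moved later in main() below. The detector can equally be called directly; a sketch assuming dumpgenerator.py is importable from the current directory and the site is reachable (the expected value comes from the tests added in this commit):

    from dumpgenerator import getWikiEngine

    print getWikiEngine(url='https://wiki.debian.org')   # expected: MoinMoin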
@@ -999,7 +1009,6 @@ def getParameters(params=[]):
         print 'Error in index.php, please, provide a correct path to index.php'
         sys.exit()
     #calculating path, if not defined by user with --path=
     if not config['path']:
         config['path'] = './%s-%s-wikidump' % (domain2prefix(config=config), config['date'])
@@ -1290,13 +1299,35 @@ def avoidWikimediaProjects(config={}, other={}):
         print 'Thanks!'
         sys.exit()
+def getWikiEngine(url=''):
+    """ Returns the wiki engine of a URL, if known """
+    req = urllib2.Request(url=url, headers={'User-Agent': getUserAgent(), 'Accept-Encoding': 'gzip'})
+    f = urllib2.urlopen(req)
+    if f.headers.get('Content-Encoding') and 'gzip' in f.headers.get('Content-Encoding'):
+        raw = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
+    else:
+        raw = f.read()
+    f.close()
+    wikiengine = 'Unknown'
+    if re.search(ur'(?im)(<meta name="generator" content="DokuWiki)', raw):
+        wikiengine = 'DokuWiki'
+    elif re.search(ur'(?im)(alt="Powered by MediaWiki"|<meta name="generator" content="MediaWiki)', raw):
+        wikiengine = 'MediaWiki'
+    elif re.search(ur'(?im)(>MoinMoin Powered</a>)', raw):
+        wikiengine = 'MoinMoin'
+    return wikiengine
 def main(params=[]):
     """ Main function """
-    print welcome()
     configfilename = 'config.txt'
     config, other = getParameters(params=params)
     avoidWikimediaProjects(config=config, other=other)
+    print welcome()
     print 'Analysing %s' % (config['api'] and config['api'] or config['index'])
     #creating path or resuming if desired
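
getWikiEngine boils down to fetching the page (gunzipping it if the server compressed the response) and testing engine fingerprints in order, falling back to 'Unknown'. A sketch of that matching step in isolation, with one made-up fingerprint showing where support for another engine would slot in ('SomeWiki' is hypothetical; the other patterns mirror the commit):

    import re

    def detectEngine(raw):
        # raw is the decoded HTML of the wiki's front page
        if re.search(ur'(?im)<meta name="generator" content="DokuWiki', raw):
            return 'DokuWiki'
        elif re.search(ur'(?im)alt="Powered by MediaWiki"|<meta name="generator" content="MediaWiki', raw):
            return 'MediaWiki'
        elif re.search(ur'(?im)>MoinMoin Powered</a>', raw):
            return 'MoinMoin'
        elif re.search(ur'(?im)<meta name="generator" content="SomeWiki', raw):  # hypothetical engine
            return 'SomeWiki'
        return 'Unknown'

    print detectEngine('<meta name="generator" content="DokuWiki Release 2014-05-05" />')  # DokuWiki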

testing/test_dumpgenerator.py

@@ -22,7 +22,7 @@ import time
 import unittest
 import urllib
 import urllib2
-from dumpgenerator import delay, getImageFilenamesURL, getImageFilenamesURLAPI, getUserAgent
+from dumpgenerator import delay, getImageFilenamesURL, getImageFilenamesURLAPI, getUserAgent, getWikiEngine
 class TestDumpgenerator(unittest.TestCase):
     #Documentation
@@ -73,8 +73,19 @@ class TestDumpgenerator(unittest.TestCase):
         self.assertTrue(len(result_index) == imagecount)
         self.assertTrue(filetocheck in [filename for filename, url, uploader in result_index])
+    def test_getWikiEngine(self):
+        tests = [
+            ['https://www.dokuwiki.org', 'DokuWiki'],
+            ['http://wiki.openwrt.org', 'DokuWiki'],
+            ['http://moinmo.in', 'MoinMoin'],
+            ['https://wiki.debian.org', 'MoinMoin'],
+        ]
+        for wiki, engine in tests:
+            print 'Testing', wiki
+            self.assertTrue(getWikiEngine(wiki) == engine)
 if __name__ == '__main__':
     #copying dumpgenerator.py to this directory
-    shutil.copy2('../dumpgenerator.py', './dumpgenerator.py')
+    #shutil.copy2('../dumpgenerator.py', './dumpgenerator.py')
     unittest.main()
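
With the shutil.copy2 call commented out, dumpgenerator.py now has to be placed next to the test module by hand before it can be imported. A sketch of running only the new engine test (live network access required; class and method names as defined above):

    import unittest
    from test_dumpgenerator import TestDumpgenerator

    suite = unittest.TestSuite([TestDumpgenerator('test_getWikiEngine')])
    unittest.TextTestRunner(verbosity=2).run(suite)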