|
|
|
@ -23,10 +23,21 @@ try:
|
|
|
|
|
from kitchen.text.converters import getwriter, to_unicode
|
|
|
|
|
except ImportError:
|
|
|
|
|
print "Please install the kitchen module."
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
import xml.etree.cElementTree as ET
|
|
|
|
|
except ImportError:
|
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
|
|
|
|
|
|
import xml.dom.minidom as MD
|
|
|
|
|
|
|
|
|
|
import cookielib
|
|
|
|
|
import cPickle
|
|
|
|
|
import datetime
|
|
|
|
|
import sys
|
|
|
|
|
import io
|
|
|
|
|
import traceback
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
import argparse
|
|
|
|
|
except ImportError:
|
|
|
|
@ -63,7 +74,7 @@ except ImportError:
|
|
|
|
|
UTF8Writer = getwriter('utf8')
|
|
|
|
|
sys.stdout = UTF8Writer(sys.stdout)
|
|
|
|
|
|
|
|
|
|
__VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org
|
|
|
|
|
__VERSION__ = '0.4.1-alpha' # major, minor, micro: semver.org
|
|
|
|
|
|
|
|
|
|
class PageMissingError(Exception):
|
|
|
|
|
def __init__(self, title, xml):
|
|
|
|
@ -164,7 +175,7 @@ def getNamespacesScraper(config={}, session=None):
|
|
|
|
|
namespacenames = {0: ''} # main is 0, no prefix
|
|
|
|
|
if namespaces:
|
|
|
|
|
r = session.post(
|
|
|
|
|
url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
|
|
|
|
|
url=config['index'], params={'title': 'Special:Allpages'}, timeout=60)
|
|
|
|
|
raw = r.text
|
|
|
|
|
delay(config=config, session=session)
|
|
|
|
|
|
|
|
|
@ -206,7 +217,7 @@ def getNamespacesAPI(config={}, session=None):
|
|
|
|
|
'meta': 'siteinfo',
|
|
|
|
|
'siprop': 'namespaces',
|
|
|
|
|
'format': 'json'},
|
|
|
|
|
timeout=30
|
|
|
|
|
timeout=60
|
|
|
|
|
)
|
|
|
|
|
result = getJSON(r)
|
|
|
|
|
delay(config=config, session=session)
|
|
|
|
@ -281,7 +292,7 @@ def getPageTitlesScraper(config={}, session=None):
|
|
|
|
|
print ' Retrieving titles in the namespace', namespace
|
|
|
|
|
url = '%s?title=Special:Allpages&namespace=%s' % (
|
|
|
|
|
config['index'], namespace)
|
|
|
|
|
r = session.get(url=url, timeout=30)
|
|
|
|
|
r = session.get(url=url, timeout=60)
|
|
|
|
|
raw = r.text
|
|
|
|
|
raw = cleanHTML(raw)
|
|
|
|
|
|
|
|
|
@ -455,7 +466,7 @@ def getXMLHeader(config={}, session=None):
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
try:
|
|
|
|
|
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
|
|
|
|
|
xml = "".join([x for x in selectXMLQueryMode(config=config, title=randomtitle, verbose=False, session=session)])
|
|
|
|
|
except PageMissingError as pme:
|
|
|
|
|
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
|
|
|
|
|
xml = pme.xml
|
|
|
|
@ -473,11 +484,11 @@ def getXMLHeader(config={}, session=None):
|
|
|
|
|
'meta': 'siteinfo',
|
|
|
|
|
'siprop': 'namespaces',
|
|
|
|
|
'format': 'json'},
|
|
|
|
|
timeout=120
|
|
|
|
|
timeout=60
|
|
|
|
|
)
|
|
|
|
|
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
|
|
|
|
|
+ ':Export'
|
|
|
|
|
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
|
|
|
|
|
xml = "".join([x for x in selectXMLQueryMode(config=config, title=randomtitle, verbose=False, session=session)])
|
|
|
|
|
except PageMissingError as pme:
|
|
|
|
|
xml = pme.xml
|
|
|
|
|
except ExportAbortedError:
|
|
|
|
@ -500,7 +511,7 @@ def getXMLHeader(config={}, session=None):
|
|
|
|
|
def getXMLFileDesc(config={}, title='', session=None):
    """ Get XML for image description page """
    # NOTE(review): this mutates the *shared* config dict — 'curonly' stays 1
    # for every later call that reuses the same config; confirm callers expect
    # that side effect before changing it.
    config['curonly'] = 1  # tricky to get only the most recent desc
    # Drain the page-XML generator (Special:Export or api.php, depending on
    # config['apiquery']) into a single string.
    return("".join([x for x in selectXMLQueryMode( config=config, title=title, verbose=False, session=session)]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def getUserAgent():
|
|
|
|
@ -521,7 +532,216 @@ def logerror(config={}, text=''):
|
|
|
|
|
output = u'%s: %s\n' % (
|
|
|
|
|
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
|
|
|
|
|
outfile.write(output.encode('utf-8'))
|
|
|
|
|
def reconstructRevisions(root=None):
    """Rebuild export-style <revision> elements from an API:Query response.

    root -- ElementTree root of an api.php ?action=query&prop=revisions
            response; must contain query/pages/page/revisions/rev nodes.

    Returns (page, edits): a <stub> Element with one <revision> child per
    <rev>, and the number of revisions converted.  RevisionDelete'd fields
    ('userhidden', 'commenthidden', 'texthidden' attributes) are emitted as
    deleted="deleted" markers instead of content.

    Raises whatever broke on a malformed <rev>, after printing the element
    and its traceback for diagnosis.
    """
    page = ET.Element('stub')
    edits = 0
    for rev in root.find('query').find('pages').find('page').find('revisions').findall('rev'):
        try:
            rev_ = ET.SubElement(page, 'revision')
            ET.SubElement(rev_, 'id').text = rev.attrib['revid']
            ET.SubElement(rev_, 'timestamp').text = rev.attrib['timestamp']
            contributor = ET.SubElement(rev_, 'contributor')
            # 'userhidden' marks a suppressed author: flag the contributor as
            # deleted rather than reading the absent user/userid attributes.
            # ('x in d' replaces the deprecated d.has_key(x).)
            if 'userhidden' not in rev.attrib:
                ET.SubElement(contributor, 'username').text = rev.attrib['user']
                ET.SubElement(contributor, 'id').text = rev.attrib['userid']
            else:
                contributor.set('deleted', 'deleted')
            comment = ET.SubElement(rev_, 'comment')
            if 'commenthidden' not in rev.attrib:
                comment.text = rev.attrib['comment']
            else:
                comment.set('deleted', 'deleted')

            # some revisions do not return model and format, so just use hard-code
            ET.SubElement(rev_, 'model').text = 'wikitext'
            ET.SubElement(rev_, 'format').text = 'text/x-wiki'
            text = ET.SubElement(rev_, 'text')
            if 'texthidden' not in rev.attrib:
                text.attrib['xml:space'] = "preserve"
                text.attrib['bytes'] = rev.attrib['size']
                text.text = rev.text
            else:
                text.set('deleted', 'deleted')
            # sha1 deliberately omitted: it is invalid for the dump XML schema here
            if 'minor' in rev.attrib:
                ET.SubElement(rev_, 'minor')
            edits += 1
        except Exception:
            # Dump the offending <rev> for diagnosis, then re-raise with the
            # original traceback (bare raise; the old `raise e` reset it).
            print(ET.tostring(rev))
            traceback.print_exc()
            page = None
            edits = 0
            raise
    return page, edits
|
|
|
|
|
|
|
|
|
|
def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
    """Fetch one api.php response for `params`, retrying with backoff.

    Requests config['api'] until the reply looks complete (</api>, or
    </mediawiki> when curonly) and carries no </error>.  Each retry waits
    incrementally longer and halves params['rvlimit'] in place to lighten
    server load.  After config['retries'] failures it logs the page and
    returns '' — callers must treat '' as "skip this page".
    """
    # just send the API request
    # if it fails, it will reduce params['rvlimit']
    xml = ''
    c = 0
    maxseconds = 100 # max seconds to wait in a single sleeping
    maxretries = config['retries'] # x retries and skip
    increment = 20 # increment every retry

    # Loop until the response contains the closing tag we expect and no
    # <error> element; every pass after the first counts as a retry.
    while not re.search(r'</api>' if not config['curonly'] else r'</mediawiki>', xml) or re.search(r'</error>', xml):
        if c > 0 and c < maxretries:
            # and/or idiom for: wait = min(increment * c, maxseconds)
            wait = increment * c < maxseconds and increment * \
                c or maxseconds # incremental until maxseconds
            print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['titles' if config['apiquery'] else 'pages'], wait)
            time.sleep(wait)
            # reducing server load requesting smallest chunks (if curonly then
            # rvlimit = 1 from mother function)
            if params['rvlimit'] > 1:
                params['rvlimit'] = params['rvlimit'] / 2 # half
        if c >= maxretries:
            # Retries exhausted: log and give up on this page only.
            print ' We have retried %d times' % (c)
            print ' MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiquery'] else 'pages'])
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save only the last,
            # params['curonly'] should mean that we've already tried this
            # fallback, because it's set by the following if and passed to
            # getXMLPageCore
            # TODO: save only the last version when failed
            print ' Saving in the errors log, and skipping...'
            logerror(
                config=config,
                text=u'Error while retrieving the last revision of "%s". Skipping.' %
                (params['titles' if config['apiquery'] else 'pages']).decode('utf-8'))
            #raise ExportAbortedError(config['index'])
            return '' # empty xml

        # FIXME HANDLE HTTP Errors HERE
        try:
            r = session.get(url=config['api'], params=params, headers=headers)
            handleStatusCode(r)
            xml = fixBOM(r)
            #print xml
        except requests.exceptions.ConnectionError as e:
            # NOTE(review): e[0] indexing relies on Python 2 exception args —
            # would break under Python 3; confirm before porting.
            print ' Connection error: %s'%(str(e[0]))
            # Blank xml forces another pass of the while-loop retry check.
            xml = ''
        c += 1

    return xml
|
|
|
|
|
def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
    """ Get the full history (or current only) of a page using API:Query

    Generator: yields export-style XML fragments for one <page>.
    If config['curonly'] is set, MediaWiki itself builds the XML via
    export&exportnowrap; otherwise revisions are fetched in chunks with
    prop=revisions and reassembled locally by reconstructRevisions().
    """

    title_ = title
    title_ = re.sub(' ', '_', title_)
    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
    # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
    # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
    #print 'current:%s' % (title_)
    if not config['curonly']:
        # Full-history mode: revisions pulled in chunks of rvlimit.
        params = {'titles': title_, 'action': 'query','format':'xml',
                 'prop':'revisions',
                 'rvprop' : 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
                 'rvcontinue' : None,
                 'rvlimit' : 10 # TODO: set this by commandline
                 }
    else:
        # Current-only mode: let MediaWiki produce the <page> XML directly.
        params = {'titles': title_, 'action': 'query','format':'xml','export':1,'exportnowrap':1}
    #print 'params:%s' % (params)
    if not config['curonly']:
        # firstpartok: whether the <page> header has been emitted yet.
        # lastcontinue: continuation token to rewind to after a bad chunk.
        firstpartok = False
        lastcontinue = None
        numberofedits = 0
        ret = ''
        while True:
            # in case the last request is not right, saving last time's progress
            if not firstpartok:
                try:
                    lastcontinue = params['rvcontinue']
                except:
                    lastcontinue = None

            xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
            if xml == "":
                #just return so that we can continue, and getXMLPageCoreWithApi will log the error
                return
            try:
                root = ET.fromstring(xml.encode('utf-8'))
            except:
                # Unparsable chunk: re-request the same continuation point.
                continue
            try:
                retpage = root.find('query').find('pages').find('page')
            except:
                continue
            if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
                print 'Page not found'
                raise PageMissingError(params['titles'], xml)
            if not firstpartok:
                try:
                    # build the firstpart by ourselves to improve the memory usage
                    ret = ' <page>\n'
                    ret += ' <title>%s</title>\n' %(retpage.attrib['title'])
                    ret += ' <ns>%s</ns>\n' % (retpage.attrib['ns'])
                    ret += ' <id>%s</id>\n' % (retpage.attrib['pageid'])
                except:
                    firstpartok = False
                    continue
                else:
                    firstpartok = True
                    yield ret
            try:
                ret = ''
                edits = 0
                if config['curonly'] or root.find('continue') == None:
                    # Last (or only) chunk: transform the revisions and stop.
                    rev_,edits = reconstructRevisions(root=root)
                    xmldom = MD.parseString('<stub1>'+ET.tostring(rev_)+'</stub1>')
                    # convert it into text in case it throws MemoryError
                    # delete the first three line and last two line,which is for setting the indent
                    ret += ''.join(xmldom.toprettyxml(indent=' ').splitlines(True)[3:-2])
                    yield ret
                    numberofedits += edits
                    break
                else:
                    # More chunks pending: yield this one, then follow rvcontinue.
                    rev_,edits = reconstructRevisions(root=root)
                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
                    ret += ''.join(xmldom.toprettyxml(indent=' ').splitlines(True)[3:-2])
                    params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
                    numberofedits += edits
                    yield ret
            except:
                # Chunk failed after parsing: rewind to the previous
                # continuation token and try that chunk again.
                traceback.print_exc()
                params['rvcontinue'] = lastcontinue
                ret = ''
        yield ' </page>\n'
    else:
        xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
        if xml == "":
            raise ExportAbortedError(config['index'])
        if not "</page>" in xml:
            raise PageMissingError(params['titles'], xml)
        else:
            # strip these sha1s sums which keep showing up in the export and
            # which are invalid for the XML schema (they only apply to
            # revisions)
            xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
            xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)

        yield xml.split("</page>")[0]

        # just for looking good :)
        r_timestamp = r'<timestamp>([^<]+)</timestamp>'

        # Count edits by counting <timestamp> occurrences in the export.
        numberofedits = 0
        numberofedits += len(re.findall(r_timestamp, xml))

        yield "</page>\n"

    if verbose:
        if (numberofedits == 1):
            print ' %s, 1 edit' % (title.strip())
        else:
            print ' %s, %d edits' % (title.strip(), numberofedits)
|
|
|
|
|
|
|
|
|
|
def getXMLPageCore(headers={}, params={}, config={}, session=None):
|
|
|
|
|
""" """
|
|
|
|
@ -587,7 +807,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
|
|
|
|
|
xml = ''
|
|
|
|
|
except requests.exceptions.ReadTimeout as e:
|
|
|
|
|
print ' Read timeout: %s'%(str(e[0]))
|
|
|
|
|
xml = ''
|
|
|
|
|
xml = ''
|
|
|
|
|
c += 1
|
|
|
|
|
|
|
|
|
|
return xml
|
|
|
|
@ -694,7 +914,14 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
|
|
|
|
|
print ' %s, 1 edit' % (title.strip())
|
|
|
|
|
else:
|
|
|
|
|
print ' %s, %d edits' % (title.strip(), numberofedits)
|
|
|
|
|
|
|
|
|
|
def selectXMLQueryMode(config={}, title='', verbose=True, session=None):
    """Dispatch page-XML retrieval to the configured backend.

    Returns the generator from getXMLPageWithApi when config['apiquery']
    is set (api.php ?action=query export), otherwise the generator from
    the traditional Special:Export path, getXMLPage.  Arguments are
    forwarded unchanged.
    """
    if config['apiquery']:
        # Using api.php?Query instead of relying on Special:Export
        return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
    # Using the traditional method (default).
    # (The unreachable trailing `return ''` after this if/else was removed.)
    return getXMLPage(config=config, title=title, verbose=verbose, session=session)
|
|
|
|
|
|
|
|
|
|
def makeXmlPageFromRaw(xml):
|
|
|
|
|
""" Discard the metadata around a <page> element in <mediawiki> string"""
|
|
|
|
@ -775,7 +1002,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
|
|
|
|
|
if c % 10 == 0:
|
|
|
|
|
print 'Downloaded %d pages' % (c)
|
|
|
|
|
try:
|
|
|
|
|
for xml in getXMLPage(config=config, title=title, session=session):
|
|
|
|
|
for xml in selectXMLQueryMode(config=config, title=title, session=session):
|
|
|
|
|
xml = cleanXML(xml=xml)
|
|
|
|
|
xmlfile.write(xml.encode('utf-8'))
|
|
|
|
|
except PageMissingError:
|
|
|
|
@ -902,7 +1129,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
|
|
|
|
|
# repeated header is confusing and would not even be valid
|
|
|
|
|
xml = exportrequest['query']['export']['*']
|
|
|
|
|
yield makeXmlPageFromRaw(xml)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if 'continue' in arvrequest:
|
|
|
|
|
# Get the new ones
|
|
|
|
|
arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
|
|
|
|
@ -924,7 +1151,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
|
|
|
|
|
else:
|
|
|
|
|
# End of continuation. We are done with this namespace.
|
|
|
|
|
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
except (KeyError, mwclient.errors.InvalidResponse) as e:
|
|
|
|
|
print(e)
|
|
|
|
|
# TODO: check whether the KeyError was really for a missing arv API
|
|
|
|
@ -1156,7 +1383,7 @@ def reverse_readline(filename, buf_size=8192, truncate=False):
|
|
|
|
|
if segment is not None:
|
|
|
|
|
# if the previous chunk starts right from the beginning of line
|
|
|
|
|
# do not concat the segment to the last line of new chunk
|
|
|
|
|
# instead, yield the segment first
|
|
|
|
|
# instead, yield the segment first
|
|
|
|
|
if buffer[-1] is not '\n':
|
|
|
|
|
lines[-1] += segment
|
|
|
|
|
else:
|
|
|
|
@ -1684,6 +1911,7 @@ def getParameters(params=[]):
|
|
|
|
|
action='store_true',
|
|
|
|
|
help='resumes previous incomplete dump (requires --path)')
|
|
|
|
|
parser.add_argument('--force', action='store_true', help='')
|
|
|
|
|
parser.add_argument('--ignore-api-check', action='store_true', help='')
|
|
|
|
|
parser.add_argument(
|
|
|
|
|
'--user', help='Username if authentication is required.')
|
|
|
|
|
parser.add_argument(
|
|
|
|
@ -1715,6 +1943,10 @@ def getParameters(params=[]):
|
|
|
|
|
help="generates a full history XML dump (--xml --curonly for current revisions only)")
|
|
|
|
|
groupDownload.add_argument('--curonly', action='store_true',
|
|
|
|
|
help='store only the current version of pages')
|
|
|
|
|
groupDownload.add_argument(
|
|
|
|
|
'--apiquery',
|
|
|
|
|
action='store_true',
|
|
|
|
|
help="EXPERIMENTAL: Using api.php?query instead of Special:Export to export pages, works with: --curonly,--xmlrevisions,--images")
|
|
|
|
|
groupDownload.add_argument('--xmlrevisions', action='store_true',
|
|
|
|
|
help='download all revisions from an API generator. MediaWiki 1.27+ only.')
|
|
|
|
|
groupDownload.add_argument(
|
|
|
|
@ -1764,6 +1996,10 @@ def getParameters(params=[]):
|
|
|
|
|
print getWikiEngine(url=args.wiki)
|
|
|
|
|
sys.exit()
|
|
|
|
|
|
|
|
|
|
if (args.apiquery and not args.curonly) and (args.apiquery and not args.xmlrevisions) and (args.apiquery and not args.images):
|
|
|
|
|
print('ERROR: --apiquery requires either --curonly or --images or --xmlrevisions')
|
|
|
|
|
sys.exit()
|
|
|
|
|
|
|
|
|
|
# Create session
|
|
|
|
|
cj = cookielib.MozillaCookieJar()
|
|
|
|
|
if args.cookies:
|
|
|
|
@ -1828,6 +2064,8 @@ def getParameters(params=[]):
|
|
|
|
|
index2 = check[1]
|
|
|
|
|
api = checkedapi
|
|
|
|
|
print 'API is OK: ' + checkedapi
|
|
|
|
|
elif args.ignore_api_check:
|
|
|
|
|
print 'Error in API. Ignoring.'
|
|
|
|
|
else:
|
|
|
|
|
if index and not args.wiki:
|
|
|
|
|
print 'API not available. Trying with index.php only.'
|
|
|
|
@ -1925,6 +2163,7 @@ def getParameters(params=[]):
|
|
|
|
|
'cookies': args.cookies or '',
|
|
|
|
|
'delay': args.delay,
|
|
|
|
|
'retries': int(args.retries),
|
|
|
|
|
'apiquery': args.apiquery,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
other = {
|
|
|
|
@ -2385,7 +2624,7 @@ def getWikiEngine(url=''):
|
|
|
|
|
session.headers.update({'User-Agent': getUserAgent()})
|
|
|
|
|
r = session.post(url=url, timeout=30)
|
|
|
|
|
if r.status_code == 405 or r.text == '':
|
|
|
|
|
r = session.get(url=url, timeout=120)
|
|
|
|
|
r = session.get(url=url, timeout=60)
|
|
|
|
|
result = r.text
|
|
|
|
|
|
|
|
|
|
wikiengine = 'Unknown'
|
|
|
|
@ -2468,7 +2707,7 @@ def mwGetAPIAndIndex(url=''):
|
|
|
|
|
index = ''
|
|
|
|
|
session = requests.Session()
|
|
|
|
|
session.headers.update({'User-Agent': getUserAgent()})
|
|
|
|
|
r = session.post(url=url, timeout=120)
|
|
|
|
|
r = session.post(url=url, timeout=60)
|
|
|
|
|
result = r.text
|
|
|
|
|
|
|
|
|
|
# API
|
|
|
|
|