pull/429/merge
Gernot Zacharias committed via GitHub
commit e80f4a41b3

@@ -23,10 +23,21 @@ try:
    from kitchen.text.converters import getwriter, to_unicode
except ImportError:
    print "Please install the kitchen module."
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import xml.dom.minidom as MD
import cookielib
import cPickle
import datetime
import sys
import io
import traceback
try:
    import argparse
except ImportError:
@@ -63,7 +74,7 @@ except ImportError:
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)
__VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org
__VERSION__ = '0.4.1-alpha' # major, minor, micro: semver.org
class PageMissingError(Exception):
def __init__(self, title, xml):
@@ -164,7 +175,7 @@ def getNamespacesScraper(config={}, session=None):
namespacenames = {0: ''} # main is 0, no prefix
if namespaces:
r = session.post(
url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
url=config['index'], params={'title': 'Special:Allpages'}, timeout=60)
raw = r.text
delay(config=config, session=session)
@@ -206,7 +217,7 @@ def getNamespacesAPI(config={}, session=None):
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'},
timeout=30
timeout=60
)
result = getJSON(r)
delay(config=config, session=session)
@@ -281,7 +292,7 @@ def getPageTitlesScraper(config={}, session=None):
print ' Retrieving titles in the namespace', namespace
url = '%s?title=Special:Allpages&namespace=%s' % (
config['index'], namespace)
r = session.get(url=url, timeout=30)
r = session.get(url=url, timeout=60)
raw = r.text
raw = cleanHTML(raw)
@@ -455,7 +466,7 @@ def getXMLHeader(config={}, session=None):
else:
try:
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
xml = "".join([x for x in selectXMLQueryMode(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
@@ -473,11 +484,11 @@
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'},
timeout=120
timeout=60
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
xml = "".join([x for x in selectXMLQueryMode(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
@@ -500,7 +511,7 @@ def getXMLHeader(config={}, session=None):
def getXMLFileDesc(config={}, title='', session=None):
""" Get XML for image description page """
config['curonly'] = 1 # tricky to get only the most recent desc
return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
return("".join([x for x in selectXMLQueryMode( config=config, title=title, verbose=False, session=session)]))
def getUserAgent():
@@ -521,7 +532,216 @@ def logerror(config={}, text=''):
output = u'%s: %s\n' % (
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
outfile.write(output.encode('utf-8'))
def reconstructRevisions(root=None):
    #print ET.tostring(rev)
    page = ET.Element('stub')
    edits = 0
    for rev in root.find('query').find('pages').find('page').find('revisions').findall('rev'):
        try:
            rev_ = ET.SubElement(page, 'revision')
            ET.SubElement(rev_, 'id').text = rev.attrib['revid']
            ET.SubElement(rev_, 'timestamp').text = rev.attrib['timestamp']
            contributor = ET.SubElement(rev_, 'contributor')
            if not rev.attrib.has_key('userhidden'):
                ET.SubElement(contributor, 'username').text = rev.attrib['user']
                ET.SubElement(contributor, 'id').text = rev.attrib['userid']
            else:
                contributor.set('deleted', 'deleted')
            comment = ET.SubElement(rev_, 'comment')
            if not rev.attrib.has_key('commenthidden'):
                comment.text = rev.attrib['comment']
            else:
                comment.set('deleted', 'deleted')
            # some revisions do not return model and format, so just hard-code them
            ET.SubElement(rev_, 'model').text = 'wikitext'
            ET.SubElement(rev_, 'format').text = 'text/x-wiki'
            text = ET.SubElement(rev_, 'text')
            if not rev.attrib.has_key('texthidden'):
                text.attrib['xml:space'] = "preserve"
                text.attrib['bytes'] = rev.attrib['size']
                text.text = rev.text
            else:
                text.set('deleted', 'deleted')
            # delete sha1 here :)
            #sha1 = ET.SubElement(rev_, 'sha1')
            #if not rev.attrib.has_key('sha1missing'):
            #    sha1.text = rev.attrib['sha1']
            if rev.attrib.has_key('minor'):
                ET.SubElement(rev_, 'minor')
            edits += 1
        except Exception as e:
            #logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev)))
            print ET.tostring(rev)
            traceback.print_exc()
            page = None
            edits = 0
            raise e
    return page, edits
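For reference, here is a minimal, self-contained sketch of the mapping reconstructRevisions performs: one invented prop=revisions <rev> element is fed through the same attribute-to-element translation. The page title, user and values are made up; the attribute names are the ones documented for the MediaWiki prop=revisions API.

import xml.etree.ElementTree as ET

sample = ET.fromstring(
    '<api><query><pages><page pageid="1" ns="0" title="Example">'
    '<revisions><rev revid="42" timestamp="2020-01-01T00:00:00Z" user="Alice" '
    'userid="7" comment="demo edit" size="11">Hello world</rev>'
    '</revisions></page></pages></query></api>')
rev = sample.find('query').find('pages').find('page').find('revisions').find('rev')

revision = ET.Element('revision')
ET.SubElement(revision, 'id').text = rev.attrib['revid']
ET.SubElement(revision, 'timestamp').text = rev.attrib['timestamp']
contributor = ET.SubElement(revision, 'contributor')
ET.SubElement(contributor, 'username').text = rev.attrib['user']
ET.SubElement(contributor, 'id').text = rev.attrib['userid']
text = ET.SubElement(revision, 'text')
text.set('bytes', rev.attrib['size'])
text.text = rev.text
print(ET.tostring(revision))  # export-style <revision>...</revision>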
def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
    """ Fetch one api.php response, retrying with a smaller rvlimit on failure """
    # just send the API request
    # if it fails, it will reduce params['rvlimit']
    xml = ''
    c = 0
    maxseconds = 100  # max seconds to wait in a single sleeping
    maxretries = config['retries']  # x retries and skip
    increment = 20  # increment every retry
    while not re.search(r'</api>' if not config['curonly'] else r'</mediawiki>', xml) or re.search(r'</error>', xml):
        if c > 0 and c < maxretries:
            wait = increment * c < maxseconds and increment * \
                c or maxseconds  # incremental until maxseconds
            print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' % (c, params['titles' if config['apiquery'] else 'pages'], wait)
            time.sleep(wait)
            # reducing server load requesting smallest chunks (if curonly then
            # rvlimit = 1 from mother function)
            if 'rvlimit' in params and params['rvlimit'] > 1:
                params['rvlimit'] = params['rvlimit'] / 2  # half
        if c >= maxretries:
            print ' We have retried %d times' % (c)
            print ' MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiquery'] else 'pages'])
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save only the last,
            # params['curonly'] should mean that we've already tried this
            # fallback, because it's set by the following if and passed to
            # getXMLPageCore
            # TODO: save only the last version when failed
            print ' Saving in the errors log, and skipping...'
            logerror(
                config=config,
                text=u'Error while retrieving the last revision of "%s". Skipping.' %
                (params['titles' if config['apiquery'] else 'pages']).decode('utf-8'))
            #raise ExportAbortedError(config['index'])
            return ''  # empty xml
        # FIXME HANDLE HTTP Errors HERE
        try:
            r = session.get(url=config['api'], params=params, headers=headers)
            handleStatusCode(r)
            xml = fixBOM(r)
            #print xml
        except requests.exceptions.ConnectionError as e:
            print ' Connection error: %s' % (str(e[0]))
            xml = ''
        c += 1
    return xml
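As a standalone illustration of the request this helper sends in full-history mode, the same query can be reproduced with requests alone; the endpoint URL below is only a placeholder, not something taken from this script.

import requests

params = {
    'action': 'query', 'format': 'xml', 'prop': 'revisions',
    'titles': 'Main Page', 'rvlimit': 2,
    'rvprop': 'timestamp|user|comment|content|ids|userid|size',
}
r = requests.get('https://en.wikipedia.org/w/api.php', params=params, timeout=60)
print(r.status_code)  # 200 on success; the body then ends with </api>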
def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
    """ Get the full history (or current only) of a page using API:Query;
        if config['curonly'] is set, use export&exportnowrap to export
    """
    title_ = title
    title_ = re.sub(' ', '_', title_)
    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
    # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
    # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
    #print 'current:%s' % (title_)
    if not config['curonly']:
        params = {'titles': title_, 'action': 'query', 'format': 'xml',
                  'prop': 'revisions',
                  'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
                  'rvcontinue': None,
                  'rvlimit': 10  # TODO: set this by commandline
                  }
    else:
        params = {'titles': title_, 'action': 'query', 'format': 'xml', 'export': 1, 'exportnowrap': 1}
    #print 'params:%s' % (params)
    if not config['curonly']:
        firstpartok = False
        lastcontinue = None
        numberofedits = 0
        ret = ''
        while True:
            # save the previous continuation token, so we can resume from it if the next request fails
            if not firstpartok:
                try:
                    lastcontinue = params['rvcontinue']
                except:
                    lastcontinue = None
            xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
            if xml == "":
                # just return so that we can continue; getXMLPageCoreWithApi has already logged the error
                return
            try:
                root = ET.fromstring(xml.encode('utf-8'))
            except:
                continue
            try:
                retpage = root.find('query').find('pages').find('page')
            except:
                continue
            if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
                print 'Page not found'
                raise PageMissingError(params['titles'], xml)
            if not firstpartok:
                try:
                    # build the <page> header ourselves to reduce memory usage
                    ret = '  <page>\n'
                    ret += '    <title>%s</title>\n' % (retpage.attrib['title'])
                    ret += '    <ns>%s</ns>\n' % (retpage.attrib['ns'])
                    ret += '    <id>%s</id>\n' % (retpage.attrib['pageid'])
                except:
                    firstpartok = False
                    continue
                else:
                    firstpartok = True
                    yield ret
            try:
                ret = ''
                edits = 0
                if config['curonly'] or root.find('continue') is None:
                    # transform the revisions
                    rev_, edits = reconstructRevisions(root=root)
                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
                    # convert it into text in case it throws MemoryError
                    # drop the first three lines and the last two, which only exist to set the indent
                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
                    yield ret
                    numberofedits += edits
                    break
                else:
                    rev_, edits = reconstructRevisions(root=root)
                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
                    params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
                    numberofedits += edits
                    yield ret
            except:
                traceback.print_exc()
                params['rvcontinue'] = lastcontinue
                ret = ''
        yield '  </page>\n'
    else:
        xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
        if xml == "":
            raise ExportAbortedError(config['index'])
        if not "</page>" in xml:
            raise PageMissingError(params['titles'], xml)
        else:
            # strip these sha1s sums which keep showing up in the export and
            # which are invalid for the XML schema (they only apply to
            # revisions)
            xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
            xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)
        yield xml.split("</page>")[0]
        # just for looking good :)
        r_timestamp = r'<timestamp>([^<]+)</timestamp>'
        numberofedits = 0
        numberofedits += len(re.findall(r_timestamp, xml))
        yield "</page>\n"
    if verbose:
        if numberofedits == 1:
            print ' %s, 1 edit' % (title.strip())
        else:
            print ' %s, %d edits' % (title.strip(), numberofedits)
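The slicing of the toprettyxml() output above is the least obvious step, so here is an isolated demonstration of what it keeps; all names are local to this example.

import xml.dom.minidom as MD
import xml.etree.ElementTree as ET

stub = ET.Element('stub')  # same shape as the element reconstructRevisions returns
ET.SubElement(ET.SubElement(stub, 'revision'), 'id').text = '42'
pretty = MD.parseString(b'<stub1>' + ET.tostring(stub) + b'</stub1>').toprettyxml(indent='  ')
# Dropping the first three lines (XML declaration, <stub1>, <stub>) and the last
# two (</stub>, </stub1>) leaves only the indented <revision> block that is yielded.
print(''.join(pretty.splitlines(True)[3:-2]))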
def getXMLPageCore(headers={}, params={}, config={}, session=None):
""" """
@@ -587,7 +807,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
xml = ''
except requests.exceptions.ReadTimeout as e:
print ' Read timeout: %s'%(str(e[0]))
xml = ''
xml = ''
c += 1
return xml
@@ -694,7 +914,14 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
            print ' %s, 1 edit' % (title.strip())
        else:
            print ' %s, %d edits' % (title.strip(), numberofedits)
def selectXMLQueryMode(config={}, title='', verbose=True, session=None):
    if config['apiquery']:
        # use api.php?action=query instead of relying on Special:Export
        return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
    else:
        # use the traditional Special:Export method (default)
        return getXMLPage(config=config, title=title, verbose=verbose, session=session)
    return ''
def makeXmlPageFromRaw(xml):
""" Discard the metadata around a <page> element in <mediawiki> string"""
@@ -775,7 +1002,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
for xml in selectXMLQueryMode(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
@@ -902,7 +1129,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
# repeated header is confusing and would not even be valid
xml = exportrequest['query']['export']['*']
yield makeXmlPageFromRaw(xml)
if 'continue' in arvrequest:
# Get the new ones
arvparams['arvcontinue'] = arvrequest['continue']['arvcontinue']
@@ -924,7 +1151,7 @@ def getXMLRevisions(config={}, session=None, allpages=False, start=None):
else:
# End of continuation. We are done with this namespace.
break
except (KeyError, mwclient.errors.InvalidResponse) as e:
print(e)
# TODO: check whether the KeyError was really for a missing arv API
@@ -1156,7 +1383,7 @@ def reverse_readline(filename, buf_size=8192, truncate=False):
if segment is not None:
# if the previous chunk starts right from the beginning of line
# do not concat the segment to the last line of new chunk
# instead, yield the segment first
# instead, yield the segment first
if buffer[-1] != '\n':
lines[-1] += segment
else:
@@ -1684,6 +1911,7 @@ def getParameters(params=[]):
action='store_true',
help='resumes previous incomplete dump (requires --path)')
parser.add_argument('--force', action='store_true', help='')
parser.add_argument('--ignore-api-check', action='store_true', help='do not abort if the API check fails')
parser.add_argument(
'--user', help='Username if authentication is required.')
parser.add_argument(
@@ -1715,6 +1943,10 @@ def getParameters(params=[]):
help="generates a full history XML dump (--xml --curonly for current revisions only)")
groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages')
groupDownload.add_argument(
'--apiquery',
action='store_true',
help="EXPERIMENTAL: Using api.php?query instead of Special:Export to export pages, works with: --curonly,--xmlrevisions,--images")
groupDownload.add_argument('--xmlrevisions', action='store_true',
help='download all revisions from an API generator. MediaWiki 1.27+ only.')
groupDownload.add_argument(
@@ -1764,6 +1996,10 @@ def getParameters(params=[]):
print getWikiEngine(url=args.wiki)
sys.exit()
if args.apiquery and not (args.curonly or args.xmlrevisions or args.images):
print('ERROR: --apiquery requires either --curonly or --images or --xmlrevisions')
sys.exit()
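With this check in place, a typical experimental invocation combines the new switch with one of the allowed modes, for example: python dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --curonly --apiquery (the wiki URL here is a placeholder).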
# Create session
cj = cookielib.MozillaCookieJar()
if args.cookies:
@@ -1828,6 +2064,8 @@ def getParameters(params=[]):
index2 = check[1]
api = checkedapi
print 'API is OK: ' + checkedapi
elif args.ignore_api_check:
print 'Error in API. Ignoring.'
else:
if index and not args.wiki:
print 'API not available. Trying with index.php only.'
@@ -1925,6 +2163,7 @@ def getParameters(params=[]):
'cookies': args.cookies or '',
'delay': args.delay,
'retries': int(args.retries),
'apiquery': args.apiquery,
}
other = {
@@ -2385,7 +2624,7 @@ def getWikiEngine(url=''):
session.headers.update({'User-Agent': getUserAgent()})
r = session.post(url=url, timeout=30)
if r.status_code == 405 or r.text == '':
r = session.get(url=url, timeout=120)
r = session.get(url=url, timeout=60)
result = r.text
wikiengine = 'Unknown'
@@ -2468,7 +2707,7 @@ def mwGetAPIAndIndex(url=''):
index = ''
session = requests.Session()
session.headers.update({'User-Agent': getUserAgent()})
r = session.post(url=url, timeout=120)
r = session.post(url=url, timeout=60)
result = r.text
# API
