Download image description from API when using --xmlrevisions

Fixes https://github.com/WikiTeam/wikiteam/issues/308

Also add --failfast option to sneak in all the hacks I use to run
the bulk downloads, so I can more easily sync the repos.
pull/319/head
Federico Leva 6 years ago
parent be5ca12075
commit 33bb1c1f23

@ -442,34 +442,38 @@ def getXMLHeader(config={}, session=None):
# similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/" # similar to: <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.3/"
# xmlns:x.... # xmlns:x....
randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ randomtitle = 'Main_Page' # previously AMF5LKE43MNFGHKSDMRTJ
try: if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)]) r = session.get(config['api'] + '?action=query&revids=1&export&exportnowrap')
except PageMissingError as pme: xml = r.text
# The <page> does not exist. Not a problem, if we get the <siteinfo>. else:
xml = pme.xml
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
except ExportAbortedError:
try: try:
if config['api']: xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'},
timeout=120
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme: except PageMissingError as pme:
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml xml = pme.xml
# Issue 26: Account for missing "Special" namespace.
# Hope the canonical special name has not been removed.
# http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
except ExportAbortedError: except ExportAbortedError:
pass try:
if config['api']:
print "Trying the local name for the Special namespace instead"
r = session.post(
url=config['api'],
data={
'action': 'query',
'meta': 'siteinfo',
'siprop': 'namespaces',
'format': 'json'},
timeout=120
)
config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
+ ':Export'
xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass
header = xml.split('</mediawiki>')[0] header = xml.split('</mediawiki>')[0]
if not re.match(r"\s*<mediawiki", xml): if not re.match(r"\s*<mediawiki", xml):
@ -528,6 +532,9 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c >= maxretries: if c >= maxretries:
print ' We have retried %d times' % (c) print ' We have retried %d times' % (c)
print ' MediaWiki error for "%s", network error or whatever...' % (params['pages']) print ' MediaWiki error for "%s", network error or whatever...' % (params['pages'])
if config['failfast']:
print "Exit, it will be for another time"
sys.exit()
# If it's not already what we tried: our last chance, preserve only the last revision... # If it's not already what we tried: our last chance, preserve only the last revision...
# config['curonly'] means that the whole dump is configured to save only the last, # config['curonly'] means that the whole dump is configured to save only the last,
# params['curonly'] should mean that we've already tried this # params['curonly'] should mean that we've already tried this
@ -766,7 +773,6 @@ def getXMLRevisions(config={}, session=None):
params = { params = {
'action': 'query', 'action': 'query',
'list': 'allrevisions', 'list': 'allrevisions',
'arvnamespace': '*',
'arvlimit': 50, 'arvlimit': 50,
'arvprop': 'ids', 'arvprop': 'ids',
} }
@ -779,7 +785,7 @@ def getXMLRevisions(config={}, session=None):
for revision in page['revisions']: for revision in page['revisions']:
revids.append(str(revision['revid'])) revids.append(str(revision['revid']))
print "50 more revisions listed, until %d" % revids[-1] print "50 more revisions listed, until %s" % revids[-1]
exportparams = { exportparams = {
'action': 'query', 'action': 'query',
'revids': '|'.join(revids), 'revids': '|'.join(revids),
@ -1178,10 +1184,14 @@ def generateImageDump(config={}, other={}, images=[], start='', session=None):
# saving description if any # saving description if any
try: try:
title = u'Image:%s' % (filename) title = u'Image:%s' % (filename)
xmlfiledesc = getXMLFileDesc( if config['xmlrevisions'] and config['api'] and config['api'].endswith("api.php"):
config=config, r = session.get(config['api'] + u"?action=query&export&exportnowrap&titles=%s" % title)
title=title, xml = r.text
session=session) # use Image: for backwards compatibility else:
xmlfiledesc = getXMLFileDesc(
config=config,
title=title,
session=session) # use Image: for backwards compatibility
except PageMissingError: except PageMissingError:
xmlfiledesc = '' xmlfiledesc = ''
logerror( logerror(
@ -1389,6 +1399,10 @@ def getParameters(params=[]):
'--get-wiki-engine', '--get-wiki-engine',
action='store_true', action='store_true',
help="returns the wiki engine") help="returns the wiki engine")
groupMeta.add_argument(
'--failfast',
action='store_true',
help="Avoid resuming, discard failing wikis quickly. Useful only for mass downloads.")
args = parser.parse_args() args = parser.parse_args()
# print args # print args
@ -1564,6 +1578,7 @@ def getParameters(params=[]):
'curonly': args.curonly, 'curonly': args.curonly,
'date': datetime.datetime.now().strftime('%Y%m%d'), 'date': datetime.datetime.now().strftime('%Y%m%d'),
'api': api, 'api': api,
'failfast': args.failfast,
'index': index, 'index': index,
'images': args.images, 'images': args.images,
'logs': False, 'logs': False,
@ -2127,7 +2142,10 @@ def main(params=[]):
# do not enter if resume is requested from begining # do not enter if resume is requested from begining
while not other['resume'] and os.path.isdir(config['path']): while not other['resume'] and os.path.isdir(config['path']):
print '\nWarning!: "%s" path exists' % (config['path']) print '\nWarning!: "%s" path exists' % (config['path'])
reply = '' if config['failfast']:
retry = 'yes'
else:
reply = ''
while reply.lower() not in ['yes', 'y', 'no', 'n']: while reply.lower() not in ['yes', 'y', 'no', 'n']:
reply = raw_input( reply = raw_input(
'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' % 'There is a dump in "%s", probably incomplete.\nIf you choose resume, to avoid conflicts, the parameters you have chosen in the current session will be ignored\nand the parameters available in "%s/%s" will be loaded.\nDo you want to resume ([yes, y], [no, n])? ' %

Loading…
Cancel
Save