First attempt of API-only export

pull/287/merge
Fedora 6 years ago
parent 142b48cc69
commit a8cbb357ff

@@ -39,11 +39,16 @@ except ImportError: # Python 2.4 compatibility
from md5 import new as md5
import os
import re
import subprocess
try:
import requests
except ImportError:
print "Please install or update the Requests module."
sys.exit(1)
try:
import wikitools
except ImportError:
print "Please install the wikitools 1.3+ module if you want to use --xmlrevisions."
import time
import urllib
UTF8Writer = getwriter('utf8')
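The guarded imports above make requests a hard requirement while wikitools is only needed for --xmlrevisions. A sketch (not part of this commit) of checking the advertised 1.3+ requirement up front, assuming setuptools metadata is available for the package:

    # Sketch only: fail fast on the wikitools 1.3+ requirement instead of
    # waiting for the runtime AttributeError handled in generateXMLDump.
    try:
        import pkg_resources
        installed = pkg_resources.get_distribution('wikitools').version
        if pkg_resources.parse_version(installed) < pkg_resources.parse_version('1.3'):
            print "Please upgrade wikitools to 1.3+ to use --xmlrevisions."
    except Exception:
        pass  # no package metadata available; rely on the runtime check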
@@ -514,7 +519,7 @@ def getXMLPageCore(headers={}, params={}, config={}, session=None):
if c > 0 and c < maxretries:
wait = increment * c < maxseconds and increment * \
c or maxseconds # incremental until maxseconds
print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...'%(c, params['pages'], wait)
print ' In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' %(c, params['pages'], wait)
time.sleep(wait)
# reducing server load requesting smallest chunks (if curonly then
# limit = 1 from mother function)
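The wait computation above is the pre-Python-2.5 and/or ternary idiom: the delay grows linearly with the retry count c and is capped at maxseconds. The same expression written with min(), using the names from this hunk:

    # Equivalent capped linear backoff: increment * c, never above maxseconds.
    wait = min(increment * c, maxseconds)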
@@ -677,10 +682,9 @@ def cleanXML(xml=''):
def generateXMLDump(config={}, titles=[], start=None, session=None):
""" Generates a XML dump for a list of titles """
""" Generates a XML dump for a list of titles or from revision IDs """
# TODO: titles is now unused.
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
header, config = getXMLHeader(config=config, session=session)
footer = '</mediawiki>\n' # new line at the end
xmlfilename = '%s-%s-%s.xml' % (domain2prefix(config=config),
@@ -688,48 +692,100 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
config['curonly'] and 'current' or 'history')
xmlfile = ''
lock = True
if start:
print "Removing the last chunk of past XML dump: it is probably incomplete."
for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
pass
else:
# requested complete xml dump
lock = False
if config['xmlrevisions']:
print 'Retrieving the XML for every page from the beginning'
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
xmlfile.close()
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start (inclusive)
lock = False
if lock:
continue
delay(config=config, session=session)
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
for xml in getXMLRevisions(config=config, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
(title.decode('utf-8'))
)
# here, xml is either a correct <page>...</page> chunk or an empty string,
# when the page was deleted or an error occurred while retrieving it from
# the server (both cases are logged in the errors log)
c += 1
except AttributeError:
print "This wikitools module version is not working"
sys.exit()
else:
print 'Retrieving the XML for every page from "%s"' % (start and start or 'start')
if start:
print "Removing the last chunk of past XML dump: it is probably incomplete."
for i in reverse_readline('%s/%s' % (config['path'], xmlfilename), truncate=True):
pass
else:
# requested complete xml dump
lock = False
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'w')
xmlfile.write(header.encode('utf-8'))
xmlfile.close()
xmlfile = open('%s/%s' % (config['path'], xmlfilename), 'a')
c = 1
for title in readTitles(config, start):
if not title.strip():
continue
if title == start: # start downloading from start (inclusive)
lock = False
if lock:
continue
delay(config=config, session=session)
if c % 10 == 0:
print 'Downloaded %d pages' % (c)
try:
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml.encode('utf-8'))
except PageMissingError:
logerror(
config=config,
text=u'The page "%s" was missing in the wiki (probably deleted)' %
(title.decode('utf-8'))
)
# here, xml is either a correct <page>...</page> chunk or an empty string,
# when the page was deleted or an error occurred while retrieving it from
# the server (both cases are logged in the errors log)
c += 1
xmlfile.write(footer)
xmlfile.close()
print 'XML dump saved at...', xmlfilename
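    # Aside, not part of this commit: both branches above resume with the same
    # "lock" pattern. A hypothetical standalone helper showing that logic:
    def skip_until(titles, start):
        # Skip every title until `start` is seen, then yield from there on
        # (inclusive), mirroring the lock flag in generateXMLDump.
        lock = start is not None
        for title in titles:
            if title == start:
                lock = False
            if not lock:
                yield title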
def getXMLRevisions(config={}, session=None):
site = wikitools.wiki.Wiki(config['api'])
if config['namespaces']:
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
else:
namespaces = ['*']
for namespace in namespaces:
params = {
'action': 'query',
'generator': 'allrevisions',
'garvnamespace': namespace,
'garvlimit': 50,
'garvprop': 'ids',
'export': 1 # Just to make sure the parameter is passed. Empty is fine too.
}
request = wikitools.api.APIRequest(site, params)
results = request.queryGen()
try:
for result in results:
yield result['query']['export']['*']
except wikitools.api.APIError:
# Fall back to the allpages generator: the wiki is too old for allrevisions
params = {
'action': 'query',
'generator': 'allpages',
'gaplimit': 50,
'export': 1 # Just to make sure the parameter is passed. Empty is fine too.
}
# allpages does not accept "*"
if namespace != '*':
params['gapnamespace'] = namespace
request = wikitools.api.APIRequest(site, params)
results = request.queryGen()
for result in results:
yield result['query']['export']['*']
def readTitles(config={}, start=None):
""" Read title list from a file, from the title "start" """
@@ -1303,7 +1359,9 @@ def getParameters(params=[]):
action='store_true',
help="generates a full history XML dump (--xml --curonly for current revisions only)")
groupDownload.add_argument('--curonly', action='store_true',
help='store only the current version of pages')
help='store only the current version of pages; incompatible with --xmlrevisions')
groupDownload.add_argument('--xmlrevisions', action='store_true',
help='download all revisions from an API generator')
groupDownload.add_argument(
'--images', action='store_true', help="generates an image dump")
groupDownload.add_argument(
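Since the --curonly help now declares it incompatible with --xmlrevisions, argparse could enforce the conflict instead of just documenting it. A sketch, where parser stands for the script's ArgumentParser instance:

    # Sketch: let argparse reject --curonly combined with --xmlrevisions.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--curonly', action='store_true',
                       help='store only the current version of pages')
    group.add_argument('--xmlrevisions', action='store_true',
                       help='download all revisions from an API generator')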
@@ -1502,6 +1560,7 @@ def getParameters(params=[]):
'images': args.images,
'logs': False,
'xml': args.xml,
'xmlrevisions': args.xmlrevisions,
'namespaces': namespaces,
'exnamespaces': exnamespaces,
'path': args.path and os.path.normpath(args.path) or '',
@@ -1547,7 +1606,7 @@ def checkAPI(api=None, session=None):
try:
result = getJSON(r)
index = None
if result['query']:
if result:
try:
index = result['query']['general']['server'] + \
result['query']['general']['script']
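The switch from result['query'] to result avoids a KeyError when the reply carries no 'query' key; the inner try already copes with missing fields. The same guard written with dict.get(), as a sketch:

    # Sketch: defensive lookup instead of try/except around the indexing.
    general = result.get('query', {}).get('general', {})
    if 'server' in general and 'script' in general:
        index = general['server'] + general['script']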
@@ -1886,7 +1945,7 @@ def saveSiteInfo(config={}, session=None):
'action': 'query',
'meta': 'siteinfo',
'siprop': 'general|namespaces',
'format': 'json'}
'format': 'json'},
timeout=10)
result = getJSON(r)
delay(config=config, session=session)
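The added comma makes timeout=10 a second argument of the enclosing HTTP call instead of a syntax error. A hedged reconstruction of the call site (the wrapper around the dict is cut off by the hunk, so session.get and its url= keyword are assumptions):

    # Hedged reconstruction; only the dict and timeout lines appear in the hunk.
    r = session.get(url=config['api'], params={
        'action': 'query',
        'meta': 'siteinfo',
        'siprop': 'general|namespaces',
        'format': 'json'},
        timeout=10)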
