mirror of
https://github.com/WikiTeam/wikiteam
synced 2024-11-10 13:10:27 +00:00
Get namespaces and images info from the API. The code is badly copied around; we should probably use a module such as http://packages.python.org/simplemediawiki/ to talk to the API. Tested; seems to work. Also fixed some typos.
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@668 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
parent b8b2dbcf4a
commit a8479f9936
dumpgenerator.py (103 changed lines)
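The heart of the new API path is a single api.php query for site metadata. Below is a minimal standalone sketch of that request, using the wikiindex.org endpoint mentioned in the old comment in the diff as an example; the User-Agent string is a stand-in, not the script's getUserAgent():

import urllib
import urllib2

# Query siteinfo/namespaces, the same request getNamespacesAPI builds below.
api = 'http://wikiindex.org/api.php' # example endpoint from the diff's old comment
data = urllib.urlencode({'action': 'query', 'meta': 'siteinfo',
                         'siprop': 'namespaces', 'format': 'xml'})
req = urllib2.Request(url=api, data=data, headers={'User-Agent': 'wikiteam-sketch'})
raw = urllib2.urlopen(req).read()
print raw[:300] # expect <ns id="0" .../>, <ns id="1" ...>Talk</ns>, ...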
@@ -62,14 +62,12 @@ def cleanHTML(raw=''):
         raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[1].split('</article>')[0]
     else:
         print raw[:250]
-        print 'This wiki doesn\'t use marks to split contain'
+        print 'This wiki doesn\'t use marks to split content'
         sys.exit()
     return raw

 def getNamespaces(config={}):
-    """ """
-    #fix get namespaces from a random Special:Export page, it is better
-    #too from API http://wikiindex.org/api.php?action=query&meta=siteinfo&siprop=general|namespaces
+    """ Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages. Function called if no API is available. """
     namespaces = config['namespaces']
     namespacenames = {0:''} # main is 0, no prefix
     if namespaces:
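The docstring added above describes the no-API fallback: scrape the namespace dropdown out of Special:AllPages. Roughly, that amounts to the sketch below; the option-tag regex and the URL are assumptions about MediaWiki's HTML, not lines from dumpgenerator.py:

import re
import urllib2

# Fetch Special:AllPages and pull namespace ids/names from the <select> options.
html = urllib2.urlopen('http://wikiindex.org/index.php?title=Special:AllPages').read()
namespacenames = {0: ''} # main is 0, no prefix
for opt in re.finditer(r'<option [^>]*?value="(\d+)"[^>]*>([^<]+)</option>', html):
    namespacenames[int(opt.group(1))] = opt.group(2)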
@@ -100,13 +98,44 @@ def getNamespaces(config={}):
     print '%d namespaces found' % (len(namespaces))
     return namespaces, namespacenames

+def getNamespacesAPI(config={}):
+    """ Uses the API to get the list of namespaces names and ids """
+    namespaces = config['namespaces']
+    namespacenames = {0:''} # main is 0, no prefix
+    if namespaces:
+        req = urllib2.Request(url=config['api'], data=urllib.urlencode({'action': 'query', 'meta': 'siteinfo', 'siprop': 'namespaces', 'format': 'xml'}), headers={'User-Agent': getUserAgent()})
+        f = urllib2.urlopen(req)
+        raw = f.read()
+        f.close()
+
+        m = re.compile(r'<ns id="(?P<namespaceid>\d+)"[^>]*?/?>(?P<namespacename>[^<]+)?(</ns>)?').finditer(raw) # [^>]*? to include case="first-letter" canonical= etc.
+        if 'all' in namespaces:
+            namespaces = []
+            for i in m:
+                namespaces.append(int(i.group("namespaceid")))
+                namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
+        else:
+            #check if those namespaces really exist in this wiki
+            namespaces2 = []
+            for i in m:
+                if int(i.group("namespaceid")) in namespaces:
+                    namespaces2.append(int(i.group("namespaceid")))
+                    namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
+            namespaces = namespaces2
+    else:
+        namespaces = [0]
+
+    namespaces = [i for i in set(namespaces)] #uniques
+    print '%d namespaces found' % (len(namespaces))
+    return namespaces, namespacenames
+
 def getPageTitlesAPI(config={}):
-    """ """
+    """ Uses the API to get the list of page titles """
     titles = []
-    namespaces, namespacenames = getNamespaces(config=config)
+    namespaces, namespacenames = getNamespacesAPI(config=config)
     for namespace in namespaces:
         if namespace in config['exnamespaces']:
-            print '    Skiping namespace =', namespace
+            print '    Skipping namespace =', namespace
             continue

         c = 0
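The commit message itself flags this regex-over-XML style as something a module should replace. Even without simplemediawiki, the same response parses cleanly with the standard library; a sketch of that alternative, not what the commit does:

import urllib
import urllib2
import xml.etree.ElementTree as ET

data = urllib.urlencode({'action': 'query', 'meta': 'siteinfo',
                         'siprop': 'namespaces', 'format': 'xml'})
req = urllib2.Request(url='http://wikiindex.org/api.php', data=data,
                      headers={'User-Agent': 'wikiteam-sketch'})
tree = ET.fromstring(urllib2.urlopen(req).read())
namespacenames = {}
for ns in tree.findall('.//ns'): # <ns id="1">Talk</ns>; <ns id="0"/> for main
    namespacenames[int(ns.get('id'))] = ns.text or ''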
@@ -456,7 +485,7 @@ def getImageFilenamesURL(config={}):
         f = urllib2.urlopen(req)
         raw = f.read()
         f.close()
-        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicated wiki
+        if re.search(ur'(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)', raw): # delicate wiki
             if limit > 10:
                 print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
                 limit = limit/10
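The surrounding logic (unchanged here except for the comment) is a back-off on server errors: when rendering the image listing exhausts PHP's memory limit, the chunk size is divided by ten and the request retried. The pattern in isolation, with fetch_image_list() as a hypothetical stand-in for the script's request code:

def list_images_chunk(offset, limit=500):
    # Shrink the chunk until the server can render the listing.
    while True:
        raw = fetch_image_list(offset, limit) # hypothetical helper, not in the script
        if 'Allowed memory size' not in raw:
            return raw
        if limit <= 10:
            raise RuntimeError('cannot list images even in tiny chunks')
        print 'Error: listing %d images in a chunk is not possible, trying tiny chunks' % (limit)
        limit = limit / 10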
@@ -520,6 +549,58 @@ def getImageFilenamesURL(config={}):
     images.sort()
     return images

+def getImageFilenamesURLAPI(config={}):
+    """ Retrieve file list: filename, url, uploader """
+    print 'Retrieving image filenames'
+    headers = {'User-Agent': getUserAgent()}
+    aifrom = '!'
+    images = []
+    while aifrom:
+        sys.stderr.write('.') #progress
+        params = {'action': 'query', 'list': 'allimages', 'aiprop': 'url|user', 'aifrom': aifrom, 'format': 'xml', 'ailimit': 500}
+        data = urllib.urlencode(params)
+        req = urllib2.Request(url=config['api'], data=data, headers=headers)
+        try:
+            f = urllib2.urlopen(req)
+        except:
+            try:
+                print 'Server is slow... Waiting some seconds and retrying...'
+                time.sleep(10)
+                f = urllib2.urlopen(req)
+            except:
+                print 'An error has occurred while retrieving page titles with API'
+                print 'Please, resume the dump, --resume'
+                sys.exit()
+        xml = f.read()
+        f.close()
+        m = re.findall(r'<allimages aifrom="([^>]+)" />', xml)
+        if m:
+            aifrom = undoHTMLEntities(text=m[0]) #&quot; = ", etc
+        else:
+            aifrom = ''
+        m = re.compile(r'(?im)<img name="(?P<filename>[^"]+)"[^>]*user="(?P<uploader>[^"]+)"[^>]* url="(?P<url>[^"]+)"[^>]*/>').finditer(xml) # Retrieves a filename, uploader, url triple from the name, user, url field of the xml line; space before url needed to avoid getting the descriptionurl field instead.
+        for i in m:
+            url = i.group('url')
+            if url[0] == '/' or (not url.startswith('http://') and not url.startswith('https://')): #is it a relative URL?
+                if url[0] == '/': #slash is added later
+                    url = url[1:]
+                domainalone = config['index'].split('://')[1].split('/')[0] #remove from :// (http or https) until the first / after domain
+                url = '%s://%s/%s' % (config['index'].split('://')[0], domainalone, url) # concat http(s) + domain + relative url
+            url = undoHTMLEntities(text=url)
+            #url = urllib.unquote(url) #do not use unquote with url, it break some urls with odd chars
+            url = re.sub(' ', '_', url)
+            filename = re.sub('_', ' ', i.group('filename'))
+            filename = undoHTMLEntities(text=filename)
+            filename = urllib.unquote(filename)
+            uploader = re.sub('_', ' ', i.group('uploader'))
+            uploader = undoHTMLEntities(text=uploader)
+            uploader = urllib.unquote(uploader)
+            images.append([filename, url, uploader])
+
+    print '    Found %d images' % (len(images))
+    images.sort()
+    return images
+
 def undoHTMLEntities(text=''):
     """ """
     text = re.sub('&lt;', '<', text) # i guess only < > & " need conversion http://www.w3schools.com/html/html_entities.asp
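getImageFilenamesURLAPI walks list=allimages using the API's continuation protocol: each response may carry an <allimages aifrom="..."/> element naming where the next batch starts, and an absent aifrom ends the loop (the initial '!' sorts before any real title). The same loop reduced to its core, parsed with ElementTree instead of regexes; a sketch under those assumptions, not the commit's code:

import urllib
import urllib2
import xml.etree.ElementTree as ET

def all_image_batches(api):
    aifrom = '!' # sorts before any title, so iteration starts at the beginning
    while aifrom:
        data = urllib.urlencode({'action': 'query', 'list': 'allimages',
                                 'aiprop': 'url|user', 'aifrom': aifrom,
                                 'ailimit': 500, 'format': 'xml'})
        req = urllib2.Request(url=api, data=data, headers={'User-Agent': 'wikiteam-sketch'})
        tree = ET.fromstring(urllib2.urlopen(req).read())
        yield tree.findall('.//img') # one batch of <img name=... url=... user=.../>
        cont = tree.find('.//allimages[@aifrom]') # continuation marker, if any
        aifrom = cont.get('aifrom') if cont is not None else ''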
@@ -951,6 +1032,9 @@ def main(params=[]):
         else:
             print 'Image list is incomplete. Reloading...'
             #do not resume, reload, to avoid inconsistences, deleted images or so
+            if config['api']:
+                images=getImageFilenamesURLAPI(config=config)
+            else:
                 images = getImageFilenamesURL(config=config)
             saveImageFilenamesURL(config=config, images=images)
         #checking images directory
@@ -991,6 +1075,9 @@ def main(params=[]):
         saveTitles(config=config, titles=titles)
         generateXMLDump(config=config, titles=titles)
         if config['images']:
+            if config['api']:
+                images += getImageFilenamesURLAPI(config=config)
+            else:
                 images += getImageFilenamesURL(config=config)
             saveImageFilenamesURL(config=config, images=images)
         generateImageDump(config=config, other=other, images=images)
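Both main() hunks wire up the same if config['api'] choice by hand. If this pattern spread further, it could be factored into a single dispatcher; a sketch, not part of this commit:

def getImageFilenames(config={}):
    """ Prefer the API when the wiki exposes one, else scrape the HTML """
    if config['api']:
        return getImageFilenamesURLAPI(config=config)
    return getImageFilenamesURL(config=config)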