mirror of
https://github.com/WikiTeam/wikiteam
synced 2024-11-04 12:00:28 +00:00
Handle another case of sub-pages in Special:Allpages (links of the form "Special:Allpages/<from>")
git-svn-id: https://wikiteam.googlecode.com/svn/trunk@41 31edc4fc-5e31-b4c4-d58b-c8bc928bcb95
This commit is contained in:
parent
6f336e8237
commit
fe94af27c7
@ -93,25 +93,40 @@ def getPageTitles(config={}, start='!'):
|
||||
raw = cleanHTML(raw)
|
||||
|
||||
r_title = r'title="(?P<title>[^>]+)">'
|
||||
r_suballpages = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">'
|
||||
r_suballpages = ''
|
||||
r_suballpages1 = r'&from=(?P<from>[^>]+)&to=(?P<to>[^>]+)">'
|
||||
r_suballpages2 = r'Special:Allpages/(?P<from>[^>]+)">'
|
||||
if re.search(r_suballpages1, raw):
|
||||
r_suballpages = r_suballpages1
|
||||
elif re.search(r_suballpages2, raw):
|
||||
r_suballpages = r_suballpages2
|
||||
else:
|
||||
pass #perhaps no subpages
|
||||
|
||||
deep = 3 # 3 is the current deep of English Wikipedia for Special:Allpages, 3 levels
|
||||
c = 0
|
||||
checked_suballpages = []
|
||||
rawacum = raw
|
||||
while re.search(r_suballpages, raw) and c < deep:
|
||||
while r_suballpages and re.search(r_suballpages, raw) and c < deep:
|
||||
#load sub-Allpages
|
||||
m = re.compile(r_suballpages).finditer(raw)
|
||||
for i in m:
|
||||
fr = i.group('from')
|
||||
to = i.group('to')
|
||||
name = '%s-%s' % (fr, to)
|
||||
|
||||
if r_suballpages == r_suballpages1:
|
||||
to = i.group('to')
|
||||
name = '%s-%s' % (fr, to)
|
||||
url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (config['domain'], namespace, fr, to) #do not put urllib.quote in fr or to
|
||||
elif r_suballpages == r_suballpages2:
|
||||
name = fr
|
||||
url = '%s?title=Special:Allpages/%s&namespace=%s' % (config['domain'], name, namespace)
|
||||
|
||||
if not name in checked_suballpages:
|
||||
checked_suballpages.append(name)
|
||||
url = '%s?title=Special:Allpages&namespace=%s&from=%s&to=%s' % (config['domain'], namespace, fr, to) #do not put urllib.quote in fr or to
|
||||
raw2 = urllib.urlopen(url).read()
|
||||
raw2 = cleanHTML(raw2)
|
||||
rawacum += raw2 #merge it after removed junk
|
||||
print ' Detected sub-Allpages:', name, len(raw2), 'bytes', len(re.findall(r_title, raw2))
|
||||
print ' Reading', name, len(raw2), 'bytes', len(re.findall(r_suballpages, raw2)), 'subpages', len(re.findall(r_title, raw2)), 'pages'
|
||||
c += 1
|
||||
|
||||
m = re.compile(r_title).finditer(rawacum)
|
||||
|
Loading…
Reference in New Issue
Block a user