Fix getPageTitlesScraper

Using the API and the Special:Allpages scraper should result in the same number of titles.
Fix the detection of the next subpages on Special:Allpages.
Change the max depth to 100 and implement an anti-loop check (could fail on non-Western wikis).
pull/394/head
Nicolas SAPA 4 years ago
parent 1048bc3275
commit b289f86243
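
The message describes two safeguards for the Special:Allpages scraper: a larger cap on how many continuation subpages are followed, and a check that stops when the same 'from' marker comes back twice in a row. Below is a minimal sketch of that pattern; the function name, the fetch_next_from callable and the sample data are hypothetical stand-ins, not code from dumpgenerator.py.

# Hypothetical sketch: follow Special:Allpages continuation markers,
# stop after a fixed depth, and stop early when the same 'from' marker
# repeats (i.e. the wiki keeps pointing back to the same subpage).
def scrape_allpages(fetch_next_from, start='', max_depth=50):
    collected = []
    oldfr = None
    fr = start
    depth = 0
    while depth < max_depth:
        currfr = fetch_next_from(fr)  # next 'from' marker, or None when exhausted
        if currfr is None or currfr == oldfr:
            # No further subpages, or we are looping: exit instead of retrying forever.
            break
        collected.append(currfr)
        oldfr = currfr
        fr = currfr
        depth += 1
    return collected

# Fake continuation chain that starts repeating after three subpages:
chain = {'': 'Aardvark', 'Aardvark': 'Badger', 'Badger': 'Cattle', 'Cattle': 'Cattle'}
print(scrape_allpages(chain.get))  # ['Aardvark', 'Badger', 'Cattle']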

@@ -299,9 +299,10 @@ def getPageTitlesScraper(config={}, session=None):
         else:
             pass # perhaps no subpages
 
-        # 3 is the current deep of English Wikipedia for Special:Allpages
-        deep = 3
+        # Should be enough subpages on Special:Allpages
+        deep = 50
         c = 0
+        oldfr = ''
         checked_suballpages = []
         rawacum = raw
         while r_suballpages and re.search(r_suballpages, raw) and c < deep:
@@ -309,6 +310,11 @@
             m = re.compile(r_suballpages).finditer(raw)
             for i in m:
                 fr = i.group('from')
+                currfr = fr
+
+                if oldfr == currfr:
+                    # We are looping, exit the loop
+                    pass
 
                 if r_suballpages == r_suballpages1:
                     to = i.group('to')
@@ -329,19 +335,23 @@
                     url = '%s?title=Special:Allpages&from=%s&namespace=%s' % (
                         config['index'], name, namespace)
 
                 if name not in checked_suballpages:
                     # to avoid reload dupe subpages links
                     checked_suballpages.append(name)
                     delay(config=config, session=session)
-                    r2 = session.get(url=url, timeout=10)
-                    raw2 = r2.text
-                    raw2 = cleanHTML(raw2)
-                    rawacum += raw2 # merge it after removed junk
-                    print ' Reading', name, len(raw2), 'bytes', \
-                        len(re.findall(r_suballpages, raw2)), 'subpages', \
-                        len(re.findall(r_title, raw2)), 'pages'
+                    r = session.get(url=url, timeout=10)
+                    #print 'Fetching URL: ', url
+                    raw = r.text
+                    raw = cleanHTML(raw)
+                    rawacum += raw # merge it after removed junk
+                    print ' Reading', name, len(raw), 'bytes', \
+                        len(re.findall(r_suballpages, raw)), 'subpages', \
+                        len(re.findall(r_title, raw)), 'pages'
 
             delay(config=config, session=session)
+            oldfr = currfr
             c += 1
 
         c = 0
