Merge pull request #124 from balr0g/scraper-unicode-title-fix

Fix scraper for sites with Unicode titles
pull/125/head
nemobis 10 years ago
commit 1933db8a94

@@ -286,9 +286,10 @@ def getPageTitlesScraper(config={}):
c = 0
m = re.compile(r_title).finditer(rawacum)
for i in m:
if not i.group('title').startswith('Special:'):
if not i.group('title') in titles:
titles.append(undoHTMLEntities(text=i.group('title')))
t = undoHTMLEntities(text=unicode(i.group('title'), 'utf-8'))
if not t.startswith('Special:'):
if not t in titles:
titles.append(t)
c += 1
print ' %d titles retrieved in the namespace %d' % (c, namespace)
return titles

Loading…
Cancel
Save