@@ -50,7 +50,7 @@ def getPageTitlesScraper(config: Config=None, session=None):
     namespaces, namespacenames = getNamespacesScraper(config=config, session=session)
     for namespace in namespaces:
         print(" Retrieving titles in the namespace", namespace)
-        url = "{}?title=Special:Allpages&namespace: Dict=None".format(
+        url = "{}?title=Special:Allpages&namespace={}".format(
             config.index, namespace
         )
         r = session.get(url=url, timeout=30)
@@ -92,7 +92,7 @@ def getPageTitlesScraper(config: Config=None, session=None):
                 if r_suballpages == r_suballpages1:
                     to = i.group("to")
                     name = f"{fr}-{to}"
-                    url = "{}?title=Special:Allpages&namespace: Dict=None&from: Dict=None&to: Dict=None".format(
+                    url = "{}?title=Special:Allpages&namespace={}&from={}&to={}".format(
                         config.index,
                         namespace,
                         fr,
@@ -104,7 +104,7 @@ def getPageTitlesScraper(config: Config=None, session=None):
                     # clean &namespace=\d, sometimes happens
                     fr = fr.split("&namespace=")[0]
                     name = fr
-                    url = "{}?title=Special:Allpages/{}&namespace: Dict=None".format(
+                    url = "{}?title=Special:Allpages/{}&namespace={}".format(
                         config.index,
                         name,
                         namespace,
@@ -112,7 +112,7 @@ def getPageTitlesScraper(config: Config=None, session=None):
                 elif r_suballpages == r_suballpages3:
                     fr = fr.split("&namespace=")[0]
                     name = fr
-                    url = "{}?title=Special:Allpages&from: Dict=None&namespace: Dict=None".format(
+                    url = "{}?title=Special:Allpages&from={}&namespace={}".format(
                         config.index,
                         name,
                         namespace,
@@ -124,8 +124,8 @@ def getPageTitlesScraper(config: Config=None, session=None):
                     # to avoid reload dupe subpages links
                     checked_suballpages.append(name)
                     Delay(config=config, session=session)
-                    r = session.get(url=url, timeout=10)
                     # print ('Fetching URL: ', url)
+                    r = session.get(url=url, timeout=10)
                     raw = str(r.text)
                     raw = cleanHTML(raw)
                     rawacum += raw  # merge it after removed junk