fix: cannot retrieve all titles using the --index method

pull/475/head
yzqzss 1 year ago committed by yzqzss
parent 3d0520abb0
commit e3295a7564

@@ -50,7 +50,7 @@ def getPageTitlesScraper(config: Config=None, session=None):
namespaces, namespacenames = getNamespacesScraper(config=config, session=session)
for namespace in namespaces:
print(" Retrieving titles in the namespace", namespace)
- url = "{}?title=Special:Allpages&namespace: Dict=None".format(
+ url = "{}?title=Special:Allpages&namespace={}".format(
config.index, namespace
)
r = session.get(url=url, timeout=30)
@@ -92,7 +92,7 @@ def getPageTitlesScraper(config: Config=None, session=None):
if r_suballpages == r_suballpages1:
to = i.group("to")
name = f"{fr}-{to}"
- url = "{}?title=Special:Allpages&namespace: Dict=None&from: Dict=None&to: Dict=None".format(
+ url = "{}?title=Special:Allpages&namespace={}&from={}&to={}".format(
config.index,
namespace,
fr,
@@ -104,7 +104,7 @@ def getPageTitlesScraper(config: Config=None, session=None):
# clean &namespace=\d, sometimes happens
fr = fr.split("&namespace=")[0]
name = fr
- url = "{}?title=Special:Allpages/{}&namespace: Dict=None".format(
+ url = "{}?title=Special:Allpages/{}&namespace={}".format(
config.index,
name,
namespace,
@@ -112,7 +112,7 @@ def getPageTitlesScraper(config: Config=None, session=None):
elif r_suballpages == r_suballpages3:
fr = fr.split("&namespace=")[0]
name = fr
- url = "{}?title=Special:Allpages&from: Dict=None&namespace: Dict=None".format(
+ url = "{}?title=Special:Allpages&from={}&namespace={}".format(
config.index,
name,
namespace,
@@ -124,8 +124,8 @@ def getPageTitlesScraper(config: Config=None, session=None):
# to avoid reload dupe subpages links
checked_suballpages.append(name)
Delay(config=config, session=session)
- r = session.get(url=url, timeout=10)
# print ('Fetching URL: ', url)
+ r = session.get(url=url, timeout=10)
raw = str(r.text)
raw = cleanHTML(raw)
rawacum += raw # merge it after removed junk

Loading…
Cancel
Save