import sys
import time
from datetime import datetime
from typing import *
from urllib.parse import urlparse

import lxml.etree
import mwclient
import requests

from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI
from wikiteam3.dumpgenerator.api.page_titles import readTitles
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import (
    makeXmlFromPage,
    makeXmlPageFromRaw,
)
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror

ALL_NAMESPACE = -1

def getXMLRevisionsByAllRevisions(
    config: Config = None,
    session=None,
    site: mwclient.Site = None,
    nscontinue=None,
    arvcontinue=None,
):
    if "all" not in config.namespaces:
        namespaces = config.namespaces
    else:
        # namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
        namespaces = [ALL_NAMESPACE]  # magic number refers to "all"
    _nscontinue = nscontinue
    _arvcontinue = arvcontinue

    for namespace in namespaces:
        # Skip namespaces that were already retrieved
        if namespace == ALL_NAMESPACE:
            assert (
                len(namespaces) == 1
            ), "Only one item should be present when the 'all' namespace is specified"
            _nscontinue = None
        elif _nscontinue is not None:
            if namespace != _nscontinue:
                print("Skipping already exported namespace: %d" % namespace)
                continue
            _nscontinue = None

        print(f"Trying to export all revisions from namespace {namespace}")
        # arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
        arvparams = {
            "action": "query",
            "list": "allrevisions",
            "arvlimit": config.api_chunksize,
            "arvdir": "newer",
        }
        if namespace != ALL_NAMESPACE:
            arvparams["arvnamespace"] = namespace
        if _arvcontinue is not None:
            arvparams["arvcontinue"] = _arvcontinue
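
        # Added note (illustrative; hostname and limit are made up): with these
        # params, the calls below boil down to requests like
        #   https://wiki.example.org/api.php?action=query&list=allrevisions
        #       &arvlimit=50&arvdir=newer&format=json
        # and each successful response carries one batch plus, if there is more,
        # a continuation token:
        #   {"continue": {"arvcontinue": "20150101000000|12345", ...},
        #    "query": {"allrevisions": [ ...pages with their revisions... ]}}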
        if not config.curonly:
            # We have to build the XML manually...
            # Skip flags, presumably needed to add <minor/> which is in the schema.
            # Also missing: parentid and contentformat.
            arvparams[
                "arvprop"
            ] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags"
            print(
                "Trying to get wikitext from the allrevisions API and to build the XML"
            )
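            # Added note (field set reflects the arvprop above; JSON shape hedged,
            # formatversion=1 style): each item iterated below looks roughly like
            #   {"pageid": 1, "ns": 0, "title": "Main Page",
            #    "revisions": [{"revid": ..., "timestamp": ..., "user": ...,
            #                   "sha1": ..., "comment": ..., "*": "<wikitext>"}]}
            # and makeXmlFromPage serializes one such dict into a <page> element.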
            while True:
                try:
                    arvrequest = site.api(http_method=config.http_method, **arvparams)
                except requests.exceptions.HTTPError as e:
                    if e.response.status_code != 405 or config.http_method != "POST":
                        raise
                    print("POST request to the API failed, retrying with GET")
                    config.http_method = "GET"
                    continue
                except requests.exceptions.ReadTimeout as err:
                    # Hopefully temporary, just wait a bit and continue with the same request.
                    # No point putting a limit on retries, we'd need to abort everything.
                    # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
                    # to use the retry adapter we use for our own requests session?
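                    # One possible shape for that TODO (a sketch, not verified
                    # against mwclient internals): mount urllib3 retries on the
                    # session we already share with mwclient, e.g.
                    #   from requests.adapters import HTTPAdapter
                    #   from urllib3.util.retry import Retry
                    #   session.mount("https://", HTTPAdapter(max_retries=Retry(total=5, backoff_factor=2)))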
print(f"ERROR: {str(err)}")
|
|
print("Sleeping for 20 seconds")
|
|
time.sleep(20)
|
|
continue
|
|
except mwclient.errors.InvalidResponse as e:
|
|
if (
|
|
not e.response_text.startswith("<!DOCTYPE html>")
|
|
or config.http_method != "POST"
|
|
):
|
|
raise
|
|
|
|
print(
|
|
"POST request to the API failed (got HTML), retrying with GET"
|
|
)
|
|
config.http_method = "GET"
|
|
continue
|
|
for page in arvrequest["query"]["allrevisions"]:
|
|
yield makeXmlFromPage(page, arvparams.get("arvcontinue", ""))
|
|
if "continue" in arvrequest:
|
|
arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
|
|
else:
|
|
# End of continuation. We are done with this namespace.
|
|
break
|
|
|
|
        else:
            # FIXME: this is not curonly, just a different strategy for all revisions
            # Just cycle through revision IDs and use the XML as is
            print("Trying to list the revisions and to export them one by one")
            # We only need the revision ID; all the rest will come from the raw export
            arvparams["arvprop"] = "ids"
            try:
                arvrequest = site.api(http_method=config.http_method, **arvparams)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code != 405 or config.http_method != "POST":
                    raise
                print("POST request to the API failed, retrying with GET")
                config.http_method = "GET"
                continue
            exportparams = {
                "action": "query",
                "export": "1",
            }
            # Skip the namespace if it's empty
            if len(arvrequest["query"]["allrevisions"]) < 1:
                continue
            # Repeat the arvrequest with new arvparams until done
            while True:
                # Reset revision IDs from the previous batch from arv
                revids = []
                for page in arvrequest["query"]["allrevisions"]:
                    revids.extend(
                        str(revision["revid"]) for revision in page["revisions"]
                    )
                print(
                    " %d more revisions listed, until %s"
                    % (len(revids), revids[-1])
                )

                # We can now get the XML for one revision at a time.
                # FIXME: we could actually get them in batches as we used to,
                # but we would need to figure out the continuation and keep the
                # API from giving us only the latest revision of each page.
                for revid in revids:
                    exportparams["revids"] = revid
                    try:
                        exportrequest = site.api(
                            http_method=config.http_method, **exportparams
                        )
                    except requests.exceptions.HTTPError as e:
                        if (
                            e.response.status_code != 405
                            or config.http_method != "POST"
                        ):
                            raise

                        print("POST request to the API failed, retrying with GET")
                        config.http_method = "GET"
                        exportrequest = site.api(
                            http_method=config.http_method, **exportparams
                        )
                    # This gives us a self-standing <mediawiki> element
                    # but we only need the inner <page>: we can live with
                    # duplication and non-ordering of page titles, but the
                    # repeated header is confusing and would not even be valid
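                    # Added note (schema version varies by wiki): the export string
                    # unwrapped below is a complete dump document along the lines of
                    #   <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" ...>
                    #     <siteinfo>...</siteinfo>
                    #     <page><title>...</title><revision>...</revision></page>
                    #   </mediawiki>
                    # and makeXmlPageFromRaw keeps only the inner <page> element.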
                    xml = exportrequest["query"]["export"]["*"]  # type(xml) == str
                    yield makeXmlPageFromRaw(xml, arvparams.get("arvcontinue", ""))

                if "continue" not in arvrequest:
                    # End of continuation. We are done with this namespace.
                    break
                # Get the new ones
                arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
                try:
                    arvrequest = site.api(http_method=config.http_method, **arvparams)
                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 405 and config.http_method == "POST":
                        print("POST request to the API failed, retrying with GET")
                        config.http_method = "GET"
                        arvrequest = site.api(
                            http_method=config.http_method, **arvparams
                        )
                except requests.exceptions.ReadTimeout as err:
                    # As above
                    print(f"ERROR: {str(err)}")
                    print("Sleeping for 20 seconds")
                    time.sleep(20)
                    # But avoid rewriting the same revisions
                    arvrequest["query"]["allrevisions"] = []


def getXMLRevisionsByTitles(
    config: Config = None, session=None, site: mwclient.Site = None, start=None
):
    c = 0
    if config.curonly:
        # The raw XML export in the API gets a title and gives the latest revision.
        # We could also use the allpages API as generator, but let's be consistent.
        print("Getting titles to export the latest revision for each")
        for title in readTitles(config, session=session, start=start):
            # TODO: respect verbose flag, reuse output from getXMLPage
            print(f" {title}")
            # TODO: as we're doing one page and revision at a time, we might
            # as well use xml format and exportnowrap=1 to use the string of
            # XML as is, but we need to check how well the library handles it.
            exportparams = {
                "action": "query",
                "titles": title,
                "export": "1",
            }
            try:
                exportrequest = site.api(http_method=config.http_method, **exportparams)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code != 405 or config.http_method != "POST":
                    raise

                print("POST request to the API failed, retrying with GET")
                config.http_method = "GET"
                exportrequest = site.api(http_method=config.http_method, **exportparams)
            xml = str(exportrequest["query"]["export"]["*"])
            c += 1
            if c % 10 == 0:
                print(f"\n-> Downloaded {c} pages\n")
            # Because we got the fancy XML from the JSON format, clean it:
            yield makeXmlPageFromRaw(xml, None)
    else:
        # This is the closest to what we usually do with Special:Export:
        # take one title at a time and try to get all revisions exported.
        # It differs from the allrevisions method because it actually needs
        # the page titles as input; otherwise, the requests are similar.
        # The XML needs to be made manually because the export=1 option
        # refuses to return an arbitrary number of revisions (see above).
        print("Getting titles to export all the revisions of each")
        titlelist = []
        # TODO: decide on a suitable size for a batched request. Careful:
        # batched responses may not return all revisions.
        for titlelist in readTitles(config, session=session, start=start, batch=False):
            if type(titlelist) is not list:
                titlelist = [titlelist]
            for title in titlelist:
                print(f" {title}")
            # Try to ask for everything. At least on MediaWiki 1.16, unknown props are discarded:
            # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
            pparams = {
                "action": "query",
                "titles": "|".join(titlelist),
                "prop": "revisions",
                "rvlimit": config.api_chunksize,
                "rvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags",
            }
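            # Added note (shape indicative, JSON formatversion=1 style): the
            # response is keyed by pageid, e.g.
            #   {"query": {"pages": {"123": {"pageid": 123, "ns": 0,
            #       "title": "Example", "revisions": [{"revid": ..., "*": "<wikitext>"}]}}}}
            # which is why the loop below iterates over prequest["query"]["pages"].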
            try:
                prequest = site.api(http_method=config.http_method, **pparams)
            except requests.exceptions.HTTPError as e:
                if e.response.status_code != 405 or config.http_method != "POST":
                    raise
                print("POST request to the API failed, retrying with GET")
                config.http_method = "GET"
                prequest = site.api(http_method=config.http_method, **pparams)
            except mwclient.errors.InvalidResponse:
                logerror(
                    config=config,
                    to_stdout=True,
                    text=f'Error: page inaccessible? Could not export page: {"; ".join(titlelist)}',
                )
                continue

            # Be ready to iterate if there is continuation.
            while True:
                # Get the revision data returned by the API: prequest is either the
                # initial request or the new one made after continuation at the
                # bottom of this while loop. The array is called "pages" even if
                # there's only one page.
                try:
                    pages = prequest["query"]["pages"]
                except KeyError:
                    logerror(
                        config=config,
                        to_stdout=True,
                        text=f'Error: page inaccessible? Could not export page: {"; ".join(titlelist)}',
                    )
                    break
                # Go through the data we got to build the XML.
                for pageid in pages:
                    try:
                        yield makeXmlFromPage(pages[pageid], None)
                    except PageMissingError:
                        logerror(
                            config=config,
                            to_stdout=True,
                            text=f'Error: empty revision from API. Could not export page: {"; ".join(titlelist)}',
                        )
                        continue

                # Get the next batch of revisions if there are more.
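                # Added note (values illustrative): recent MediaWiki returns e.g.
                #   {"continue": {"rvcontinue": "20150101000000|12345", "continue": "||"}}
                # while older wikis use the legacy form
                #   {"query-continue": {"revisions": {"rvstartid": 12345}}}
                # and both are handled below.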
if "continue" in prequest.keys():
|
|
print("Getting more revisions for the page")
|
|
for key, value in prequest["continue"].items():
|
|
pparams[key] = value
|
|
elif "query-continue" in prequest.keys():
|
|
rvstartid = prequest["query-continue"]["revisions"]["rvstartid"]
|
|
pparams["rvstartid"] = rvstartid
|
|
else:
|
|
break
|
|
|
|
try:
|
|
prequest = site.api(http_method=config.http_method, **pparams)
|
|
except requests.exceptions.HTTPError as e:
|
|
if e.response.status_code == 405 and config.http_method == "POST":
|
|
print("POST request to the API failed, retrying with GET")
|
|
config.http_method = "GET"
|
|
prequest = site.api(http_method=config.http_method, **pparams)
|
|
|
|
            # We're done iterating for this title or titles.
            c += len(titlelist)
            # Reset for the next batch.
            titlelist = []
            if c % 10 == 0:
                print(f"\n-> Downloaded {c} pages\n")


def getXMLRevisions(
    config: Config = None, session=None, useAllrevision=True, lastPage=None
):
    # FIXME: actually figure out the various strategies for each MediaWiki version
    apiurl = urlparse(config.api)
    # FIXME: force the protocol we asked for! Or don't verify SSL if we asked for HTTP?
    # https://github.com/WikiTeam/wikiteam/issues/358
    site = mwclient.Site(
        apiurl.netloc,
        apiurl.path.replace("api.php", ""),
        scheme=apiurl.scheme,
        pool=session,
    )
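    # Added note (hypothetical URL): for config.api == "https://wiki.example.org/w/api.php"
    # this builds Site(host="wiki.example.org", path="/w/", scheme="https"),
    # pointing mwclient at the same endpoint our own requests session uses.
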
    if useAllrevision:
        # Find the last title
        if lastPage is not None:
            try:
                lastNs = int(lastPage.find("ns").text)
                lastArvcontinue = lastPage.attrib["arvcontinue"]
            except Exception:
                print(
                    f"Failed to find title in last chunk XML: {lxml.etree.tostring(lastPage)}"
                )
                raise
            nscontinue = lastNs
            arvcontinue = lastArvcontinue
            if not arvcontinue:
                arvcontinue = None
        else:
            nscontinue = None
            arvcontinue = None
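
        # Added note (structure inferred from the accesses above): lastPage is a
        # parsed <page> element from the interrupted dump, roughly
        #   <page arvcontinue="20150101000000|12345"><ns>0</ns><title>...</title>...</page>
        # so the namespace and arvcontinue token tell allrevisions where to resume.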
        try:
            return getXMLRevisionsByAllRevisions(
                config, session, site, nscontinue, arvcontinue
            )
        except (KeyError, mwclient.errors.InvalidResponse) as e:
            print(e)
            # TODO: check whether the KeyError was really for a missing arv API
            print(
                "Warning. Could not use allrevisions. Wiki too old? Try to use --xmlrevisions_page"
            )
            sys.exit()
    else:
        # Find the last title
        if lastPage is not None:
            try:
                start = lastPage.find("title")
            except Exception:
                print(
                    f"Failed to find title in last chunk XML: {lxml.etree.tostring(lastPage)}"
                )
                raise
        else:
            start = None

        try:
            # # Uncomment these lines to raise a KeyError for testing
            # raise KeyError(999999)
            # # DO NOT UNCOMMENT IN RELEASE
            return getXMLRevisionsByTitles(config, session, site, start)
        except mwclient.errors.MwClientError as e:
            print(e)
            print("This mwclient version seems not to work for us. Exiting.")
            sys.exit()
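

# Usage sketch (added for illustration; "dumpfile" and the Config constructor
# arguments are hypothetical, but the attribute names match what this module
# reads from Config):
#
#     import requests
#
#     config = Config(...)  # must carry api, namespaces, curonly,
#                           # api_chunksize and http_method, as used above
#     for page_xml in getXMLRevisions(config, session=requests.Session()):
#         dumpfile.write(page_xml)  # hypothetical sink for the <page> strings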