mirror of https://github.com/WikiTeam/wikiteam
Refactor XML dump-related code and improve xmlrevisions logic (#103)
parent 0fb53ffdde
commit be983b0814
@@ -0,0 +1,398 @@
from datetime import datetime
from typing import *
import sys
import time
from urllib.parse import urlparse
import lxml.etree

import mwclient
import requests

from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI
from wikiteam3.dumpgenerator.api.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import makeXmlFromPage, makeXmlPageFromRaw
from wikiteam3.dumpgenerator.config import Config

def getXMLRevisionsByAllRevisions(config: Config=None, session=None, site: mwclient.Site=None, nscontinue=None, arvcontinue=None):
    if "all" not in config.namespaces:
        namespaces = config.namespaces
    else:
        namespaces, namespacenames = getNamespacesAPI(config=config, session=session)

    _nscontinue = nscontinue
    _arvcontinue = arvcontinue

    for namespace in namespaces:
        if _nscontinue is not None:
            if namespace != _nscontinue:
                print("Skipping already exported namespace: %d" % namespace)
                continue
            _nscontinue = None

        print("Trying to export all revisions from namespace %s" % namespace)
        # arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
        arvparams = {
            "action": "query",
            "list": "allrevisions",
            "arvlimit": config.api_chunksize,
            "arvnamespace": namespace,
        }
        if _arvcontinue is not None:
            arvparams['arvcontinue'] = _arvcontinue
        if not config.curonly:
            # We have to build the XML manually...
            # Skip flags, presumably needed to add <minor/> which is in the schema.
            # Also missing: parentid and contentformat.
            arvparams[
                "arvprop"
            ] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content"
            print(
                "Trying to get wikitext from the allrevisions API and to build the XML"
            )
            while True:
                try:
                    arvrequest = site.api(
                        http_method=config.http_method, **arvparams
                    )
                except requests.exceptions.HTTPError as e:
                    if (
                        e.response.status_code == 405
                        and config.http_method == "POST"
                    ):
                        print("POST request to the API failed, retrying with GET")
                        config.http_method = "GET"
                        continue
                    else:
                        raise
                except requests.exceptions.ReadTimeout as err:
                    # Hopefully temporary, just wait a bit and continue with the same request.
                    # No point putting a limit to retries, we'd need to abort everything.
                    # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
                    # to use the retry adapter we use for our own requests session?
                    print(f"ERROR: {str(err)}")
                    print("Sleeping for 20 seconds")
                    time.sleep(20)
                    continue

                for page in arvrequest["query"]["allrevisions"]:
                    yield makeXmlFromPage(page, arvparams.get("arvcontinue", ""))
                if "continue" in arvrequest:
                    arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
                else:
                    # End of continuation. We are done with this namespace.
                    break

        else:
            # FIXME: this is not curonly, just different strategy to do all revisions
            # Just cycle through revision IDs and use the XML as is
            print("Trying to list the revisions and to export them one by one")
            # We only need the revision ID, all the rest will come from the raw export
            arvparams["arvprop"] = "ids"
            try:
                arvrequest = site.api(
                    http_method=config.http_method, **arvparams
                )
            except requests.exceptions.HTTPError as e:
                if (
                    e.response.status_code == 405
                    and config.http_method == "POST"
                ):
                    print("POST request to the API failed, retrying with GET")
                    config.http_method = "GET"
                    continue
                else:
                    raise
            exportparams = {
                "action": "query",
                "export": "1",
            }
            # Skip the namespace if it's empty
            if len(arvrequest["query"]["allrevisions"]) < 1:
                continue
            # Repeat the arvrequest with new arvparams until done
            while True:
                # Reset revision IDs from the previous batch from arv
                revids = []
                for page in arvrequest["query"]["allrevisions"]:
                    for revision in page["revisions"]:
                        revids.append(str(revision["revid"]))
                print(
                    " %d more revisions listed, until %s"
                    % (len(revids), revids[-1])
                )

                # We can now get the XML for one revision at a time
                # FIXME: we can actually get them in batches as we used to
                # but need to figure out the continuation and avoid that the API
                # chooses to give us only the latest for each page
                for revid in revids:
                    exportparams["revids"] = revid
                    try:
                        exportrequest = site.api(
                            http_method=config.http_method, **exportparams
                        )
                    except requests.exceptions.HTTPError as e:
                        if (
                            e.response.status_code == 405
                            and config.http_method == "POST"
                        ):
                            print(
                                "POST request to the API failed, retrying with GET"
                            )
                            config.http_method = "GET"
                            exportrequest = site.api(
                                http_method=config.http_method, **exportparams
                            )
                        else:
                            raise

                    # This gives us a self-standing <mediawiki> element
                    # but we only need the inner <page>: we can live with
                    # duplication and non-ordering of page titles, but the
                    # repeated header is confusing and would not even be valid
                    xml = exportrequest["query"]["export"]["*"]  # type(xml) == str
                    yield makeXmlPageFromRaw(xml, arvparams.get("arvcontinue", ""))

                if "continue" in arvrequest:
                    # Get the new ones
                    arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
                    try:
                        arvrequest = site.api(
                            http_method=config.http_method, **arvparams
                        )
                    except requests.exceptions.HTTPError as e:
                        if (
                            e.response.status_code == 405
                            and config.http_method == "POST"
                        ):
                            print(
                                "POST request to the API failed, retrying with GET"
                            )
                            config.http_method = "GET"
                            arvrequest = site.api(
                                http_method=config.http_method, **arvparams
                            )
                    except requests.exceptions.ReadTimeout as err:
                        # As above
                        print(f"ERROR: {str(err)}")
                        print("Sleeping for 20 seconds")
                        time.sleep(20)
                        # But avoid rewriting the same revisions
                        arvrequest["query"]["allrevisions"] = []
                        continue
                else:
                    # End of continuation. We are done with this namespace.
                    break


def getXMLRevisionsByTitles(config: Config=None, session=None, site: mwclient.Site=None, start=None):
    if config.curonly:
        # The raw XML export in the API gets a title and gives the latest revision.
        # We could also use the allpages API as generator but let's be consistent.
        print("Getting titles to export the latest revision for each")
        c = 0
        for title in readTitles(config, session=session, start=start):
            # TODO: respect verbose flag, reuse output from getXMLPage
            print(f" {title}")
            # TODO: as we're doing one page and revision at a time, we might
            # as well use xml format and exportnowrap=1 to use the string of
            # XML as is, but need to check how well the library handles it.
            exportparams = {
                "action": "query",
                "titles": title,
                "export": "1",
            }
            try:
                exportrequest = site.api(
                    http_method=config.http_method, **exportparams
                )
            except requests.exceptions.HTTPError as e:
                if (
                    e.response.status_code == 405
                    and config.http_method == "POST"
                ):
                    print("POST request to the API failed, retrying with GET")
                    config.http_method = "GET"
                    exportrequest = site.api(
                        http_method=config.http_method, **exportparams
                    )
                else:
                    raise

            xml = str(exportrequest["query"]["export"]["*"])
            c += 1
            if c % 10 == 0:
                print(f"\n-> Downloaded {c} pages\n")
            # Because we got the fancy XML from the JSON format, clean it:
            yield makeXmlPageFromRaw(xml, None)
    else:
        # This is the closest to what we usually do with Special:Export:
        # take one title at a time and try to get all revisions exported.
        # It differs from the allrevisions method because it actually needs
        # to be input the page titles; otherwise, the requests are similar.
        # The XML needs to be made manually because the export=1 option
        # refuses to return an arbitrary number of revisions (see above).
        print("Getting titles to export all the revisions of each")
        c = 0
        titlelist = []
        # TODO: Decide a suitable number of a batched request. Careful:
        # batched responses may not return all revisions.
        for titlelist in readTitles(config, session=session, start=start, batch=False):
            if type(titlelist) is not list:
                titlelist = [titlelist]
            for title in titlelist:
                print(f" {title}")
            # Try and ask everything. At least on MediaWiki 1.16, unknown props are discarded:
            # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
            pparams = {
                "action": "query",
                "titles": "|".join(titlelist),
                "prop": "revisions",
                # 'rvlimit': 50,
                "rvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content",
            }
            try:
                prequest = site.api(http_method=config.http_method, **pparams)
            except requests.exceptions.HTTPError as e:
                if (
                    e.response.status_code == 405
                    and config.http_method == "POST"
                ):
                    print("POST request to the API failed, retrying with GET")
                    config.http_method = "GET"
                    prequest = site.api(
                        http_method=config.http_method, **pparams
                    )
                else:
                    raise
            except mwclient.errors.InvalidResponse:
                logerror(
                    config=config, to_stdout=True,
                    text="Error: page inaccessible? Could not export page: %s"
                    % ("; ".join(titlelist)),
                )
                continue

            # Be ready to iterate if there is continuation.
            while True:
                # Get the revision data returned by the API: prequest is the initial request
                # or the new one after continuation at the bottom of this while loop.
                # The array is called "pages" even if there's only one.
                try:
                    pages = prequest["query"]["pages"]
                except KeyError:
                    logerror(
                        config=config, to_stdout=True,
                        text="Error: page inaccessible? Could not export page: %s"
                        % ("; ".join(titlelist)),
                    )
                    break
                # Go through the data we got to build the XML.
                for pageid in pages:
                    try:
                        xml = makeXmlFromPage(pages[pageid], None)
                        yield xml
                    except PageMissingError:
                        logerror(
                            config=config, to_stdout=True,
                            text="Error: empty revision from API. Could not export page: %s"
                            % ("; ".join(titlelist)),
                        )
                        continue

                # Get next batch of revisions if there's more.
                if "continue" in prequest.keys():
                    print("Getting more revisions for the page")
                    for key, value in prequest["continue"].items():
                        pparams[key] = value
                elif "query-continue" in prequest.keys():
                    rvstartid = prequest["query-continue"]["revisions"]["rvstartid"]
                    pparams["rvstartid"] = rvstartid
                else:
                    break

                try:
                    prequest = site.api(
                        http_method=config.http_method, **pparams
                    )
                except requests.exceptions.HTTPError as e:
                    if (
                        e.response.status_code == 405
                        and config.http_method == "POST"
                    ):
                        print("POST request to the API failed, retrying with GET")
                        config.http_method = "GET"
                        prequest = site.api(
                            http_method=config.http_method, **pparams
                        )

            # We're done iterating for this title or titles.
            c += len(titlelist)
            # Reset for the next batch.
            titlelist = []
            if c % 10 == 0:
                print(f"\n-> Downloaded {c} pages\n")


def getXMLRevisions(config: Config=None, session=None, useAllrevision=True, lastPage=None):
    # FIXME: actually figure out the various strategies for each MediaWiki version
    apiurl = urlparse(config.api)
    # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
    # https://github.com/WikiTeam/wikiteam/issues/358
    site = mwclient.Site(
        apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, pool=session
    )

    if useAllrevision:
        # Find last title
        if lastPage:
            try:
                lastNs = int(lastPage.find('ns').text)
                if False:
                    lastRevision = lastPage.find('revision')
                    lastTimestamp = lastRevision.find('timestamp').text
                    lastRevid = int(lastRevision.find('id').text)
                    lastDatetime = datetime.fromisoformat(lastTimestamp.rstrip('Z'))
                    lastArvcontinue = lastDatetime.strftime("%Y%m%d%H%M%S") + '|' + str(lastRevid)
                else:
                    lastArvcontinue = lastPage.attrib['arvcontinue']
            except Exception:
                print("Failed to find title in last chunk of XML: %s" % (lxml.etree.tostring(lastPage)))
                raise
            nscontinue = lastNs
            arvcontinue = lastArvcontinue
            if not arvcontinue:
                arvcontinue = None
        else:
            nscontinue = None
            arvcontinue = None

        try:
            return getXMLRevisionsByAllRevisions(config, session, site, nscontinue, arvcontinue)
        except (KeyError, mwclient.errors.InvalidResponse) as e:
            print(e)
            # TODO: check whether the KeyError was really for a missing arv API
            print("Warning. Could not use allrevisions. Wiki too old? Try to use --xmlrevisions_page")
            sys.exit()
    else:
        # Find last title
        if lastPage:
            try:
                start = lastPage.find('title').text
            except Exception:
                print("Failed to find title in last chunk of XML: %s" % (lxml.etree.tostring(lastPage)))
                raise
        else:
            start = None

        try:
            # # Uncomment these lines to raise a KeyError for testing
            # raise KeyError(999999)
            # # DO NOT UNCOMMENT IN RELEASE
            return getXMLRevisionsByTitles(config, session, site, start)
        except mwclient.errors.MwClientError as e:
            print(e)
            print("This mwclient version seems not to work for us. Exiting.")
            sys.exit()
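
The central change in the module above is that every <page> element emitted through the allrevisions path carries the arvcontinue token that produced it, and getXMLRevisions() reads that token back from the last page of a truncated dump to resume. The following is a minimal, self-contained sketch of that round trip, not part of the commit; the sample chunk and its values are invented, and it only uses lxml, which the module already imports:

import lxml.etree

# A made-up example of the last <page> chunk left behind in a truncated dump file.
last_chunk = '''<page arvcontinue="20200101000000|12345">
  <title>Example</title>
  <ns>0</ns>
  <id>1</id>
</page>'''

last_page = lxml.etree.XML(last_chunk)
# Same lookups as in getXMLRevisions(): the namespace to skip up to, and the
# arvcontinue value to hand back to the allrevisions query.
nscontinue = int(last_page.find('ns').text)
arvcontinue = last_page.attrib['arvcontinue']
print(nscontinue, arvcontinue)  # -> 0 20200101000000|12345
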
@@ -0,0 +1,104 @@
from lxml import etree
from lxml.builder import E

from wikiteam3.dumpgenerator.exceptions import PageMissingError

def makeXmlPageFromRaw(xml, arvcontinue) -> str:
    """Discard the metadata around a <page> element in <mediawiki> string"""
    root = etree.XML(xml)
    find = etree.XPath("//*[local-name() = 'page']")
    page = find(root)[0]
    if arvcontinue is not None:
        page.attrib['arvcontinue'] = arvcontinue
    # The tag will inherit the namespace, like:
    # <page xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    # FIXME: pretty_print doesn't seem to work, only adds a newline
    return etree.tostring(page, pretty_print=True, encoding="unicode")


def makeXmlFromPage(page: dict, arvcontinue) -> str:
    """Output an XML document as a string from a page as in the API JSON"""
    try:
        p = E.page(
            E.title(str(page["title"])),
            E.ns(str(page["ns"])),
            E.id(str(page["pageid"])),
        )
        if arvcontinue is not None:
            p.attrib['arvcontinue'] = arvcontinue
        for rev in page["revisions"]:
            # Older releases like MediaWiki 1.16 do not return all fields.
            if "userid" in rev:
                userid = rev["userid"]
            else:
                userid = 0
            if "size" in rev:
                size = rev["size"]
            else:
                size = 0

            # Create rev object
            revision = [E.id(str(rev["revid"])),
                        E.timestamp(rev["timestamp"]),]

            # The text, user, comment, sha1 may be deleted/suppressed
            if 'texthidden' in rev:
                revision.append(E.text(**{
                    'bytes': str(size),
                    'deleted': 'deleted',
                }))
            else:
                text = str(rev["*"])
                revision.append(E.text(text, **{
                    'bytes': str(size),
                    '{http://www.w3.org/XML/1998/namespace}space': 'preserve',
                }))

            if "user" not in rev:
                if "userhidden" not in rev:
                    print("Warning: user not hidden but missing user in pageid %d revid %d" % (page['pageid'], rev['revid']))
                revision.append(E.contributor(deleted="deleted"))
            else:
                revision.append(
                    E.contributor(
                        E.username(str(rev["user"])),
                        E.id(str(userid)),
                    )
                )

            if "sha1" not in rev:
                if "sha1hidden" in rev:
                    revision.append(E.sha1())  # stub
                else:
                    # The sha1 may not have been backfilled on older wikis or lack for other reasons (Wikia).
                    pass
            elif "sha1" in rev:
                revision.append(E.sha1(rev["sha1"]))


            if 'commenthidden' in rev:
                revision.append(E.comment(deleted="deleted"))
            elif "comment" in rev and rev["comment"]:
                revision.append(E.comment(str(rev["comment"])))

            if "contentmodel" in rev:
                revision.append(E.model(rev["contentmodel"]))
            # Sometimes a missing parentid is not replaced with a 0 as it should.
            if "parentid" in rev:
                revision.append(E.parentid(str(rev["parentid"])))

            # mwcli's dump.xml order
            revisionTags = ['id', 'parentid', 'timestamp', 'contributor', 'comment', 'origin', 'model', 'format', 'text', 'sha1']
            revisionElementsDict = {elem.tag: elem for elem in revision}
            _revision = E.revision()
            for tag in revisionTags:
                if tag in revisionElementsDict:
                    _revision.append(revisionElementsDict.pop(tag))
            for elem in revisionElementsDict.values():
                _revision.append(elem)
            p.append(_revision)
    except KeyError as e:
        print(e)
        raise PageMissingError(page["title"], e)
    return etree.tostring(p, pretty_print=True, encoding="unicode")
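
As a usage illustration only (not part of the commit), this is the shape of input makeXmlFromPage() receives from the allrevisions API and how it can be exercised. It assumes the function defined above is in scope, and every field value below is invented:

# Hypothetical page dict in the shape returned by action=query&list=allrevisions.
sample_page = {
    "title": "Example",
    "ns": 0,
    "pageid": 1,
    "revisions": [{
        "revid": 10,
        "parentid": 0,
        "timestamp": "2020-01-01T00:00:00Z",
        "user": "Alice",
        "userid": 2,
        "size": 5,
        "sha1": "aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d",
        "contentmodel": "wikitext",
        "comment": "first edit",
        "*": "hello",
    }],
}

# Produces a <page arvcontinue="..."> string whose <revision> children follow
# the fixed tag order defined in revisionTags above.
print(makeXmlFromPage(sample_page, "20200101000000|10"))
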
@@ -0,0 +1,144 @@
import re
import sys
from typing import *

import lxml.etree

from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.dumpgenerator.api.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import getXMLPage
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils import cleanXML, undoHTMLEntities
from wikiteam3.dumpgenerator.dump.xmldump.xml_header import getXMLHeader
from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions import getXMLRevisions
from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import truncateXMLDump, parseLastPageChunk

def doXMLRevisionDump(config: Config=None, session=None, xmlfile=None, lastPage=None, useAllrevisions=False):
    try:
        r_timestamp = "<timestamp>([^<]+)</timestamp>"
        r_arvcontinue = '<page arvcontinue="(.*?)">'

        lastArvcontinue = None
        for xml in getXMLRevisions(config=config, session=session, lastPage=lastPage, useAllrevision=useAllrevisions):
            numrevs = len(re.findall(r_timestamp, xml))
            arvcontinueRe = re.findall(r_arvcontinue, xml)
            if arvcontinueRe:
                curArvcontinue = arvcontinueRe[0]
                if lastArvcontinue != curArvcontinue:
                    Delay(config=config, session=session)
                    lastArvcontinue = curArvcontinue
            # Due to how generators work, it's expected this may be less
            xml = cleanXML(xml=xml)
            xmlfile.write(xml)

            xmltitle = re.search(r"<title>([^<]+)</title>", xml)
            title = undoHTMLEntities(text=xmltitle.group(1))
            print(f'{title}, {numrevs} edits (--xmlrevisions)')
            # Delay(config=config, session=session)
    except AttributeError as e:
        print(e)
        print("This API library version is not working")
        sys.exit()
    except UnicodeEncodeError as e:
        print(e)

def doXMLExportDump(config: Config=None, session=None, xmlfile=None, lastPage=None):
    print(
        '\nRetrieving the XML for every page\n'
    )

    lock = True
    start = None
    if lastPage:
        try:
            start = lastPage.find('title').text
        except Exception:
            print("Failed to find title in last chunk of XML: %s" % (lxml.etree.tostring(lastPage)))
            raise
    else:
        # requested complete xml dump
        lock = False

    c = 1
    for title in readTitles(config, session=session, start=start):
        if not title:
            continue
        if title == start:  # start downloading from start, included
            lock = False
        if lock:
            continue
        Delay(config=config, session=session)
        if c % 10 == 0:
            print(f"\n-> Downloaded {c} pages\n")
        try:
            for xml in getXMLPage(config=config, title=title, session=session):
                xml = cleanXML(xml=xml)
                xmlfile.write(xml)
        except PageMissingError:
            logerror(
                config=config, to_stdout=True,
                text='The page "%s" was missing in the wiki (probably deleted)'
                % title,
            )
        # here, XML is a correct <page> </page> chunk or
        # an empty string due to a deleted page (logged in errors log) or
        # an empty string due to an error while retrieving the page from server
        # (logged in errors log)
        c += 1


def generateXMLDump(config: Config=None, resume=False, session=None):
    """Generates an XML dump for a list of titles or from revision IDs"""

    header, config = getXMLHeader(config=config, session=session)
    footer = "</mediawiki>\n"  # new line at the end
    xmlfilename = "{}-{}-{}.xml".format(
        domain2prefix(config=config),
        config.date,
        "current" if config.curonly else "history",
    )
    xmlfile = None

    lastPage = None
    lastPageChunk = None
    # resume=True means we are resuming an XML dump
    if resume:
        print(
            "Removing the last chunk of past XML dump: it is probably incomplete."
        )
        # truncate XML dump if it already exists
        lastPageChunk = truncateXMLDump("{}/{}".format(config.path, xmlfilename))
        if not lastPageChunk.strip():
            print("Last page chunk is NULL, we'll directly start a new dump!")
            resume = False
            lastPage = None
        else:
            lastPage = parseLastPageChunk(lastPageChunk)
            if lastPage is None:
                print("Failed to parse last page chunk: \n%s" % lastPageChunk)
                print("Cannot resume, exiting now!")
                sys.exit(1)

        print("WARNING: will try to start the download...")
        xmlfile = open(
            "{}/{}".format(config.path, xmlfilename), "a", encoding="utf-8"
        )
    else:
        print("\nRetrieving the XML for every page from the beginning\n")
        xmlfile = open(
            "{}/{}".format(config.path, xmlfilename), "w", encoding="utf-8"
        )
        xmlfile.write(header)

    if config.xmlrevisions and not config.xmlrevisions_page:
        doXMLRevisionDump(config, session, xmlfile, lastPage, useAllrevisions=True)
    elif config.xmlrevisions and config.xmlrevisions_page:
        doXMLRevisionDump(config, session, xmlfile, lastPage, useAllrevisions=False)
    else:  # --xml
        doXMLExportDump(config, session, xmlfile, lastPage)
    xmlfile.write(footer)
    xmlfile.close()
    print("XML dump saved at...", xmlfilename)
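
For illustration only (a sketch with invented data, not part of the commit): doXMLRevisionDump() above applies a new Delay only when the arvcontinue value embedded in the written <page> changes, and it recovers that value with a plain regex over the generated chunk:

import re

# Same patterns as in doXMLRevisionDump() above, applied to a made-up chunk.
r_timestamp = "<timestamp>([^<]+)</timestamp>"
r_arvcontinue = '<page arvcontinue="(.*?)">'

chunk = ('<page arvcontinue="20200101000000|12345">\n'
         '  <title>Example</title>\n'
         '  <revision><timestamp>2020-01-01T00:00:00Z</timestamp></revision>\n'
         '</page>\n')

numrevs = len(re.findall(r_timestamp, chunk))
curArvcontinue = re.findall(r_arvcontinue, chunk)[0]
print(numrevs, curArvcontinue)  # -> 1 20200101000000|12345
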
@@ -1,117 +0,0 @@
import re
import sys
from typing import *

from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.dumpgenerator.dump.page.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.page_xml import getXMLPage
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils import cleanXML, undoHTMLEntities
from .xml_header import getXMLHeader
from .xml_revisions import getXMLRevisions
from .xml_truncate import truncateXMLDump

def generateXMLDump(config: Config=None, titles: Iterable[str]=None, start=None, session=None):
    """Generates a XML dump for a list of titles or from revision IDs"""
    # TODO: titles is now unused.

    header, config = getXMLHeader(config=config, session=session)
    footer = "</mediawiki>\n"  # new line at the end
    xmlfilename = "{}-{}-{}.xml".format(
        domain2prefix(config=config),
        config.date,
        "current" if config.curonly else "history",
    )
    xmlfile = ""
    lock = True

    # start != None, means we are resuming a XML dump
    if start:
        print(
            "Removing the last chunk of past XML dump: it is probably incomplete."
        )
        # truncate XML dump if it already exists
        truncateXMLDump("{}/{}".format(config.path, xmlfilename))

    if config.xmlrevisions:
        if start:
            print(f"WARNING: will try to start the download from title: {start}")
            xmlfile = open(
                "{}/{}".format(config.path, xmlfilename), "a", encoding="utf-8"
            )
        else:
            print("\nRetrieving the XML for every page from the beginning\n")
            xmlfile = open(
                "{}/{}".format(config.path, xmlfilename), "w", encoding="utf-8"
            )
            xmlfile.write(header)
        try:
            r_timestamp = "<timestamp>([^<]+)</timestamp>"
            for xml in getXMLRevisions(config=config, session=session, start=start):
                numrevs = len(re.findall(r_timestamp, xml))
                # Due to how generators work, it's expected this may be less
                xml = cleanXML(xml=xml)
                xmlfile.write(xml)

                xmltitle = re.search(r"<title>([^<]+)</title>", xml)
                title = undoHTMLEntities(text=xmltitle.group(1))
                print(f'{title}, {numrevs} edits (--xmlrevisions)')
                Delay(config=config, session=session)
        except AttributeError as e:
            print(e)
            print("This API library version is not working")
            sys.exit()
        except UnicodeEncodeError as e:
            print(e)

    else:  # --xml
        print(
            '\nRetrieving the XML for every page from "%s"\n'
            % (start if start else "start")
        )

        if not start:
            # requested complete xml dump
            lock = False
            xmlfile = open(
                "{}/{}".format(config.path, xmlfilename), "w", encoding="utf-8"
            )
            xmlfile.write(header)
            xmlfile.close()

        xmlfile = open(
            "{}/{}".format(config.path, xmlfilename), "a", encoding="utf-8"
        )
        c = 1
        for title in readTitles(config, start):
            if not title:
                continue
            if title == start:  # start downloading from start, included
                lock = False
            if lock:
                continue
            Delay(config=config, session=session)
            if c % 10 == 0:
                print(f"\n-> Downloaded {c} pages\n")
            try:
                for xml in getXMLPage(config=config, title=title, session=session):
                    xml = cleanXML(xml=xml)
                    xmlfile.write(xml)
            except PageMissingError:
                logerror(
                    config=config, to_stdout=True,
                    text='The page "%s" was missing in the wiki (probably deleted)'
                    % title,
                )
            # here, XML is a correct <page> </page> chunk or
            # an empty string due to a deleted page (logged in errors log) or
            # an empty string due to an error while retrieving the page from server
            # (logged in errors log)
            c += 1

    xmlfile.write(footer)
    xmlfile.close()
    print("XML dump saved at...", xmlfilename)
@@ -1,342 +0,0 @@
import sys
import time
from urllib.parse import urlparse

import mwclient
import requests

from wikiteam3.dumpgenerator.exceptions import PageMissingError
from wikiteam3.dumpgenerator.log import logerror
from .namespaces import getNamespacesAPI
from wikiteam3.dumpgenerator.dump.page.page_titles import readTitles
from wikiteam3.dumpgenerator.dump.page.page_xml import makeXmlFromPage, makeXmlPageFromRaw
from wikiteam3.dumpgenerator.config import Config

def getXMLRevisions(config: Config=None, session=None, allpages=False, start=None):
    # FIXME: actually figure out the various strategies for each MediaWiki version
    apiurl = urlparse(config.api)
    # FIXME: force the protocol we asked for! Or don't verify SSL if we asked HTTP?
    # https://github.com/WikiTeam/wikiteam/issues/358
    site = mwclient.Site(
        apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, pool=session
    )

    if "all" not in config.namespaces:
        namespaces = config.namespaces
    else:
        namespaces, namespacenames = getNamespacesAPI(config=config, session=session)

    try:
        # # Uncomment these lines to raise an KeyError for testing
        # raise KeyError(999999)
        # # DO NOT UNCOMMMENT IN RELEASE

        for namespace in namespaces:
            print("Trying to export all revisions from namespace %s" % namespace)
            # arvgeneratexml exists but was deprecated in 1.26 (while arv is from 1.27?!)
            arvparams = {
                "action": "query",
                "list": "allrevisions",
                "arvlimit": 50,
                "arvnamespace": namespace,
            }
            if not config.curonly:
                # We have to build the XML manually...
                # Skip flags, presumably needed to add <minor/> which is in the schema.
                # Also missing: parentid and contentformat.
                arvparams[
                    "arvprop"
                ] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content"
                print(
                    "Trying to get wikitext from the allrevisions API and to build the XML"
                )
                while True:
                    try:
                        arvrequest = site.api(
                            http_method=config.http_method, **arvparams
                        )
                    except requests.exceptions.HTTPError as e:
                        if (
                            e.response.status_code == 405
                            and config.http_method == "POST"
                        ):
                            print("POST request to the API failed, retrying with GET")
                            config.http_method = "GET"
                            continue
                        else:
                            raise
                    except requests.exceptions.ReadTimeout as err:
                        # Hopefully temporary, just wait a bit and continue with the same request.
                        # No point putting a limit to retries, we'd need to abort everything.
                        # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
                        # to use the retry adapter we use for our own requests session?
                        print(f"ERROR: {str(err)}")
                        print("Sleeping for 20 seconds")
                        time.sleep(20)
                        continue

                    for page in arvrequest["query"]["allrevisions"]:
                        yield makeXmlFromPage(page)
                    if "continue" in arvrequest:
                        arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
                    else:
                        # End of continuation. We are done with this namespace.
                        break

            else:
                # FIXME: this is not curonly, just different strategy to do all revisions
                # Just cycle through revision IDs and use the XML as is
                print("Trying to list the revisions and to export them one by one")
                # We only need the revision ID, all the rest will come from the raw export
                arvparams["arvprop"] = "ids"
                try:
                    arvrequest = site.api(
                        http_method=config.http_method, **arvparams
                    )
                except requests.exceptions.HTTPError as e:
                    if (
                        e.response.status_code == 405
                        and config.http_method == "POST"
                    ):
                        print("POST request to the API failed, retrying with GET")
                        config.http_method = "GET"
                        continue
                    else:
                        raise
                exportparams = {
                    "action": "query",
                    "export": "1",
                }
                # Skip the namespace if it's empty
                if len(arvrequest["query"]["allrevisions"]) < 1:
                    continue
                # Repeat the arvrequest with new arvparams until done
                while True:
                    # Reset revision IDs from the previous batch from arv
                    revids = []
                    for page in arvrequest["query"]["allrevisions"]:
                        for revision in page["revisions"]:
                            revids.append(str(revision["revid"]))
                    print(
                        " %d more revisions listed, until %s"
                        % (len(revids), revids[-1])
                    )

                    # We can now get the XML for one revision at a time
                    # FIXME: we can actually get them in batches as we used to
                    # but need to figure out the continuation and avoid that the API
                    # chooses to give us only the latest for each page
                    for revid in revids:
                        exportparams["revids"] = revid
                        try:
                            exportrequest = site.api(
                                http_method=config.http_method, **exportparams
                            )
                        except requests.exceptions.HTTPError as e:
                            if (
                                e.response.status_code == 405
                                and config.http_method == "POST"
                            ):
                                print(
                                    "POST request to the API failed, retrying with GET"
                                )
                                config.http_method = "GET"
                                exportrequest = site.api(
                                    http_method=config.http_method, **exportparams
                                )
                            else:
                                raise

                        # This gives us a self-standing <mediawiki> element
                        # but we only need the inner <page>: we can live with
                        # duplication and non-ordering of page titles, but the
                        # repeated header is confusing and would not even be valid
                        xml = exportrequest["query"]["export"]["*"]  # type(xml) == str
                        yield makeXmlPageFromRaw(xml)

                    if "continue" in arvrequest:
                        # Get the new ones
                        arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
                        try:
                            arvrequest = site.api(
                                http_method=config.http_method, **arvparams
                            )
                        except requests.exceptions.HTTPError as e:
                            if (
                                e.response.status_code == 405
                                and config.http_method == "POST"
                            ):
                                print(
                                    "POST request to the API failed, retrying with GET"
                                )
                                config.http_method = "GET"
                                arvrequest = site.api(
                                    http_method=config.http_method, **arvparams
                                )
                        except requests.exceptions.ReadTimeout as err:
                            # As above
                            print(f"ERROR: {str(err)}")
                            print("Sleeping for 20 seconds")
                            time.sleep(20)
                            # But avoid rewriting the same revisions
                            arvrequest["query"]["allrevisions"] = []
                            continue
                    else:
                        # End of continuation. We are done with this namespace.
                        break

    except (KeyError, mwclient.errors.InvalidResponse) as e:
        print(e)
        # TODO: check whether the KeyError was really for a missing arv API
        print("Warning. Could not use allrevisions. Wiki too old?")
        if config.curonly:
            # The raw XML export in the API gets a title and gives the latest revision.
            # We could also use the allpages API as generator but let's be consistent.
            print("Getting titles to export the latest revision for each")
            c = 0
            for title in readTitles(config, start=start):
                # TODO: respect verbose flag, reuse output from getXMLPage
                print(f" {title}")
                # TODO: as we're doing one page and revision at a time, we might
                # as well use xml format and exportnowrap=1 to use the string of,
                # XML as is, but need to check how well the library handles it.
                exportparams = {
                    "action": "query",
                    "titles": title,
                    "export": "1",
                }
                try:
                    exportrequest = site.api(
                        http_method=config.http_method, **exportparams
                    )
                except requests.exceptions.HTTPError as e:
                    if (
                        e.response.status_code == 405
                        and config.http_method == "POST"
                    ):
                        print("POST request to the API failed, retrying with GET")
                        config.http_method = "GET"
                        exportrequest = site.api(
                            http_method=config.http_method, **exportparams
                        )
                    else:
                        raise

                xml = str(exportrequest["query"]["export"]["*"])
                c += 1
                if c % 10 == 0:
                    print(f"\n-> Downloaded {c} pages\n")
                # Because we got the fancy XML from the JSON format, clean it:
                yield makeXmlPageFromRaw(xml)
        else:
            # This is the closest to what we usually do with Special:Export:
            # take one title at a time and try to get all revisions exported.
            # It differs from the allrevisions method because it actually needs
            # to be input the page titles; otherwise, the requests are similar.
            # The XML needs to be made manually because the export=1 option
            # refuses to return an arbitrary number of revisions (see above).
            print("Getting titles to export all the revisions of each")
            c = 0
            titlelist = []
            # TODO: Decide a suitable number of a batched request. Careful:
            # batched responses may not return all revisions.
            for titlelist in readTitles(config, start=start, batch=False):
                if type(titlelist) is not list:
                    titlelist = [titlelist]
                for title in titlelist:
                    print(f" {title}")
                # Try and ask everything. At least on MediaWiki 1.16, uknown props are discarded:
                # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
                pparams = {
                    "action": "query",
                    "titles": "|".join(titlelist),
                    "prop": "revisions",
                    # 'rvlimit': 50,
                    "rvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content",
                }
                try:
                    prequest = site.api(http_method=config.http_method, **pparams)
                except requests.exceptions.HTTPError as e:
                    if (
                        e.response.status_code == 405
                        and config.http_method == "POST"
                    ):
                        print("POST request to the API failed, retrying with GET")
                        config.http_method = "GET"
                        prequest = site.api(
                            http_method=config.http_method, **pparams
                        )
                    else:
                        raise
                except mwclient.errors.InvalidResponse:
                    logerror(
                        config=config, to_stdout=True,
                        text="Error: page inaccessible? Could not export page: %s"
                        % ("; ".join(titlelist)),
                    )
                    continue

                # Be ready to iterate if there is continuation.
                while True:
                    # Get the revision data returned by the API: prequest is the initial request
                    # or the new one after continuation at the bottom of this while loop.
                    # The array is called "pages" even if there's only one.
                    try:
                        pages = prequest["query"]["pages"]
                    except KeyError:
                        logerror(
                            config=config, to_stdout=True,
                            text="Error: page inaccessible? Could not export page: %s"
                            % ("; ".join(titlelist)),
                        )
                        break
                    # Go through the data we got to build the XML.
                    for pageid in pages:
                        try:
                            xml = makeXmlFromPage(pages[pageid])
                            yield xml
                        except PageMissingError:
                            logerror(
                                config=config, to_stdout=True,
                                text="Error: empty revision from API. Could not export page: %s"
                                % ("; ".join(titlelist)),
                            )
                            continue

                    # Get next batch of revisions if there's more.
                    if "continue" in prequest.keys():
                        print("Getting more revisions for the page")
                        for key, value in prequest["continue"]:
                            pparams[key] = value
                    elif "query-continue" in prequest.keys():
                        rvstartid = prequest["query-continue"]["revisions"]["rvstartid"]
                        pparams["rvstartid"] = rvstartid
                    else:
                        break

                    try:
                        prequest = site.api(
                            http_method=config.http_method, **pparams
                        )
                    except requests.exceptions.HTTPError as e:
                        if (
                            e.response.status_code == 405
                            and config.http_method == "POST"
                        ):
                            print("POST request to the API failed, retrying with GET")
                            config.http_method = "GET"
                            prequest = site.api(
                                http_method=config.http_method, **pparams
                            )

                # We're done iterating for this title or titles.
                c += len(titlelist)
                # Reset for the next batch.
                titlelist = []
                if c % 10 == 0:
                    print(f"\n-> Downloaded {c} pages\n")

    except mwclient.errors.MwClientError as e:
        print(e)
        print("This mwclient version seems not to work for us. Exiting.")
        sys.exit()