# mirror of https://github.com/WikiTeam/wikiteam
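"""Command-line layer of the wikiteam3 dump generator: builds the argument
parser, validates parameter combinations, and prepares the HTTP session and
Config used by the rest of the dumper."""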
import argparse
import datetime
import http
import http.cookiejar
import os
import queue
import re
import sys

import requests
import urllib3

from wikiteam3.dumpgenerator.api import checkRetryAPI, mwGetAPIAndIndex
from wikiteam3.utils.login import uniLogin
from .delay import Delay
from wikiteam3.utils import domain2prefix
from wikiteam3.dumpgenerator.api.index_check import checkIndex
from wikiteam3.utils import getUserAgent
from wikiteam3.dumpgenerator.version import getVersion
from wikiteam3.dumpgenerator.api import getWikiEngine
from wikiteam3.dumpgenerator.config import Config, newConfig
from wikiteam3.utils import mod_requests_text

from typing import *

from ...utils.user_agent import setupUserAgent

def getArgumentParser():
    parser = argparse.ArgumentParser(description="")

    # General params
    parser.add_argument("-v", "--version", action="version", version=getVersion())
    parser.add_argument(
        "--cookies", metavar="cookies.txt", help="path to a cookies.txt file"
    )
    parser.add_argument(
        "--delay", metavar="5", default=0.5, type=float, help="adds a delay (in seconds)"
    )
    parser.add_argument(
        "--retries", metavar="5", default=5, help="Maximum number of retries"
    )
    parser.add_argument("--path", help="path to store wiki dump at")
    parser.add_argument(
        "--resume",
        action="store_true",
        help="resumes previous incomplete dump (requires --path)",
    )
    parser.add_argument("--force", action="store_true", help="")
    parser.add_argument("--user", help="Username if MediaWiki authentication is required.")
    parser.add_argument(
        "--pass", dest="password", help="Password if MediaWiki authentication is required."
    )
    parser.add_argument(
        "--http-user", dest="http_user", help="Username if HTTP authentication is required."
    )
    parser.add_argument(
        "--http-pass", dest="http_password", help="Password if HTTP authentication is required."
    )
    parser.add_argument(
        '--insecure', action='store_true', help='Disable SSL certificate verification'
    )

    parser.add_argument(
        "--stdout-log-file", dest="stdout_log_path", default=None, help="Path to copy stdout to",
    )

    # URL params
    groupWikiOrAPIOrIndex = parser.add_argument_group()
    groupWikiOrAPIOrIndex.add_argument(
        "wiki", default="", nargs="?", help="URL to wiki (e.g. http://wiki.domain.org), auto detects API and index.php"
    )
    groupWikiOrAPIOrIndex.add_argument(
        "--api", help="URL to API (e.g. http://wiki.domain.org/w/api.php)"
    )
    groupWikiOrAPIOrIndex.add_argument(
        "--index", help="URL to index.php (e.g. http://wiki.domain.org/w/index.php), (not supported with --images on newer(?) MediaWiki without --api)"
    )

    # Download params
    groupDownload = parser.add_argument_group(
        "Data to download", "What info to download from the wiki"
    )
    groupDownload.add_argument(
        "--xml",
        action="store_true",
        help="Export XML dump using Special:Export (index.php). (supported with --curonly)",
    )
    groupDownload.add_argument(
        "--curonly", action="store_true", help="store only the latest revision of pages"
    )
    groupDownload.add_argument(
        "--xmlapiexport",
        action="store_true",
        help="Export XML dump using API:revisions instead of Special:Export; use this when Special:Export fails and --xmlrevisions is not supported. (supported with --curonly)",
    )
    groupDownload.add_argument(
        "--xmlrevisions",
        action="store_true",
        help="Export all revisions from an API generator (API:Allrevisions). MediaWiki 1.27+ only. (not supported with --curonly)",
    )
    groupDownload.add_argument(
        "--xmlrevisions_page",
        action="store_true",
        help="[[! Development only !]] Export all revisions from an API generator, but query page by page. MediaWiki 1.27+ only. (default: --curonly)",
    )
    groupDownload.add_argument(
        "--images", action="store_true", help="Generates an image dump"
    )
    groupDownload.add_argument(
        "--bypass-cdn-image-compression",
        action="store_true",
        help="Bypass CDN image compression. (CloudFlare Polish, etc.)",
    )
    groupDownload.add_argument(
        "--namespaces",
        metavar="1,2,3",
        help="comma-separated value of namespaces to include (all by default)",
    )
    groupDownload.add_argument(
        "--exnamespaces",
        metavar="1,2,3",
        help="comma-separated value of namespaces to exclude",
    )
    parser.add_argument(
        "--api_chunksize", metavar="50", default=50, help="Chunk size for MediaWiki API (arvlimit, ailimit, etc.)"
    )

    # Meta info params
    groupMeta = parser.add_argument_group(
        "Meta info", "What meta info to retrieve from the wiki"
    )
    groupMeta.add_argument(
        "--get-wiki-engine", action="store_true", help="returns the wiki engine"
    )
    groupMeta.add_argument(
        "--failfast",
        action="store_true",
        help="Avoid resuming, discard failing wikis quickly. Useful only for mass downloads.",
    )

    return parser

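# A typical invocation, for illustration (URL and entry-point name are examples only):
#   dumpgenerator https://wiki.example.org --xml --images --delay 1.0
# `wiki` is positional; --api and/or --index can be passed instead to skip autodetection.
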
def checkParameters(args=argparse.Namespace()) -> bool:
    """Check that the parsed arguments are coherent; return False on any conflict."""

    passed = True

    # Don't mix download params and meta info params
    if (args.xml or args.images) and (args.get_wiki_engine):
        print("ERROR: Don't mix download params and meta info params")
        passed = False

    # No download params and no meta info params? Exit
    if (not args.xml and not args.images) and (not args.get_wiki_engine):
        print("ERROR: Use at least one download param or meta info param")
        passed = False

    # Check user and pass (one requires both)
    if (args.user and not args.password) or (args.password and not args.user):
        print("ERROR: Both --user and --pass are required for authentication.")
        passed = False

    # Check http-user and http-pass (one requires both)
    if (args.http_user and not args.http_password) or (args.http_password and not args.http_user):
        print("ERROR: Both --http-user and --http-pass are required for authentication.")
        passed = False

    # --curonly requires --xml
    if args.curonly and not args.xml:
        print("ERROR: --curonly requires --xml")
        passed = False

    # --xmlrevisions not supported with --curonly
    if args.xmlrevisions and args.curonly:
        print("ERROR: --xmlrevisions not supported with --curonly")
        passed = False

    # Check URLs
    for url in [args.api, args.index, args.wiki]:
        if url and (not url.startswith("http://") and not url.startswith("https://")):
            print(url)
            print("ERROR: URLs must start with http:// or https://")
            passed = False

    return passed

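# getParameters() drives the CLI flow: parse and validate the arguments, build a
# requests session (retries, cookies, user agent, auth), resolve and verify the
# api.php/index.php endpoints, and return the (Config, other) pair used by the dumper.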
def getParameters(params=None) -> Tuple[Config, Dict]:
    # if not params:
    #     params = sys.argv

    parser = getArgumentParser()
    args = parser.parse_args(params)
    if checkParameters(args) is not True:
        print("\n\n")
        parser.print_help()
        sys.exit(1)
    # print(args)

    ########################################

    # Create session
    mod_requests_text(requests)  # monkey patch
    session = requests.Session()

    # Disable SSL verification
    if args.insecure:
        session.verify = False
        requests.packages.urllib3.disable_warnings()
        print("WARNING: SSL certificate verification disabled")

    # Custom session retry
    try:
        from requests.adapters import HTTPAdapter
        from urllib3.util.retry import Retry

        # Courtesy datashaman https://stackoverflow.com/a/35504626
        class CustomRetry(Retry):
            def increment(self, method=None, url=None, *args, **kwargs):
                if '_pool' in kwargs:
                    conn = kwargs['_pool']  # type: urllib3.connectionpool.HTTPSConnectionPool
                    if 'response' in kwargs:
                        try:
                            # drain conn in advance so that it won't be put back into conn.pool
                            kwargs['response'].drain_conn()
                        except:
                            pass
                    # Useless, retry happens inside urllib3
                    # for adapters in session.adapters.values():
                    #     adapters: HTTPAdapter
                    #     adapters.poolmanager.clear()

                    # Close existing connection so that a new connection will be used
                    if hasattr(conn, 'pool'):
                        pool = conn.pool  # type: queue.Queue
                        try:
                            # Don't rely on conn.close() alone: it empties the pool
                            # by setting conn.pool = None, so save and restore it
                            conn.close()
                        except:
                            pass
                        conn.pool = pool
                return super(CustomRetry, self).increment(method=method, url=url, *args, **kwargs)

            def sleep(self, response=None):
                backoff = self.get_backoff_time()
                if backoff <= 0:
                    return
                if response is not None:
                    msg = 'req retry (%s)' % response.status
                else:
                    msg = None
                Delay(config=None, session=session, msg=msg, delay=backoff)

        __retries__ = CustomRetry(
            total=int(args.retries), backoff_factor=0.3,
            status_forcelist=[500, 502, 503, 504, 429],
            allowed_methods=['DELETE', 'PUT', 'GET', 'OPTIONS', 'TRACE', 'HEAD', 'POST']
        )
        session.mount("https://", HTTPAdapter(max_retries=__retries__))
        session.mount("http://", HTTPAdapter(max_retries=__retries__))
    except:
        # Our urllib3/requests is too old
        pass
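    # Note: with backoff_factor=0.3 urllib3 sleeps backoff_factor * 2**(n - 1) seconds
    # before the n-th retry (the first retry is immediate); CustomRetry.sleep() above
    # routes that wait through Delay() so it is displayed like any other crawl delay.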

    # Set cookies
    cj = http.cookiejar.MozillaCookieJar()
    if args.cookies:
        cj.load(args.cookies)
        print("Using cookies from %s" % args.cookies)
    session.cookies = cj

    # Setup user agent
    session.headers.update({"User-Agent": getUserAgent()})
    setupUserAgent(session)  # monkey patch

    # Set HTTP Basic Auth (fix: use the HTTP credentials, not the wiki login ones)
    if args.http_user and args.http_password:
        session.auth = (args.http_user, args.http_password)
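    # --http-user/--http-pass cover HTTP Basic Auth at the web-server level;
    # MediaWiki account login (--user/--pass) is handled separately via uniLogin() below.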

    # Execute meta info params
    if args.wiki:
        if args.get_wiki_engine:
            print(getWikiEngine(url=args.wiki, session=session))
            sys.exit(0)

    # Get API and index and verify
    api = args.api if args.api else ""
    index = args.index if args.index else ""
    if api == "" or index == "":
        if args.wiki:
            if getWikiEngine(args.wiki, session=session) == "MediaWiki":
                api2, index2 = mwGetAPIAndIndex(args.wiki, session=session)
                if not api:
                    api = api2
                if not index:
                    index = index2
            else:
                print("ERROR: Unsupported wiki. Wiki engines supported are: MediaWiki")
                sys.exit(1)
        else:
            if api == "":
                pass
            elif index == "":
                # Guess index.php from the API URL, assuming it lives in the same
                # directory (e.g. .../w/api.php -> .../w/index.php)
                index = "/".join(api.split("/")[:-1]) + "/index.php"

    # print(api)
    # print(index)
    index2 = None

    check, checkedapi = False, None
    if api:
        check, checkedapi = checkRetryAPI(
            api=api,
            apiclient=args.xmlrevisions,
            session=session,
        )

    if api and check:
        # Replace the index URL we got from the API check
        index2 = check[1]
        api = checkedapi
        print("API is OK: ", checkedapi)
    else:
        if index and not args.wiki:
            print("API not available. Trying with index.php only.")
            args.api = None
        else:
            print("Error in API. Please, provide a correct path to API")
            sys.exit(1)
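    # As used above, checkRetryAPI() is expected to return a falsy value on failure,
    # or a tuple whose second element is the index.php URL advertised by the API
    # (kept in index2 as a fallback for the index check below).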

    # login if needed
    # TODO: Re-login after session expires
    if args.user and args.password:
        _session = uniLogin(api=api, index=index, session=session, username=args.user, password=args.password)
        if _session:
            session = _session
            print("-- Login OK --")
        else:
            print("-- Login failed --")

    # check index
    if index and checkIndex(index=index, cookies=args.cookies, session=session):
        print("index.php is OK")
    else:
        index = index2
        if index and index.startswith("//"):
            index = args.wiki.split("//")[0] + index
        if index and checkIndex(index=index, cookies=args.cookies, session=session):
            print("index.php is OK")
        else:
            try:
                index = "/".join(index.split("/")[:-1])
            except AttributeError:
                index = None
            if index and checkIndex(index=index, cookies=args.cookies, session=session):
                print("index.php is OK")
            else:
                print("Error in index.php.")
                if not args.xmlrevisions:
                    print(
                        "Please, provide a correct path to index.php or use --xmlrevisions. Terminating."
                    )
                    sys.exit(1)

    namespaces = ["all"]
    exnamespaces = []
    # Process namespace inclusions
    if args.namespaces:
        # TODO: why is "-" accepted here? and "--namespaces= all" (with a space) works?
        if (
            re.search(r"[^\d, \-]", args.namespaces)
            and args.namespaces.lower() != "all"
        ):
            print(
                "Invalid namespace values.\nValid format is integer(s) separated by commas"
            )
            sys.exit(1)
        else:
            ns = re.sub(" ", "", args.namespaces)
            if ns.lower() == "all":
                namespaces = ["all"]
            else:
                namespaces = [int(i) for i in ns.split(",")]

    # Process namespace exclusions
    if args.exnamespaces:
        if re.search(r"[^\d, \-]", args.exnamespaces):
            print(
                "Invalid namespace values.\nValid format is integer(s) separated by commas"
            )
            sys.exit(1)
        else:
            ns = re.sub(" ", "", args.exnamespaces)
            if ns.lower() == "all":
                print("You cannot exclude all namespaces.")
                sys.exit(1)
            else:
                exnamespaces = [int(i) for i in ns.split(",")]
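    # For illustration: --namespaces "0, 2,4" yields namespaces == [0, 2, 4],
    # while --namespaces all (any case) keeps the default ["all"].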

    config = newConfig({
        "curonly": args.curonly,
        "date": datetime.datetime.now().strftime("%Y%m%d"),
        "api": api,
        "failfast": args.failfast,
        "http_method": "POST",
        "api_chunksize": int(args.api_chunksize),
        "index": index,
        "images": args.images,
        "logs": False,
        "xml": args.xml,
        "xmlapiexport": args.xmlapiexport,
        "xmlrevisions": args.xmlrevisions or args.xmlrevisions_page,
        "xmlrevisions_page": args.xmlrevisions_page,
        "namespaces": namespaces,
        "exnamespaces": exnamespaces,
        "path": args.path and os.path.normpath(args.path) or "",
        "cookies": args.cookies or "",
        "delay": args.delay,
        "retries": int(args.retries),
    })

    other = {
        "resume": args.resume,
        "filenamelimit": 240,  # Filenames must not be longer than 240 **bytes** (MediaWiki r98430, 2011-09-29)
        "force": args.force,
        "session": session,
        "stdout_log_path": args.stdout_log_path,
        "bypass_cdn_image_compression": args.bypass_cdn_image_compression,
    }

    # Calculate the dump path, if not defined by the user with --path=
    if not config.path:
        config.path = "./{}-{}-wikidump".format(
            domain2prefix(config=config, session=session),
            config.date,
        )
        print("No --path argument provided. Defaulting to:")
        print(" [working_directory]/[domain_prefix]-[date]-wikidump")
        print("Which expands to:")
        print(" " + config.path)

    if config.delay == 0.5:
        print("--delay is the default value of 0.5")
        print(
            "There will be a 0.5 second delay between HTTP calls in order to keep the server from timing you out."
        )
        print(
            "If you know that this is unnecessary, you can manually specify '--delay 0.0'."
        )

    return config, other