wikiteam/wikiteam3/dumpgenerator/cli/cli.py

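"""Command-line argument parsing for wikiteam3's dumpgenerator.

Builds the argparse parser, validates flag combinations, prepares the
requests session (retries, cookies, authentication), and resolves the
wiki's api.php/index.php URLs into a Config object.
"""
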
import argparse
import datetime
import http.cookiejar
import os
import queue
import re
import sys
from typing import Dict, Tuple

import requests
import urllib3

from wikiteam3.dumpgenerator.api import checkRetryAPI, getWikiEngine, mwGetAPIAndIndex
from wikiteam3.dumpgenerator.api.index_check import checkIndex
from wikiteam3.dumpgenerator.config import Config, newConfig
from wikiteam3.dumpgenerator.version import getVersion
from wikiteam3.utils import domain2prefix, getUserAgent, mod_requests_text
from wikiteam3.utils.login import uniLogin

from ...utils.user_agent import setupUserAgent
from .delay import Delay

def getArgumentParser():
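    """Build the argparse parser for all dumpgenerator options.

    For example (flags only; no network access happens at parse time):

        parser = getArgumentParser()
        args = parser.parse_args(["--xml", "--images", "https://wiki.example.org"])
    """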
    parser = argparse.ArgumentParser(description="")

    # General params
    parser.add_argument("-v", "--version", action="version", version=getVersion())
    parser.add_argument(
        "--cookies", metavar="cookies.txt", help="path to a cookies.txt file"
    )
    parser.add_argument(
        "--delay", metavar="5", default=0.5, type=float, help="adds a delay (in seconds)"
    )
    parser.add_argument(
        "--retries", metavar="5", default=5, help="Maximum number of retries for each HTTP request"
    )
    parser.add_argument("--path", help="path to store wiki dump at")
    parser.add_argument(
        "--resume",
        action="store_true",
        help="resumes previous incomplete dump (requires --path)",
    )
    parser.add_argument("--force", action="store_true", help="")
    parser.add_argument("--user", help="Username if MediaWiki authentication is required.")
    parser.add_argument(
        "--pass", dest="password", help="Password if MediaWiki authentication is required."
    )
    parser.add_argument(
        "--http-user", dest="http_user", help="Username if HTTP authentication is required."
    )
    parser.add_argument(
        "--http-pass", dest="http_password", help="Password if HTTP authentication is required."
    )
    parser.add_argument(
        "--insecure", action="store_true", help="Disable SSL certificate verification"
    )
    parser.add_argument(
        "--stdout-log-file", dest="stdout_log_path", default=None, help="Path to copy stdout to",
    )
    # URL params
    groupWikiOrAPIOrIndex = parser.add_argument_group()
    groupWikiOrAPIOrIndex.add_argument(
        "wiki", default="", nargs="?", help="URL to wiki (e.g. http://wiki.domain.org), auto-detects API and index.php"
    )
    groupWikiOrAPIOrIndex.add_argument(
        "--api", help="URL to API (e.g. http://wiki.domain.org/w/api.php)"
    )
    groupWikiOrAPIOrIndex.add_argument(
        "--index", help="URL to index.php (e.g. http://wiki.domain.org/w/index.php); not supported with --images on newer(?) MediaWiki without --api"
    )
    # Download params
    groupDownload = parser.add_argument_group(
        "Data to download", "What information to download from the wiki"
    )
    groupDownload.add_argument(
        "--xml",
        action="store_true",
        help="Export XML dump using Special:Export (index.php). (supported with --curonly)",
    )
    groupDownload.add_argument(
        "--curonly", action="store_true", help="store only the latest revision of pages"
    )
    groupDownload.add_argument(
        "--xmlapiexport",
        action="store_true",
        help="Export XML dump using API:Revisions instead of Special:Export; use this when Special:Export fails and --xmlrevisions is not supported. (supported with --curonly)",
    )
    groupDownload.add_argument(
        "--xmlrevisions",
        action="store_true",
        help="Export all revisions from an API generator (API:Allrevisions). MediaWiki 1.27+ only. (not supported with --curonly)",
    )
    groupDownload.add_argument(
        "--xmlrevisions_page",
        action="store_true",
        help="[[! Development only !]] Export all revisions from an API generator, but query page by page. MediaWiki 1.27+ only. (default: --curonly)",
    )
    groupDownload.add_argument(
        "--images", action="store_true", help="Generates an image dump"
    )
    groupDownload.add_argument(
        "--bypass-cdn-image-compression",
        action="store_true",
        help="Bypass CDN image compression (Cloudflare Polish, etc.)",
    )
    groupDownload.add_argument(
        "--namespaces",
        metavar="1,2,3",
        help="comma-separated list of namespaces to include (all by default)",
    )
    groupDownload.add_argument(
        "--exnamespaces",
        metavar="1,2,3",
        help="comma-separated list of namespaces to exclude",
    )
    parser.add_argument(
        "--api_chunksize", metavar="50", default=50, help="Chunk size for MediaWiki API requests (arvlimit, ailimit, etc.)"
    )
    # Meta info params
    groupMeta = parser.add_argument_group(
        "Meta info", "What meta info to retrieve from the wiki"
    )
    groupMeta.add_argument(
        "--get-wiki-engine", action="store_true", help="returns the wiki engine"
    )
    groupMeta.add_argument(
        "--failfast",
        action="store_true",
        help="Avoid resuming; discard failing wikis quickly. Useful only for mass downloads.",
    )
    return parser

def checkParameters(args=argparse.Namespace()) -> bool:
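    """Validate flag combinations; return True when all checks pass.

    Catches mutually exclusive or mutually dependent options (e.g. --user
    without --pass, or --curonly without --xml) before any work starts.
    """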
    passed = True

    # Don't mix download params and meta info params
    if (args.xml or args.images) and (args.get_wiki_engine):
        print("ERROR: Don't mix download params and meta info params")
        passed = False

    # No download params and no meta info params? Exit
    if (not args.xml and not args.images) and (not args.get_wiki_engine):
        print("ERROR: Use at least one download param or meta info param")
        passed = False

    # Check user and pass (one requires both)
    if (args.user and not args.password) or (args.password and not args.user):
        print("ERROR: Both --user and --pass are required for authentication.")
        passed = False

    # Check http-user and http-pass (one requires both)
    if (args.http_user and not args.http_password) or (args.http_password and not args.http_user):
        print("ERROR: Both --http-user and --http-pass are required for authentication.")
        passed = False

    # --curonly requires --xml
    if args.curonly and not args.xml:
        print("ERROR: --curonly requires --xml")
        passed = False

    # --xmlrevisions not supported with --curonly
    if args.xmlrevisions and args.curonly:
        print("ERROR: --xmlrevisions not supported with --curonly")
        passed = False

    # Check URLs
    for url in [args.api, args.index, args.wiki]:
        if url and (not url.startswith("http://") and not url.startswith("https://")):
            print(url)
            print("ERROR: URLs must start with http:// or https://")
            passed = False

    return passed

def getParameters(params=None) -> Tuple[Config, Dict]:
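    """Parse CLI params (defaults to sys.argv) into a (Config, other) pair.

    Sets up the requests session (retries, cookies, user agent, auth),
    resolves and verifies the api.php/index.php URLs, logs in if
    credentials were given, and normalizes namespace filters.
    """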
    # if not params:
    #     params = sys.argv

    parser = getArgumentParser()
    args = parser.parse_args(params)
    if checkParameters(args) is not True:
        print("\n\n")
        parser.print_help()
        sys.exit(1)
    # print(args)

    ########################################
    # Create session
    mod_requests_text(requests)  # monkey patch
    session = requests.Session()

    # Disable SSL verification
    if args.insecure:
        session.verify = False
        requests.packages.urllib3.disable_warnings()
        print("WARNING: SSL certificate verification disabled")

    # Custom session retry
    try:
        from requests.adapters import HTTPAdapter
        from urllib3.util.retry import Retry

        # Courtesy datashaman https://stackoverflow.com/a/35504626
        class CustomRetry(Retry):
            def increment(self, method=None, url=None, *args, **kwargs):
                if "_pool" in kwargs:
                    conn = kwargs["_pool"]  # type: urllib3.connectionpool.HTTPSConnectionPool
                    if "response" in kwargs:
                        try:
                            # Drain conn in advance so that it won't be put back into conn.pool
                            kwargs["response"].drain_conn()
                        except Exception:
                            pass
                    # Useless, retry happens inside urllib3
                    # for adapters in session.adapters.values():
                    #     adapters: HTTPAdapter
                    #     adapters.poolmanager.clear()

                    # Close the existing connection so that a new one will be used
                    if hasattr(conn, "pool"):
                        pool = conn.pool  # type: queue.Queue
                        try:
                            # conn.close() sets conn.pool = None, so save the pool and restore it after
                            conn.close()
                        except Exception:
                            pass
                        conn.pool = pool
                return super(CustomRetry, self).increment(method=method, url=url, *args, **kwargs)

            def sleep(self, response=None):
                backoff = self.get_backoff_time()
                if backoff <= 0:
                    return
                if response is not None:
                    msg = "req retry (%s)" % response.status
                else:
                    msg = None
                Delay(config=None, session=session, msg=msg, delay=backoff)

        __retries__ = CustomRetry(
            total=int(args.retries), backoff_factor=0.3,
            status_forcelist=[500, 502, 503, 504, 429],
            allowed_methods=["DELETE", "PUT", "GET", "OPTIONS", "TRACE", "HEAD", "POST"],
        )
        session.mount("https://", HTTPAdapter(max_retries=__retries__))
        session.mount("http://", HTTPAdapter(max_retries=__retries__))
    except Exception:
        # Our urllib3/requests is too old
        pass
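    # With backoff_factor=0.3, urllib3's Retry waits about
    # backoff_factor * 2**(n - 1) seconds before the n-th retry
    # (0.6s, 1.2s, 2.4s, ...; the first retry is immediate), and
    # CustomRetry.sleep() routes that wait through Delay so it is visible.
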
    # Set cookies
    cj = http.cookiejar.MozillaCookieJar()
    if args.cookies:
        cj.load(args.cookies)
        print("Using cookies from %s" % args.cookies)
    session.cookies = cj

    # Setup user agent
    session.headers.update({"User-Agent": getUserAgent()})
    setupUserAgent(session)  # monkey patch

    # Set HTTP Basic Auth
    if args.http_user and args.http_password:
        session.auth = (args.http_user, args.http_password)

    # Execute meta info params
    if args.wiki:
        if args.get_wiki_engine:
            print(getWikiEngine(url=args.wiki, session=session))
            sys.exit(0)

    # Get API and index and verify
    api = args.api if args.api else ""
    index = args.index if args.index else ""
    if api == "" or index == "":
        if args.wiki:
            if getWikiEngine(args.wiki, session=session) == "MediaWiki":
                api2, index2 = mwGetAPIAndIndex(args.wiki, session=session)
                if not api:
                    api = api2
                if not index:
                    index = index2
            else:
                print("ERROR: Unsupported wiki. Wiki engines supported are: MediaWiki")
                sys.exit(1)
        else:
            if api == "":
                pass
            elif index == "":
                # Guess index.php from the API URL (same directory)
                index = "/".join(api.split("/")[:-1]) + "/index.php"
    # print(api)
    # print(index)
    index2 = None

    check, checkedapi = False, None
    if api:
        check, checkedapi = checkRetryAPI(
            api=api,
            apiclient=args.xmlrevisions,
            session=session,
        )

    if api and check:
        # Replace the index URL we got from the API check
        index2 = check[1]
        api = checkedapi
        print("API is OK: ", checkedapi)
    else:
        if index and not args.wiki:
            print("API not available. Trying with index.php only.")
            args.api = None
        else:
            print("Error in API. Please provide a correct path to the API.")
            sys.exit(1)

    # Login if needed
    # TODO: Re-login after session expires
    if args.user and args.password:
        _session = uniLogin(api=api, index=index, session=session, username=args.user, password=args.password)
        if _session:
            session = _session
            print("-- Login OK --")
        else:
            print("-- Login failed --")

    # Check index
    if index and checkIndex(index=index, cookies=args.cookies, session=session):
        print("index.php is OK")
    else:
        # Fall back to the index URL reported by the API check
        index = index2
        if index and index.startswith("//"):
            # Protocol-relative URL: borrow the scheme from the wiki URL
            index = args.wiki.split("//")[0] + index
        if index and checkIndex(index=index, cookies=args.cookies, session=session):
            print("index.php is OK")
        else:
            try:
                # Last resort: strip the final path segment and retry
                index = "/".join(index.split("/")[:-1])
            except AttributeError:
                index = None
            if index and checkIndex(index=index, cookies=args.cookies, session=session):
                print("index.php is OK")
            else:
                print("Error in index.php.")
                if not args.xmlrevisions:
                    print(
                        "Please provide a correct path to index.php or use --xmlrevisions. Terminating."
                    )
                    sys.exit(1)

    namespaces = ["all"]
    exnamespaces = []

    # Process namespace inclusions
    if args.namespaces:
        # TODO: why is "-" allowed? And "--namespaces= all" (with a space) works?
        if (
            re.search(r"[^\d, \-]", args.namespaces)
            and args.namespaces.lower() != "all"
        ):
            print(
                "Invalid namespace values.\nValid format is integer(s) separated by commas"
            )
            sys.exit(1)
        else:
            ns = re.sub(" ", "", args.namespaces)
            if ns.lower() == "all":
                namespaces = ["all"]
            else:
                namespaces = [int(i) for i in ns.split(",")]

    # Process namespace exclusions
    if args.exnamespaces:
        if re.search(r"[^\d, \-]", args.exnamespaces):
            print(
                "Invalid namespace values.\nValid format is integer(s) separated by commas"
            )
            sys.exit(1)
        else:
            ns = re.sub(" ", "", args.exnamespaces)
            if ns.lower() == "all":
                print("You cannot exclude all namespaces.")
                sys.exit(1)
            else:
                exnamespaces = [int(i) for i in ns.split(",")]
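    # e.g. --namespaces "0, 1,-1" is accepted and becomes [0, 1, -1];
    # negative IDs cover MediaWiki's virtual namespaces (Special is -1, Media is -2).
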
    config = newConfig({
        "curonly": args.curonly,
        "date": datetime.datetime.now().strftime("%Y%m%d"),
        "api": api,
        "failfast": args.failfast,
        "http_method": "POST",
        "api_chunksize": int(args.api_chunksize),
        "index": index,
        "images": args.images,
        "logs": False,
        "xml": args.xml,
        "xmlapiexport": args.xmlapiexport,
        "xmlrevisions": args.xmlrevisions or args.xmlrevisions_page,
        "xmlrevisions_page": args.xmlrevisions_page,
        "namespaces": namespaces,
        "exnamespaces": exnamespaces,
        "path": os.path.normpath(args.path) if args.path else "",
        "cookies": args.cookies or "",
        "delay": args.delay,
        "retries": int(args.retries),
    })

    other = {
        "resume": args.resume,
        "filenamelimit": 240,  # filenames must not be longer than 240 **bytes** (MediaWiki r98430, 2011-09-29)
        "force": args.force,
        "session": session,
        "stdout_log_path": args.stdout_log_path,
        "bypass_cdn_image_compression": args.bypass_cdn_image_compression,
    }

    # Calculate the dump path if the user didn't set one with --path=
    if not config.path:
        config.path = "./{}-{}-wikidump".format(
            domain2prefix(config=config, session=session),
            config.date,
        )
        print("No --path argument provided. Defaulting to:")
        print(" [working_directory]/[domain_prefix]-[date]-wikidump")
        print("Which expands to:")
        print(" " + config.path)

    if config.delay == 0.5:
        print("--delay is the default value of 0.5")
        print(
            "There will be a 0.5 second delay between HTTP calls in order to keep the server from timing you out."
        )
        print(
            "If you know that this is unnecessary, you can manually specify '--delay 0.0'."
        )

    return config, other
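
# A minimal usage sketch (hypothetical invocation; the real entry point lives
# elsewhere in wikiteam3 and calls getParameters() itself):
#
#   config, other = getParameters(["--xml", "--images", "https://wiki.example.org"])
#   print(config.api, config.index, config.path)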